On the stability of canonical correlation analysis and partial least squares with application to brain-behavior associations. Helmer, M., Warrington, S., Mohammadi-Nejad, A., Ji, J. L., Howell, A., Rosand, B., Anticevic, A., Sotiropoulos, S. N., & Murray, J. D. Communications Biology, 7(1):1–15, February 2024. Publisher: Nature Publishing Group.
Paper | doi | abstract | bibtex
Associations between datasets can be discovered through multivariate methods like Canonical Correlation Analysis (CCA) or Partial Least Squares (PLS). A requisite property for interpretability and generalizability of CCA/PLS associations is stability of their feature patterns. However, stability of CCA/PLS in high-dimensional datasets is questionable, as found in empirical characterizations. To study these issues systematically, we developed a generative modeling framework to simulate synthetic datasets. We found that when sample size is relatively small, but comparable to typical studies, CCA/PLS associations are highly unstable and inaccurate; both in their magnitude and importantly in the feature pattern underlying the association. We confirmed these trends across two neuroimaging modalities and in independent datasets with n ≈ 1000 and n = 20,000, and found that only the latter comprised sufficient observations for stable mappings between imaging-derived and behavioral features. We further developed a power calculator to provide sample sizes required for stability and reliability of multivariate analyses. Collectively, we characterize how to limit detrimental effects of overfitting on CCA/PLS stability, and provide recommendations for future studies.
@comment{Journal article (Communications Biology, vol. 7, no. 1, 2024). The publisher is stored in its own `publisher` field rather than in `note`, so styles do not print a literal "Publisher: ..." string in the bibliography.}
@article{helmer_stability_2024,
  author    = {Helmer, Markus and Warrington, Shaun and Mohammadi-Nejad, Ali-Reza and Ji, Jie Lisa and Howell, Amber and Rosand, Benjamin and Anticevic, Alan and Sotiropoulos, Stamatios N. and Murray, John D.},
  title     = {On the stability of canonical correlation analysis and partial least squares with application to brain-behavior associations},
  journal   = {Communications Biology},
  volume    = {7},
  number    = {1},
  pages     = {1--15},
  year      = {2024},
  month     = feb,
  publisher = {Nature Publishing Group},
  issn      = {2399-3642},
  doi       = {10.1038/s42003-024-05869-4},
  url       = {https://www.nature.com/articles/s42003-024-05869-4},
  urldate   = {2025-01-06},
  language  = {en},
  copyright = {2024 The Author(s)},
  keywords  = {Cognitive neuroscience, Computational neuroscience, Statistical methods},
  abstract  = {Associations between datasets can be discovered through multivariate methods like Canonical Correlation Analysis (CCA) or Partial Least Squares (PLS). A requisite property for interpretability and generalizability of CCA/PLS associations is stability of their feature patterns. However, stability of CCA/PLS in high-dimensional datasets is questionable, as found in empirical characterizations. To study these issues systematically, we developed a generative modeling framework to simulate synthetic datasets. We found that when sample size is relatively small, but comparable to typical studies, CCA/PLS associations are highly unstable and inaccurate; both in their magnitude and importantly in the feature pattern underlying the association. We confirmed these trends across two neuroimaging modalities and in independent datasets with n ≈ 1000 and n = 20,000, and found that only the latter comprised sufficient observations for stable mappings between imaging-derived and behavioral features. We further developed a power calculator to provide sample sizes required for stability and reliability of multivariate analyses. Collectively, we characterize how to limit detrimental effects of overfitting on CCA/PLS stability, and provide recommendations for future studies.},
}
Downloads: 0
{"_id":"CQGjCrDeCpGrnzCs7","bibbaseid":"helmer-warrington-mohammadinejad-ji-howell-rosand-anticevic-sotiropoulos-etal-onthestabilityofcanonicalcorrelationanalysisandpartialleastsquareswithapplicationtobrainbehaviorassociations-2024","author_short":["Helmer, M.","Warrington, S.","Mohammadi-Nejad, A.","Ji, J. L.","Howell, A.","Rosand, B.","Anticevic, A.","Sotiropoulos, S. N.","Murray, J. D."],"bibdata":{"bibtype":"article","type":"article","title":"On the stability of canonical correlation analysis and partial least squares with application to brain-behavior associations","volume":"7","copyright":"2024 The Author(s)","issn":"2399-3642","url":"https://www.nature.com/articles/s42003-024-05869-4","doi":"10.1038/s42003-024-05869-4","abstract":"Associations between datasets can be discovered through multivariate methods like Canonical Correlation Analysis (CCA) or Partial Least Squares (PLS). A requisite property for interpretability and generalizability of CCA/PLS associations is stability of their feature patterns. However, stability of CCA/PLS in high-dimensional datasets is questionable, as found in empirical characterizations. To study these issues systematically, we developed a generative modeling framework to simulate synthetic datasets. We found that when sample size is relatively small, but comparable to typical studies, CCA/PLS associations are highly unstable and inaccurate; both in their magnitude and importantly in the feature pattern underlying the association. We confirmed these trends across two neuroimaging modalities and in independent datasets with n ≈ 1000 and n = 20,000, and found that only the latter comprised sufficient observations for stable mappings between imaging-derived and behavioral features. We further developed a power calculator to provide sample sizes required for stability and reliability of multivariate analyses. 
Collectively, we characterize how to limit detrimental effects of overfitting on CCA/PLS stability, and provide recommendations for future studies.","language":"en","number":"1","urldate":"2025-01-06","journal":"Communications Biology","author":[{"propositions":[],"lastnames":["Helmer"],"firstnames":["Markus"],"suffixes":[]},{"propositions":[],"lastnames":["Warrington"],"firstnames":["Shaun"],"suffixes":[]},{"propositions":[],"lastnames":["Mohammadi-Nejad"],"firstnames":["Ali-Reza"],"suffixes":[]},{"propositions":[],"lastnames":["Ji"],"firstnames":["Jie","Lisa"],"suffixes":[]},{"propositions":[],"lastnames":["Howell"],"firstnames":["Amber"],"suffixes":[]},{"propositions":[],"lastnames":["Rosand"],"firstnames":["Benjamin"],"suffixes":[]},{"propositions":[],"lastnames":["Anticevic"],"firstnames":["Alan"],"suffixes":[]},{"propositions":[],"lastnames":["Sotiropoulos"],"firstnames":["Stamatios","N."],"suffixes":[]},{"propositions":[],"lastnames":["Murray"],"firstnames":["John","D."],"suffixes":[]}],"month":"February","year":"2024","note":"Publisher: Nature Publishing Group","keywords":"Cognitive neuroscience, Computational neuroscience, Statistical methods","pages":"1–15","bibtex":"@article{helmer_stability_2024,\n\ttitle = {On the stability of canonical correlation analysis and partial least squares with application to brain-behavior associations},\n\tvolume = {7},\n\tcopyright = {2024 The Author(s)},\n\tissn = {2399-3642},\n\turl = {https://www.nature.com/articles/s42003-024-05869-4},\n\tdoi = {10.1038/s42003-024-05869-4},\n\tabstract = {Associations between datasets can be discovered through multivariate methods like Canonical Correlation Analysis (CCA) or Partial Least Squares (PLS). A requisite property for interpretability and generalizability of CCA/PLS associations is stability of their feature patterns. However, stability of CCA/PLS in high-dimensional datasets is questionable, as found in empirical characterizations. 
To study these issues systematically, we developed a generative modeling framework to simulate synthetic datasets. We found that when sample size is relatively small, but comparable to typical studies, CCA/PLS associations are highly unstable and inaccurate; both in their magnitude and importantly in the feature pattern underlying the association. We confirmed these trends across two neuroimaging modalities and in independent datasets with n ≈ 1000 and n = 20,000, and found that only the latter comprised sufficient observations for stable mappings between imaging-derived and behavioral features. We further developed a power calculator to provide sample sizes required for stability and reliability of multivariate analyses. Collectively, we characterize how to limit detrimental effects of overfitting on CCA/PLS stability, and provide recommendations for future studies.},\n\tlanguage = {en},\n\tnumber = {1},\n\turldate = {2025-01-06},\n\tjournal = {Communications Biology},\n\tauthor = {Helmer, Markus and Warrington, Shaun and Mohammadi-Nejad, Ali-Reza and Ji, Jie Lisa and Howell, Amber and Rosand, Benjamin and Anticevic, Alan and Sotiropoulos, Stamatios N. and Murray, John D.},\n\tmonth = feb,\n\tyear = {2024},\n\tnote = {Publisher: Nature Publishing Group},\n\tkeywords = {Cognitive neuroscience, Computational neuroscience, Statistical methods},\n\tpages = {1--15},\n}\n\n\n\n\n\n\n\n\n\n\n\n","author_short":["Helmer, M.","Warrington, S.","Mohammadi-Nejad, A.","Ji, J. L.","Howell, A.","Rosand, B.","Anticevic, A.","Sotiropoulos, S. N.","Murray, J. 
D."],"key":"helmer_stability_2024","id":"helmer_stability_2024","bibbaseid":"helmer-warrington-mohammadinejad-ji-howell-rosand-anticevic-sotiropoulos-etal-onthestabilityofcanonicalcorrelationanalysisandpartialleastsquareswithapplicationtobrainbehaviorassociations-2024","role":"author","urls":{"Paper":"https://www.nature.com/articles/s42003-024-05869-4"},"keyword":["Cognitive neuroscience","Computational neuroscience","Statistical methods"],"metadata":{"authorlinks":{}},"downloads":0},"bibtype":"article","biburl":"https://bibbase.org/zotero-group/peerherholz/5704536","dataSources":["6GhResmoSoCCHimNC"],"keywords":["cognitive neuroscience","computational neuroscience","statistical methods"],"search_terms":["stability","canonical","correlation","analysis","partial","squares","application","brain","behavior","associations","helmer","warrington","mohammadi-nejad","ji","howell","rosand","anticevic","sotiropoulos","murray"],"title":"On the stability of canonical correlation analysis and partial least squares with application to brain-behavior associations","year":2024}