Hydration free energies from kernel-based machine learning: Compound-database bias. Rauer, C. & Bereau, T. AIP Publishing, LLC, 2020.
Website | doi | abstract | bibtex
We consider the prediction of a basic thermodynamic property—hydration free energies—across a large subset of the chemical space of small organic molecules. Our in silico study is based on computer simulations at the atomistic level with implicit solvent. We report on a kernel-based machine learning approach that is inspired by recent work in learning electronic properties, but differs in key aspects: The representation is averaged over several conformers to account for the statistical ensemble. We also include an atomic-decomposition ansatz, which we show offers significant added transferability compared to molecular learning. Finally, we explore the existence of severe biases from databases of experimental compounds. By performing a combination of dimensionality reduction and cross-learning models, we show that the rate of learning depends significantly on the breadth and variety of the training dataset. Our study highlights the dangers of fitting machine-learning models to databases of narrow chemical range.
% Journal article by Rauer and Bereau (2020) on kernel-based machine learning of hydration free energies.
% The entry key is copied from the exporter's citation_key field; the original entry had no key and could not be parsed.
% NOTE(review): journal and volume 153 were absent from the export and are filled in from the publisher record for the DOI -- confirm.
% The value 014101, previously stored in volume, is treated as the article number and moved to pages.
% The former websites field held two URLs fused by an encoded newline (%0A); they are now carried by doi and eprint/url.
% The trailing fields (type, id, created, profile_id, ...) are reference-manager metadata kept verbatim; they are not standard bibliography fields.
@article{Rauer2020,
  author              = {Rauer, Clemens and Bereau, Tristan},
  title               = {Hydration free energies from kernel-based machine learning: Compound-database bias},
  journal             = {The Journal of Chemical Physics},
  year                = {2020},
  volume              = {153},
  number              = {1},
  pages               = {014101},
  publisher           = {AIP Publishing, LLC},
  doi                 = {10.1063/5.0012230},
  eprint              = {2007.00407},
  eprinttype          = {arXiv},
  archiveprefix       = {arXiv},
  url                 = {http://arxiv.org/abs/2007.00407},
  abstract            = {We consider the prediction of a basic thermodynamic property---hydration free energies---across a large subset of the chemical space of small organic molecules. Our in silico study is based on computer simulations at the atomistic level with implicit solvent. We report on a kernel-based machine learning approach that is inspired by recent work in learning electronic properties, but differs in key aspects: The representation is averaged over several conformers to account for the statistical ensemble. We also include an atomic-decomposition ansatz, which we show offers significant added transferability compared to molecular learning. Finally, we explore the existence of severe biases from databases of experimental compounds. By performing a combination of dimensionality reduction and cross-learning models, we show that the rate of learning depends significantly on the breadth and variety of the training dataset. Our study highlights the dangers of fitting machine-learning models to databases of narrow chemical range.},
  type                = {article},
  bibtype             = {article},
  citation_key        = {Rauer2020},
  id                  = {3e886615-189d-32f9-989f-dbde7aa55065},
  created             = {2020-07-23T15:33:54.143Z},
  last_modified       = {2020-07-23T15:33:54.143Z},
  profile_id          = {6a122574-9fbc-32f4-b166-53bc4f07b051},
  group_id            = {44f966a6-5fc6-3fbf-92d0-5cda043c5bd2},
  file_attached       = {false},
  read                = {false},
  starred             = {false},
  authored            = {false},
  confirmed           = {true},
  hidden              = {false},
  private_publication = {false},
}
Downloads: 0
{"_id":"dFTgod3xt7BWrLtRL","bibbaseid":"rauer-bereau-hydrationfreeenergiesfromkernelbasedmachinelearningcompounddatabasebias-2020","authorIDs":["skQbwmBqs7oknTATk"],"author_short":["Rauer, C.","Bereau, T."],"bibdata":{"title":"Hydration free energies from kernel-based machine learning: Compound-database bias","type":"article","year":"2020","volume":"014101","websites":"http://arxiv.org/abs/2007.00407%0Ahttp://dx.doi.org/10.1063/5.0012230","publisher":"AIP Publishing, LLC","id":"3e886615-189d-32f9-989f-dbde7aa55065","created":"2020-07-23T15:33:54.143Z","file_attached":false,"profile_id":"6a122574-9fbc-32f4-b166-53bc4f07b051","group_id":"44f966a6-5fc6-3fbf-92d0-5cda043c5bd2","last_modified":"2020-07-23T15:33:54.143Z","read":false,"starred":false,"authored":false,"confirmed":"true","hidden":false,"citation_key":"Rauer2020","private_publication":false,"abstract":"We consider the prediction of a basic thermodynamic property---hydration free energies---across a large subset of the chemical space of small organic molecules. Our in silico study is based on computer simulations at the atomistic level with implicit solvent. We report on a kernel-based machine learning approach that is inspired by recent work in learning electronic properties, but differs in key aspects: The representation is averaged over several conformers to account for the statistical ensemble. We also include an atomic-decomposition ansatz, which we show offers significant added transferability compared to molecular learning. Finally, we explore the existence of severe biases from databases of experimental compounds. By performing a combination of dimensionality reduction and cross-learning models, we show that the rate of learning depends significantly on the breadth and variety of the training dataset. 
Our study highlights the dangers of fitting machine-learning models to databases of narrow chemical range.","bibtype":"article","author":"Rauer, Clemens and Bereau, Tristan","doi":"10.1063/5.0012230","number":"1","bibtex":"@article{\n title = {Hydration free energies from kernel-based machine learning: Compound-database bias},\n type = {article},\n year = {2020},\n volume = {014101},\n websites = {http://arxiv.org/abs/2007.00407%0Ahttp://dx.doi.org/10.1063/5.0012230},\n publisher = {AIP Publishing, LLC},\n id = {3e886615-189d-32f9-989f-dbde7aa55065},\n created = {2020-07-23T15:33:54.143Z},\n file_attached = {false},\n profile_id = {6a122574-9fbc-32f4-b166-53bc4f07b051},\n group_id = {44f966a6-5fc6-3fbf-92d0-5cda043c5bd2},\n last_modified = {2020-07-23T15:33:54.143Z},\n read = {false},\n starred = {false},\n authored = {false},\n confirmed = {true},\n hidden = {false},\n citation_key = {Rauer2020},\n private_publication = {false},\n abstract = {We consider the prediction of a basic thermodynamic property---hydration free energies---across a large subset of the chemical space of small organic molecules. Our in silico study is based on computer simulations at the atomistic level with implicit solvent. We report on a kernel-based machine learning approach that is inspired by recent work in learning electronic properties, but differs in key aspects: The representation is averaged over several conformers to account for the statistical ensemble. We also include an atomic-decomposition ansatz, which we show offers significant added transferability compared to molecular learning. Finally, we explore the existence of severe biases from databases of experimental compounds. By performing a combination of dimensionality reduction and cross-learning models, we show that the rate of learning depends significantly on the breadth and variety of the training dataset. 
Our study highlights the dangers of fitting machine-learning models to databases of narrow chemical range.},\n bibtype = {article},\n author = {Rauer, Clemens and Bereau, Tristan},\n doi = {10.1063/5.0012230},\n number = {1}\n}","author_short":["Rauer, C.","Bereau, T."],"urls":{"Website":"http://arxiv.org/abs/2007.00407%0Ahttp://dx.doi.org/10.1063/5.0012230"},"biburl":"https://bibbase.org/service/mendeley/6a122574-9fbc-32f4-b166-53bc4f07b051","bibbaseid":"rauer-bereau-hydrationfreeenergiesfromkernelbasedmachinelearningcompounddatabasebias-2020","role":"author","metadata":{"authorlinks":{"bereau, t":"https://iop.fnwi.uva.nl/computational_soft_matter/publications.html"}},"downloads":0},"bibtype":"article","biburl":"https://bibbase.org/service/mendeley/6a122574-9fbc-32f4-b166-53bc4f07b051","creationDate":"2020-08-13T09:31:26.901Z","downloads":0,"keywords":[],"search_terms":["hydration","free","energies","kernel","based","machine","learning","compound","database","bias","rauer","bereau"],"title":"Hydration free energies from kernel-based machine learning: Compound-database bias","year":2020,"dataSources":["REPLekuv2nGHh2io6","ya2CyA73rpZseyrZ8","2252seNhipfTmjEBQ"]}