Hydration free energies from kernel-based machine learning: Compound-database bias. Rauer, C. & Bereau, T. AIP Publishing, LLC, 2020.
Hydration free energies from kernel-based machine learning: Compound-database bias [link] Website | doi | abstract | bibtex
We consider the prediction of a basic thermodynamic property—hydration free energies—across a large subset of the chemical space of small organic molecules. Our in silico study is based on computer simulations at the atomistic level with implicit solvent. We report on a kernel-based machine learning approach that is inspired by recent work in learning electronic properties, but differs in key aspects: The representation is averaged over several conformers to account for the statistical ensemble. We also include an atomic-decomposition ansatz, which we show offers significant added transferability compared to molecular learning. Finally, we explore the existence of severe biases from databases of experimental compounds. By performing a combination of dimensionality reduction and cross-learning models, we show that the rate of learning depends significantly on the breadth and variety of the training dataset. Our study highlights the dangers of fitting machine-learning models to databases of narrow chemical range.
@comment{
  Journal article by Rauer and Bereau (2020).
  The entry key is the value of the citation_key field.
  The six-digit value 014101, previously stored in volume, is an article
  number and is now held in pages.
  NOTE(review): journal and volume = 153 were filled in from the publisher
  record for the DOI -- confirm before relying on them.
  The arXiv identifier in eprint is copied from the first URL in websites.
  Fields after abstract are reference-manager bookkeeping; BibTeX styles
  ignore them and they are kept unchanged.
}
@article{Rauer2020,
 author              = {Rauer, Clemens and Bereau, Tristan},
 title               = {Hydration free energies from kernel-based machine learning: Compound-database bias},
 journal             = {The Journal of Chemical Physics},
 year                = {2020},
 volume              = {153},
 number              = {1},
 pages               = {014101},
 publisher           = {AIP Publishing, LLC},
 doi                 = {10.1063/5.0012230},
 eprint              = {2007.00407},
 eprinttype          = {arXiv},
 abstract            = {We consider the prediction of a basic thermodynamic property---hydration free energies---across a large subset of the chemical space of small organic molecules. Our in silico study is based on computer simulations at the atomistic level with implicit solvent. We report on a kernel-based machine learning approach that is inspired by recent work in learning electronic properties, but differs in key aspects: The representation is averaged over several conformers to account for the statistical ensemble. We also include an atomic-decomposition ansatz, which we show offers significant added transferability compared to molecular learning. Finally, we explore the existence of severe biases from databases of experimental compounds. By performing a combination of dimensionality reduction and cross-learning models, we show that the rate of learning depends significantly on the breadth and variety of the training dataset. Our study highlights the dangers of fitting machine-learning models to databases of narrow chemical range.},
 type                = {article},
 bibtype             = {article},
 websites            = {http://arxiv.org/abs/2007.00407%0Ahttp://dx.doi.org/10.1063/5.0012230},
 id                  = {3e886615-189d-32f9-989f-dbde7aa55065},
 created             = {2020-07-23T15:33:54.143Z},
 file_attached       = {false},
 profile_id          = {6a122574-9fbc-32f4-b166-53bc4f07b051},
 group_id            = {44f966a6-5fc6-3fbf-92d0-5cda043c5bd2},
 last_modified       = {2020-07-23T15:33:54.143Z},
 read                = {false},
 starred             = {false},
 authored            = {false},
 confirmed           = {true},
 hidden              = {false},
 citation_key        = {Rauer2020},
 private_publication = {false},
}

Downloads: 0