Scaling Large Learning Problems with Hard Parallel Mixtures. Collobert, R., Bengio, Y., & Bengio, S. International Journal of Pattern Recognition and Artificial Intelligence, IJPRAI, 17(3):349–365, 2003.
Paper abstract bibtex A challenge for statistical learning is to deal with large data sets, e.g. in data mining. The training time of ordinary Support Vector Machines is at least quadratic, which raises a serious research challenge if we want to deal with data sets of millions of examples. We propose a ``hard parallelizable mixture'' methodology which yields significantly reduced training time through modularization and parallelization: the training data is iteratively partitioned by a ``gater'' model in such a way that it becomes easy to learn an ``expert'' model separately in each region of the partition. A probabilistic extension and the use of a set of generative models allows representing the gater so that all pieces of the model are locally trained. For SVMs, time complexity appears empirically to locally grow linearly with the number of examples, while generalization performance can be enhanced. For the probabilistic version of the algorithm, the iterative algorithm provably goes down in a cost function that is an upper bound on the negative log-likelihood.
@article{collobert:2003:ijprai,
author = {R. Collobert and Y. Bengio and S. Bengio},
title = {Scaling Large Learning Problems with Hard Parallel Mixtures},
journal = {International Journal of Pattern Recognition and Artificial Intelligence, {IJPRAI}},
volume = 17,
number = 3,
pages = {349--365},
year = 2003,
url = {publications/ps/collobert_2003_ijprai.ps.gz},
pdf = {publications/pdf/collobert_2003_ijprai.pdf},
djvu = {publications/djvu/collobert_2003_ijprai.djvu},
original = {2003/mixtures_ijprai},
web = {http://dx.doi.org/10.1142/S0218001403002411},
topics = {large_scale},
abstract = {A challenge for {\em statistical learning} is to deal with large data sets, e.g. in {\em data mining}. The training time of ordinary Support Vector Machines is at least quadratic, which raises a serious research challenge if we want to deal with data sets of millions of examples. We propose a ``hard parallelizable mixture'' methodology which yields significantly reduced training time through modularization and parallelization: the training data is iteratively partitioned by a ``gater'' model in such a way that it becomes easy to learn an ``expert'' model separately in each region of the partition. A probabilistic extension and the use of a set of generative models allows representing the gater so that all pieces of the model are locally trained. For SVMs, time complexity appears empirically to locally grow {\em linearly} with the number of examples, while {\em generalization} performance can be enhanced. For the probabilistic version of the algorithm, the iterative algorithm provably goes down in a cost function that is an upper bound on the negative log-likelihood.},
categorie = {A}
}
Downloads: 0
{"_id":"D6GuvsjPGnWNhZmJy","bibbaseid":"collobert-bengio-bengio-scalinglargelearningproblemswithhardparallelmixtures-2003","authorIDs":[],"author_short":["Collobert, R.","Bengio, Y.","Bengio, S."],"bibdata":{"bibtype":"article","type":"article","author":[{"firstnames":["R."],"propositions":[],"lastnames":["Collobert"],"suffixes":[]},{"firstnames":["Y."],"propositions":[],"lastnames":["Bengio"],"suffixes":[]},{"firstnames":["S."],"propositions":[],"lastnames":["Bengio"],"suffixes":[]}],"title":"Scaling Large Learning Problems with Hard Parallel Mixtures","journal":"International Journal on Pattern Recognition and Artificial Intelligence, IJPRAI","volume":"17","number":"3","pages":"349–365","year":"2003","url":"publications/ps/collobert_2003_ijprai.ps.gz","pdf":"publications/pdf/collobert_2003_ijprai.pdf","djvu":"publications/djvu/collobert_2003_ijprai.djvu","original":"2003/mixtures_ijprai","web":"http://dx.doi.org/10.1142/S0218001403002411","topics":"large_scale","abstract":"A challenge for \\em statistical learning is to deal with large data sets, e.g. in \\em data mining. The training time of ordinary Support Vector Machines is at least quadratic, which raises a serious research challenge if we want to deal with data sets of millions of examples. We propose a ``hard parallelizable mixture'' methodology which yields significantly reduced training time through modularization and parallelization: the training data is iteratively partitioned by a ``gater'' model in such a way that it becomes easy to learn an ``expert'' model separately in each region of the partition. A probabilistic extension and the use of a set of generative models allows representing the gater so that all pieces of the model are locally trained. For SVMs, time complexity appears empirically to locally grow \\em linearly with the number of examples, while \\em generalization performance can be enhanced. \nFor the probabilistic version of the algorithm, the iterative algorithm provably goes down in a cost function that is an upper bound on the negative log-likelihood.","categorie":"A","bibtex":"@article{collobert:2003:ijprai,\n author = {R. Collobert and Y. Bengio and S. Bengio},\n title = {Scaling Large Learning Problems with Hard Parallel Mixtures},\n journal = {International Journal on Pattern Recognition and Artificial Intelligence, {IJPRAI}},\n volume = 17,\n number = 3,\n pages = {349--365},\n year = 2003,\n url = {publications/ps/collobert_2003_ijprai.ps.gz},\n pdf = {publications/pdf/collobert_2003_ijprai.pdf},\n djvu = {publications/djvu/collobert_2003_ijprai.djvu},\n original = {2003/mixtures_ijprai},\n web = {http://dx.doi.org/10.1142/S0218001403002411},\n topics = {large_scale},\n abstract = {A challenge for {\\em statistical learning} is to deal with large data sets, e.g. in {\\em data mining}. The training time of ordinary Support Vector Machines is at least quadratic, which raises a serious research challenge if we want to deal with data sets of millions of examples. We propose a ``hard parallelizable mixture'' methodology which yields significantly reduced training time through modularization and parallelization: the training data is iteratively partitioned by a ``gater'' model in such a way that it becomes easy to learn an ``expert'' model separately in each region of the partition. A probabilistic extension and the use of a set of generative models allows representing the gater so that all pieces of the model are locally trained. For SVMs, time complexity appears empirically to locally grow {\\em linearly} with the number of examples, while {\\em generalization} performance can be enhanced. \nFor the probabilistic version of the algorithm, the iterative algorithm provably goes down in a cost function that is an upper bound on the negative log-likelihood.},\n categorie = {A}\n}\n\n","author_short":["Collobert, R.","Bengio, Y.","Bengio, S."],"key":"collobert:2003:ijprai","id":"collobert:2003:ijprai","bibbaseid":"collobert-bengio-bengio-scalinglargelearningproblemswithhardparallelmixtures-2003","role":"author","urls":{"Paper":"http://bengio.abracadoudou.com/publications/ps/collobert_2003_ijprai.ps.gz"},"downloads":0},"bibtype":"article","biburl":"http://bengio.abracadoudou.com/samy.bib","creationDate":"2020-03-18T03:43:27.311Z","downloads":0,"keywords":[],"search_terms":["scaling","large","learning","problems","hard","parallel","mixtures","collobert","bengio","bengio"],"title":"Scaling Large Learning Problems with Hard Parallel Mixtures","year":2003,"dataSources":["9NCW2CDr4M3s5DvNX"]}