HarpLDA+: Optimizing latent dirichlet allocation for parallel efficiency. Peng, B., Zhang, B., Chen, L., Avram, M., Henschel, R., Stewart, C., Zhu, S., McCallum, E., Smith, L., Zahniser, T., Omer, J., & Qiu, J. In Proceedings - 2017 IEEE International Conference on Big Data, Big Data 2017, volume 2018-January, pages 243–252, 2018. Institute of Electrical and Electronics Engineers Inc.
HarpLDA+: Optimizing latent dirichlet allocation for parallel efficiency [link]Website  doi  abstract   bibtex   
Latent Dirichlet Allocation (LDA) is a widely used machine learning technique in topic modeling and data analysis. Training large LDA models on big datasets involves dynamic and irregular computation patterns and is a major challenge to both algorithm optimization and system design. In this paper, we present a comprehensive benchmarking of our novel synchronized LDA training system HarpLDA+ based on Hadoop and Java. It demonstrates impressive performance when compared to three other MPI/C++ based state-of-the-art systems, which are LightLDA, F+NomadLDA, and WarpLDA. HarpLDA+ uses optimized collective communication with a timer control for load balance, leading to stable scalability in both shared-memory and distributed systems. We demonstrate in the experiments that HarpLDA+ is effective in reducing synchronization and communication overhead and outperforms the other three LDA training systems. © 2017 IEEE.
@inproceedings{Peng2018243,
 title = {{HarpLDA+}: Optimizing {Latent Dirichlet Allocation} for parallel efficiency},
 type = {inproceedings},
 year = {2018},
 keywords = {Algorithm optimization, Collective communications, Big data, Learning systems, Statistics},
 pages = {243--252},
 volume = {2018-January},
 websites = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85047792604&doi=10.1109%2FBigData.2017.8257932&partnerID=40&md5=634b9836d11831c51e661b9e5e90bcd9},
 publisher = {Institute of Electrical and Electronics Engineers Inc.},
 id = {ce97ebfa-0470-3e41-8c53-c8c9e7f45275},
 created = {2018-06-25T18:22:28.432Z},
 file_attached = {false},
 profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d},
 last_modified = {2020-09-09T19:33:20.152Z},
 read = {false},
 starred = {false},
 authored = {true},
 confirmed = {true},
 hidden = {false},
 citation_key = {Peng2018243},
 source_type = {conference},
 notes = {cited By 0; Conference of 5th IEEE International Conference on Big Data, Big Data 2017 ; Conference Date: 11 December 2017 Through 14 December 2017; Conference Code:134260},
 folder_uuids = {089a8687-5c2e-4a40-91e2-0a855ea1eb95},
 private_publication = {false},
 abstract = {Latent Dirichlet Allocation (LDA) is a widely used machine learning technique in topic modeling and data analysis. Training large LDA models on big datasets involves dynamic and irregular computation patterns and is a major challenge to both algorithm optimization and system design. In this paper, we present a comprehensive benchmarking of our novel synchronized LDA training system HarpLDA+ based on Hadoop and Java. It demonstrates impressive performance when compared to three other MPI/C++ based state-of-the-art systems, which are LightLDA, F+NomadLDA, and WarpLDA. HarpLDA+ uses optimized collective communication with a timer control for load balance, leading to stable scalability in both shared-memory and distributed systems. We demonstrate in the experiments that HarpLDA+ is effective in reducing synchronization and communication overhead and outperforms the other three LDA training systems. © 2017 IEEE.},
 bibtype = {inproceedings},
 author = {Peng, B. and Zhang, B. and Chen, L. and Avram, M. and Henschel, R. and Stewart, C. and Zhu, S. and McCallum, E. and Smith, L. and Zahniser, T. and Omer, J. and Qiu, J.},
 editor = {Obradovic, Z. and Baeza-Yates, R. and Kepner, J. and Nambiar, R. and Wang, C. and Toyoda, M. and Suzumura, T. and Hu, X. and Cuzzocrea, A. and Baeza-Yates, R. and Tang, J. and Zang, H. and Nie, J.-Y. and Ghosh, R.},
 doi = {10.1109/BigData.2017.8257932},
 booktitle = {Proceedings - 2017 IEEE International Conference on Big Data, Big Data 2017}
}

Downloads: 0