var bibbase_data = {"data":"\"Loading..\"\n\n
\n\n \n\n \n\n \n \n\n \n\n \n \n\n \n\n \n
\n generated by\n \n \"bibbase.org\"\n\n \n
\n \n\n
\n\n \n\n\n
\n\n Excellent! Next you can\n create a new website with this list, or\n embed it in an existing web page by copying & pasting\n any of the following snippets.\n\n
\n JavaScript\n (easiest)\n
\n \n <script src=\"https://bibbase.org/show?bib=https%3A%2F%2Fawan-10.github.io%2Fammar.bib&jsonp=1&jsonp=1\"></script>\n \n
\n\n PHP\n
\n \n <?php\n $contents = file_get_contents(\"https://bibbase.org/show?bib=https%3A%2F%2Fawan-10.github.io%2Fammar.bib&jsonp=1\");\n print_r($contents);\n ?>\n \n
\n\n iFrame\n (not recommended)\n
\n \n <iframe src=\"https://bibbase.org/show?bib=https%3A%2F%2Fawan-10.github.io%2Fammar.bib&jsonp=1\"></iframe>\n \n
\n\n

\n For more details see the documention.\n

\n
\n
\n\n
\n\n This is a preview! To use this list on your own web site\n or create a new web site from it,\n create a free account. The file will be added\n and you will be able to edit it in the File Manager.\n We will show you instructions once you've created your account.\n
\n\n
\n\n

To the site owner:

\n\n

Action required! Mendeley is changing its\n API. In order to keep using Mendeley with BibBase past April\n 14th, you need to:\n

    \n
  1. renew the authorization for BibBase on Mendeley, and
  2. \n
  3. update the BibBase URL\n in your page the same way you did when you initially set up\n this page.\n
  4. \n
\n

\n\n

\n \n \n Fix it now\n

\n
\n\n
\n\n\n
\n \n \n
\n
\n  \n 2020\n \n \n (1)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n Communication Profiling and Characterization of Deep-Learning Workloads on Clusters With High-Performance Interconnects.\n \n \n \n\n\n \n A. A. Awan; Jain, A.; Chu, C.; Subramoni, H.; and Panda, D.\n\n\n \n\n\n\n IEEE Micro, 40(1): 35-43. Jan 2020.\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@ARTICLE{awan-micro19, \nauthor={{A. A. Awan} and A. Jain and C-H Chu and H. Subramoni and DK Panda}, \njournal={IEEE Micro}, \ntitle={{Communication Profiling and Characterization of\nDeep-Learning Workloads on Clusters With High-Performance\nInterconnects}},\nyear={2020}, \nvolume={40}, \nnumber={1}, \npages={35-43}, \nkeywords={Training;Libraries;Measurement;Middleware;Deep learning;Graphics processing units;Performance analysis;InfiniBand;Omni-Path;NVLink;PCIe;TensorFlow;Horovod;MVAPICH2 MPI;Performance Analysis;Profiling;Communication Libraries}, \ndoi={10.1109/MM.2019.2949986}, \nISSN={1937-4143}, \nmonth={Jan},\n}\n\n
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2019\n \n \n (3)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Optimized Large-message Broadcast for Deep Learning Workloads: MPI, MPI+NCCL, or NCCL2?.\n \n \n \n \n\n\n \n A. A. Awan; Manian, K.; Chu, C.; Subramoni, H.; and Panda, D.\n\n\n \n\n\n\n Parallel Computing, 85: 141 - 152. 2019.\n \n\n\n\n
\n\n\n\n \n \n \"OptimizedPaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{awan-parco19,\ntitle = {{Optimized Large-message Broadcast for Deep Learning Workloads: MPI, MPI+NCCL, or NCCL2?}},\njournal = "Parallel Computing",\nvolume = "85",\npages = "141 - 152",\nyear = "2019",\nissn = "0167-8191",\ndoi = "10.1016/j.parco.2019.03.005",\nurl = "http://www.sciencedirect.com/science/article/pii/S0167819118303284",\nauthor = "{A. A. Awan} and KV Manian and C-H Chu and H. Subramoni and DK Panda",\nkeywords = "HPC, Distributed deep learning, MPI_Bcast, NCCL, CUDA-Aware MPI",\n}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Communication Profiling and Characterization of Deep Learning Workloads on Clusters with High-Performance Interconnects.\n \n \n \n\n\n \n A. A. Awan; Jain, A.; Subramoni, H.; Chu, C.; and Panda, D.\n\n\n \n\n\n\n In 26th Symposium on IEEE Hot Interconnects (HotI), Aug 2019. \n \n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{awan-hoti19,\nauthor = {{A. A. Awan} and A. Jain and H. Subramoni and C-H Chu and DK Panda},\nbooktitle = {26th Symposium on IEEE Hot Interconnects (HotI)},\nmonth = {Aug},\ntitle = {{Communication Profiling and Characterization of Deep Learning Workloads on Clusters with High-Performance Interconnects}},\nyear = {2019}\n}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Scalable Distributed DNN Training using TensorFlow and CUDA-Aware MPI: Characterization, Designs, and Performance Evaluation.\n \n \n \n\n\n \n A. A. Awan; Bédorf, J.; Chu, C.; Subramoni, H.; and Panda, D. K.\n\n\n \n\n\n\n In 2019 19th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing (CCGRID), pages 498-507, May 2019. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@INPROCEEDINGS{awan-ccgrid19, \nauthor={{A. A. Awan} and J. {Bédorf} and C. {Chu} and H. {Subramoni} and D. K. {Panda}}, \nbooktitle={2019 19th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing (CCGRID)}, \ntitle={Scalable Distributed DNN Training using TensorFlow and CUDA-Aware MPI: Characterization, Designs, and Performance Evaluation}, \nyear={2019}, \nvolume={}, \nnumber={}, \npages={498-507}, \ndoi={10.1109/CCGRID.2019.00064}, \nISSN={}, \nmonth={May},\n}\n\n
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2018\n \n \n (2)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Optimized Broadcast for Deep Learning Workloads on Dense-GPU InfiniBand Clusters: MPI or NCCL?.\n \n \n \n \n\n\n \n Awan, A. A.; Chu, C.; Subramoni, H.; and Panda, D. K.\n\n\n \n\n\n\n In Proceedings of the 25th European MPI Users' Group Meeting, of EuroMPI'18, pages 2:1–2:9, New York, NY, USA, 2018. ACM\n \n\n\n\n
\n\n\n\n \n \n \"OptimizedPaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{awan-eurompi18,\n author = {Awan, Ammar Ahmad and Chu, Ching-Hsiang and Subramoni, Hari and Panda, Dhabaleswar K.},\n title = {{Optimized Broadcast for Deep Learning Workloads on Dense-GPU InfiniBand Clusters: MPI or NCCL?}},\n booktitle = {Proceedings of the 25th European MPI Users' Group Meeting},\n series = {EuroMPI'18},\n year = {2018},\n isbn = {978-1-4503-6492-8},\n location = {Barcelona, Spain},\n pages = {2:1--2:9},\n articleno = {2},\n numpages = {9},\n url = {http://doi.acm.org/10.1145/3236367.3236381},\n doi = {10.1145/3236367.3236381},\n acmid = {3236381},\n publisher = {ACM},\n address = {New York, NY, USA},\n keywords = {CUDA-Aware MPI, Distributed Deep Learning, HPC, MPI_Bcast, NCCL},\n} \n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n OC-DNN: Exploiting Advanced Unified Memory Capabilities in CUDA 9 and Volta GPUs for Out-of-Core DNN Training.\n \n \n \n\n\n \n A. A. Awan; Chu, C.; Subramoni, H.; Lu, X.; and Panda, D.\n\n\n \n\n\n\n In 2018 IEEE 25th International Conference on High Performance Computing (HiPC), pages 143–152, dec 2018. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{awan-hipc18,\nauthor = {{A. A. Awan} and C-H Chu and H. Subramoni and X. Lu and DK Panda},\nbooktitle = {2018 IEEE 25th International Conference on High Performance Computing (HiPC)},\ndoi = {10.1109/HiPC.2018.00024},\nissn = {2640-0316},\nkeywords = {graphics processing units;learning (artificial int},\nmonth = {dec},\npages = {143--152},\ntitle = {{OC-DNN: Exploiting Advanced Unified Memory Capabilities in CUDA 9 and Volta GPUs for Out-of-Core DNN Training}},\nyear = {2018}\n}\n
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2017\n \n \n (2)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n S-Caffe: Co-designing MPI Runtimes and Caffe for Scalable Deep Learning on Modern GPU Clusters.\n \n \n \n \n\n\n \n Awan, A. A.; Hamidouche, K.; Hashmi, J. M.; and Panda, D. K.\n\n\n \n\n\n\n In Proceedings of the 22Nd ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, of PPoPP '17, pages 193–205, New York, NY, USA, 2017. ACM\n \n\n\n\n
\n\n\n\n \n \n \"S-Caffe:Paper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{awan-ppopp17,\n author = {Awan, Ammar Ahmad and Hamidouche, Khaled and Hashmi, Jahanzeb Maqbool and Panda, Dhabaleswar K.},\n title = {{S-Caffe: Co-designing MPI Runtimes and Caffe for Scalable Deep Learning on Modern GPU Clusters}},\n booktitle = {Proceedings of the 22Nd ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},\n series = {PPoPP '17},\n year = {2017},\n isbn = {978-1-4503-4493-7},\n location = {Austin, Texas, USA},\n pages = {193--205},\n numpages = {13},\n url = {http://doi.acm.org/10.1145/3018743.3018769},\n doi = {10.1145/3018743.3018769},\n acmid = {3018769},\n publisher = {ACM},\n address = {New York, NY, USA},\n keywords = {caffe, cuda-aware mpi, deep learning, distributed training, mpi\\_reduce},\n} \n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n An In-depth Performance Characterization of CPU- and GPU-based DNN Training on Modern Architectures.\n \n \n \n \n\n\n \n A. A. Awan; Subramoni, H.; and Panda, D.\n\n\n \n\n\n\n In Proceedings of the Machine Learning on HPC Environments, of MLHPC'17, pages 8:1–8:8, New York, NY, USA, 2017. ACM\n \n\n\n\n
\n\n\n\n \n \n \"AnPaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{awan-mlhpc17, \n author = {{A. A. Awan} and H. Subramoni and DK Panda},\n title = {{An In-depth Performance Characterization of CPU- and GPU-based DNN\n     Training on Modern Architectures}},\n booktitle = {Proceedings of the Machine Learning on HPC Environments},\n series = {MLHPC'17},\n year = {2017},\n isbn = {978-1-4503-5137-9},\n location = {Denver, CO, USA},\n pages = {8:1--8:8},\n articleno = {8},\n numpages = {8},\n url = {http://doi.acm.org/10.1145/3146347.3146356},\n doi = {10.1145/3146347.3146356},\n acmid = {3146356},\n publisher = {ACM},\n address = {New York, NY, USA},\n keywords = {Caffe, Deep Learning, High-Performance Computing, Pascal Architecture, Unified Memory},\n} \n\n
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2016\n \n \n (1)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Efficient Large Message Broadcast Using NCCL and CUDA-Aware MPI for Deep Learning.\n \n \n \n \n\n\n \n A. A. Awan,; Hamidouche, K.; Venkatesh, A.; and Panda, D.\n\n\n \n\n\n\n In Proceedings of the 23rd European MPI Users' Group Meeting, of EuroMPI 2016, pages 15–22, New York, NY, USA, 2016. ACM\n \n\n\n\n
\n\n\n\n \n \n \"EfficientPaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{awan-eurompi16,\n author = {{A. A. Awan,} and Hamidouche, K. and Venkatesh, A. and Panda, DK},\n title = {{Efficient Large Message Broadcast Using NCCL and CUDA-Aware MPI for\nDeep Learning}},\n booktitle = {Proceedings of the 23rd European MPI Users' Group Meeting},\n series = {EuroMPI 2016},\n year = {2016},\n isbn = {978-1-4503-4234-6},\n location = {Edinburgh, United Kingdom},\n pages = {15--22},\n numpages = {8},\n url = {http://doi.acm.org/10.1145/2966884.2966912},\n doi = {10.1145/2966884.2966912},\n acmid = {2966912},\n publisher = {ACM},\n address = {New York, NY, USA},\n}\n\n
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n\n\n\n
\n\n\n \n\n \n \n \n \n\n
\n"}; document.write(bibbase_data.data);