Optimizing latency and throughput for spawning processes on massively multicore processors

Optimizing latency and throughput for spawning processes on massively multicore processors. Kulkarni, A., Lumsdaine, A., Lang, M., & Ionkov, L. In Proceedings of the 2nd International Workshop on Runtime and Operating Systems for Supercomputers, ROSS 2012 - In Conjunction with: ICS 2012, 2012.

Website doi abstract bibtex

The execution of a SPMD application involves running multiple instances of a process with possibly varying arguments. With the widespread adoption of massively multicore processors, there has been a focus towards harnessing the abundant compute resources effectively in a power-efficient manner. Although much work has been done towards optimizing distributed process launch using hierarchical techniques, there has been a void in studying the performance of spawning processes within a single node. Reducing the latency to spawn a new process locally results in faster global job launch. Further, emerging dynamic and resilient execution models are designed on the premise of maintaining process pools for fault isolation and launching several processes in a relatively shorter period of time. Optimizing the latency and throughput for spawning processes would help improve the overall performance of runtime systems, allow adaptive process-replication reliability and motivate the design and implementation of process management interfaces in future manycore operating systems. In this paper, we study the several limiting factors for efficient spawning of processes on massively multicore architectures. We have developed a library to optimize launching multiple instances of the same executable. Our microbenchmarks show a 20-80% decrease in the process spawn time for multiple executables. We further discuss the effects of memory locality and propose NUMA-aware extensions to optimize launching processes with large memory-mapped segments including dynamic shared libraries. Finally, we describe vector operating system interfaces for spawning a batch of processes from a given executable on specific cores. Our results show a 50x speedup over the traditional method of launching new processes using fork and exec system calls. © 2012 ACM.

@comment{ROSS 2012 workshop paper. Standard bibliographic fields are listed first; the fields after doi are exported bookkeeping metadata that standard bibliography styles ignore.}
@inproceedings{Kulkarni2012,
 author = {Kulkarni, Abhishek and Lumsdaine, Andrew and Lang, Michael and Ionkov, Latchesar},
 title = {Optimizing latency and throughput for spawning processes on massively multicore processors},
 booktitle = {Proceedings of the 2nd International Workshop on Runtime and Operating Systems for Supercomputers, ROSS 2012 - In Conjunction with: ICS 2012},
 year = {2012},
 month = jun,
 doi = {10.1145/2318916.2318924},
 type = {inproceedings},
 keywords = {Compute resources,Distributed process,Executable,Intelligent control,Launching,Optimization,Software architec},
 websites = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-84864764857&doi=10.1145%2F2318916.2318924&partnerID=40&md5=9206e53cd830e95869cf274a2eb31845},
 city = {Venice},
 id = {ec553bc7-98a2-3174-a8ad-5db62152fec5},
 created = {2017-11-27T19:30:57.063Z},
 file_attached = {false},
 profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d},
 last_modified = {2018-03-12T19:03:32.192Z},
 read = {false},
 starred = {false},
 authored = {true},
 confirmed = {true},
 hidden = {false},
 citation_key = {Kulkarni2012},
 source_type = {conference},
 notes = {From Duplicate 1 (Optimizing latency and throughput for spawning processes on massively multicore processors - Kulkarni, A; Lumsdaine, A; Lang, M; Ionkov, L) cited By 2; Conference of 2nd International Workshop on Runtime and Operating Systems for Supercomputers, ROSS 2012 - In Conjunction with: ICS 2012 ; Conference Date: 29 June 2012 Through 29 June 2012; Conference Code:91612},
 folder_uuids = {a0f5ac31-a393-4a7b-b7db-64a126a80f6e},
 private_publication = {false},
 abstract = {The execution of a SPMD application involves running multiple instances of a process with possibly varying arguments. With the widespread adoption of massively multicore processors, there has been a focus towards harnessing the abundant compute resources effectively in a power-efficient manner. Although much work has been done towards optimizing distributed process launch using hierarchical techniques, there has been a void in studying the performance of spawning processes within a single node. Reducing the latency to spawn a new process locally results in faster global job launch. Further, emerging dynamic and resilient execution models are designed on the premise of maintaining process pools for fault isolation and launching several processes in a relatively shorter period of time. Optimizing the latency and throughput for spawning processes would help improve the overall performance of runtime systems, allow adaptive process-replication reliability and motivate the design and implementation of process management interfaces in future manycore operating systems. In this paper, we study the several limiting factors for efficient spawning of processes on massively multicore architectures. We have developed a library to optimize launching multiple instances of the same executable. Our microbenchmarks show a 20-80\% decrease in the process spawn time for multiple executables. We further discuss the effects of memory locality and propose NUMA-aware extensions to optimize launching processes with large memory-mapped segments including dynamic shared libraries. Finally, we describe vector operating system interfaces for spawning a batch of processes from a given executable on specific cores. Our results show a 50x speedup over the traditional method of launching new processes using fork and exec system calls. {\copyright} 2012 ACM.},
 bibtype = {inproceedings}
}

Downloads: 0

{"_id":"cbdPyxf2ZAGDZMgHy","bibbaseid":"lumsdaine-ionkov-kulkarni-lumsdaine-lang-ionkov-optimizinglatencyandthroughputforspawningprocessesonmassivelymulticoreprocessors-2012","downloads":0,"creationDate":"2018-03-12T19:10:27.404Z","title":"Optimizing latency and throughput for spawning processes on massively multicore processors","author_short":["Lumsdaine, A., K., A.","Ionkov, M., L., L.","Kulkarni, A.","Lumsdaine, A., K., A.","Lang, M.","Ionkov, L."],"year":2012,"bibtype":"inproceedings","biburl":"https://bibbase.org/service/mendeley/42d295c0-0737-38d6-8b43-508cab6ea85d","bibdata":{"title":"Optimizing latency and throughput for spawning processes on massively multicore processors","type":"inproceedings","year":"2012","keywords":"Compute resources,Distributed process,Executable,Intelligent control,Launching,Optimization,Software architec","websites":"https://www.scopus.com/inward/record.uri?eid=2-s2.0-84864764857&doi=10.1145%2F2318916.2318924&partnerID=40&md5=9206e53cd830e95869cf274a2eb31845","city":"Venice","id":"ec553bc7-98a2-3174-a8ad-5db62152fec5","created":"2017-11-27T19:30:57.063Z","file_attached":false,"profile_id":"42d295c0-0737-38d6-8b43-508cab6ea85d","last_modified":"2018-03-12T19:03:32.192Z","read":false,"starred":false,"authored":"true","confirmed":"true","hidden":false,"citation_key":"Kulkarni2012","source_type":"conference","notes":"From Duplicate 1 (Optimizing latency and throughput for spawning processes on massively multicore processors - Kulkarni, A; Lumsdaine, A; Lang, M; Ionkov, L) cited By 2; Conference of 2nd International Workshop on Runtime and Operating Systems for Supercomputers, ROSS 2012 - In Conjunction with: ICS 2012 ; Conference Date: 29 June 2012 Through 29 June 2012; Conference Code:91612","folder_uuids":"a0f5ac31-a393-4a7b-b7db-64a126a80f6e","private_publication":false,"abstract":"The execution of a SPMD application involves running multiple instances of a process with possibly varying arguments. 
With the widespread adoption of massively multicore processors, there has been a focus towards harnessing the abundant compute resources effectively in a power-efficient manner. Although much work has been done towards optimizing distributed process launch using hierarchical techniques, there has been a void in studying the performance of spawning processes within a single node. Reducing the latency to spawn a new process locally results in faster global job launch. Further, emerging dynamic and resilient execution models are designed on the premise of maintaining process pools for fault isolation and launching several processes in a relatively shorter period of time. Optimizing the latency and throughput for spawning processes would help improve the overall performance of runtime systems, allow adaptive process-replication reliability and motivate the design and implementation of process management interfaces in future manycore operating systems. In this paper, we study the several limiting factors for efficient spawning of processes on massively multicore architectures. We have developed a library to optimize launching multiple instances of the same executable. Our microbenchmarks show a 20-80% decrease in the process spawn time for multiple executables. We further discuss the effects of memory locality and propose NUMA-aware extensions to optimize launching processes with large memory-mapped segments including dynamic shared libraries. Finally, we describe vector operating system interfaces for spawning a batch of processes from a given executable on specific cores. Our results show a 50x speedup over the traditional method of launching new processes using fork and exec system calls. 
© 2012 ACM.","bibtype":"inproceedings","author":"Lumsdaine, Abhishek Kulkarni Andrew and Ionkov, Michael Lang Latchesar and Kulkarni, A and Lumsdaine, Abhishek Kulkarni Andrew and Lang, M and Ionkov, L","doi":"10.1145/2318916.2318924","booktitle":"Proceedings of the 2nd International Workshop on Runtime and Operating Systems for Supercomputers, ROSS 2012 - In Conjunction with: ICS 2012","bibtex":"@inproceedings{\n title = {Optimizing latency and throughput for spawning processes on massively multicore processors},\n type = {inproceedings},\n year = {2012},\n keywords = {Compute resources,Distributed process,Executable,Intelligent control,Launching,Optimization,Software architec},\n websites = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-84864764857&doi=10.1145%2F2318916.2318924&partnerID=40&md5=9206e53cd830e95869cf274a2eb31845},\n city = {Venice},\n id = {ec553bc7-98a2-3174-a8ad-5db62152fec5},\n created = {2017-11-27T19:30:57.063Z},\n file_attached = {false},\n profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d},\n last_modified = {2018-03-12T19:03:32.192Z},\n read = {false},\n starred = {false},\n authored = {true},\n confirmed = {true},\n hidden = {false},\n citation_key = {Kulkarni2012},\n source_type = {conference},\n notes = {From Duplicate 1 (Optimizing latency and throughput for spawning processes on massively multicore processors - Kulkarni, A; Lumsdaine, A; Lang, M; Ionkov, L) cited By 2; Conference of 2nd International Workshop on Runtime and Operating Systems for Supercomputers, ROSS 2012 - In Conjunction with: ICS 2012 ; Conference Date: 29 June 2012 Through 29 June 2012; Conference Code:91612},\n folder_uuids = {a0f5ac31-a393-4a7b-b7db-64a126a80f6e},\n private_publication = {false},\n abstract = {The execution of a SPMD application involves running multiple instances of a process with possibly varying arguments. 
With the widespread adoption of massively multicore processors, there has been a focus towards harnessing the abundant compute resources effectively in a power-efficient manner. Although much work has been done towards optimizing distributed process launch using hierarchical techniques, there has been a void in studying the performance of spawning processes within a single node. Reducing the latency to spawn a new process locally results in faster global job launch. Further, emerging dynamic and resilient execution models are designed on the premise of maintaining process pools for fault isolation and launching several processes in a relatively shorter period of time. Optimizing the latency and throughput for spawning processes would help improve the overall performance of runtime systems, allow adaptive process-replication reliability and motivate the design and implementation of process management interfaces in future manycore operating systems. In this paper, we study the several limiting factors for efficient spawning of processes on massively multicore architectures. We have developed a library to optimize launching multiple instances of the same executable. Our microbenchmarks show a 20-80% decrease in the process spawn time for multiple executables. We further discuss the effects of memory locality and propose NUMA-aware extensions to optimize launching processes with large memory-mapped segments including dynamic shared libraries. Finally, we describe vector operating system interfaces for spawning a batch of processes from a given executable on specific cores. Our results show a 50x speedup over the traditional method of launching new processes using fork and exec system calls. 
© 2012 ACM.},\n bibtype = {inproceedings},\n author = {Lumsdaine, Abhishek Kulkarni Andrew and Ionkov, Michael Lang Latchesar and Kulkarni, A and Lumsdaine, Abhishek Kulkarni Andrew and Lang, M and Ionkov, L},\n doi = {10.1145/2318916.2318924},\n booktitle = {Proceedings of the 2nd International Workshop on Runtime and Operating Systems for Supercomputers, ROSS 2012 - In Conjunction with: ICS 2012}\n}","author_short":["Lumsdaine, A., K., A.","Ionkov, M., L., L.","Kulkarni, A.","Lumsdaine, A., K., A.","Lang, M.","Ionkov, L."],"urls":{"Website":"https://www.scopus.com/inward/record.uri?eid=2-s2.0-84864764857&doi=10.1145%2F2318916.2318924&partnerID=40&md5=9206e53cd830e95869cf274a2eb31845"},"biburl":"https://bibbase.org/service/mendeley/42d295c0-0737-38d6-8b43-508cab6ea85d","bibbaseid":"lumsdaine-ionkov-kulkarni-lumsdaine-lang-ionkov-optimizinglatencyandthroughputforspawningprocessesonmassivelymulticoreprocessors-2012","role":"author","keyword":["Compute resources","Distributed process","Executable","Intelligent control","Launching","Optimization","Software architec"],"metadata":{"authorlinks":{}},"downloads":0},"search_terms":["optimizing","latency","throughput","spawning","processes","massively","multicore","processors","lumsdaine","ionkov","kulkarni","lumsdaine","lang","ionkov"],"keywords":["compute resources","distributed process","executable","intelligent control","launching","optimization","software architec"],"authorIDs":[],"dataSources":["zgahneP4uAjKbudrQ","ya2CyA73rpZseyrZ8","2252seNhipfTmjEBQ"]}