Implementing a Flexible, Fault Tolerant Job Management System for Science Gateways. Wannipurage, D., Marru, S., Pierce, M., Abeysinghe, E., Pamidighantam, S., Christie, M., Shenoy, G., Dhamnaskar, A., & Jayathilaka, L. In Proceedings of the Practice and Experience in Advanced Research Computing on Rise of the Machines (learning) - PEARC '19, pages 1-8, 2019. ACM Press. Paper Website doi abstract bibtex This paper summarizes our experiences evaluating and deploying a new task execution management system within the open source Apache Airavata framework for science gateways. We base our choices on our operational requirements and experiences running Airavata software as a multi-tenanted production service for multiple gateway clients. Our considerations include integrating semi-independent components, making major upgrades to those components while retaining the system's overall functionality, and choosing between integrating third party and in-house developed components. While we focus on Apache Airavata as the platform for evaluation, our results should be of general interest. After considering the options of extensions to our previous, in-house job management system using Apache Kafka or replacing it with Kubernetes, we ultimately chose Apache Helix, primarily for its ability to execute multiple tasks coupled into directed acyclic graphs. We have integrated this approach into Apache Airavata and have tested extensively over several months with many thousands of jobs, both from our internal throughput testing and operational tests with early adopter science gateway clients. The new system has proven to be at least as reliable as the previous system with the advantages that we now have simplified maintenance, do not need to support an in-house system that required extensive developer training to modify, and can support more sophisticated job execution scenarios.
@comment{Conference paper in the PEARC '19 proceedings. The entry key repeats
the value of the citation_key field. The fields websites, city, id, created,
accessed, file_attached, profile_id, last_modified, read, starred, authored,
confirmed, hidden, citation_key, private_publication and bibtype are
non-standard export fields; they are kept for the reference manager that
wrote them and are ignored by standard bibliography styles. The address field
mirrors city so that standard styles can print the publisher location.}
@inproceedings{Wannipurage2019,
  title = {Implementing a Flexible, Fault Tolerant Job Management System for Science Gateways},
  type = {inproceedings},
  year = {2019},
  pages = {1--8},
  websites = {http://dl.acm.org/citation.cfm?doid=3332186.3332233},
  publisher = {ACM Press},
  address = {New York, New York, USA},
  city = {New York, New York, USA},
  id = {81ea3925-59cb-30fb-8332-2af82619daec},
  created = {2019-10-01T17:21:29.424Z},
  accessed = {2019-09-12},
  file_attached = {true},
  profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d},
  last_modified = {2020-05-11T14:43:46.062Z},
  read = {false},
  starred = {false},
  authored = {true},
  confirmed = {true},
  hidden = {false},
  citation_key = {Wannipurage2019},
  private_publication = {false},
  abstract = {This paper summarizes our experiences evaluating and deploying a new task execution management system within the open source Apache Airavata framework for science gateways. We base our choices on our operational requirements and experiences running Airavata software as a multi-tenanted production service for multiple gateway clients. Our considerations include integrating semi-independent components, making major upgrades to those components while retaining the system's overall functionality, and choosing between integrating third party and in-house developed components. While we focus on Apache Airavata as the platform for evaluation, our results should be of general interest. After considering the options of extensions to our previous, in-house job management system using Apache Kafka or replacing it with Kubernetes, we ultimately chose Apache Helix, primarily for its ability to execute multiple tasks coupled into directed acyclic graphs. We have integrated this approach into Apache Airavata and have tested extensively over several months with many thousands of jobs, both from our internal throughput testing and operational tests with early adopter science gateway clients. The new system has proven to be at least as reliable as the previous system with the advantages that we now have simplified maintenance, do not need to support an in-house system that required extensive developer training to modify, and can support more sophisticated job execution scenarios.},
  bibtype = {inproceedings},
  author = {Wannipurage, Dimuthu and Marru, Suresh and Pierce, Marlon and Abeysinghe, Eroma and Pamidighantam, Sudhakar and Christie, Marcus and Shenoy, Gourav and Dhamnaskar, Ajinkya and Jayathilaka, Lahiru},
  doi = {10.1145/3332186.3332233},
  booktitle = {Proceedings of the Practice and Experience in Advanced Research Computing on Rise of the Machines (learning) - PEARC '19}
}
Downloads: 0
{"_id":"YBqrkJ7TvayPEL4Rh","bibbaseid":"wannipurage-marru-piece-abeysinghe-pamidighantam-christie-shenoy-dhamnaskar-etal-implementingaflexiblefaulttolerantjobmanagementsystemforsciencegateways-2019","authorIDs":["26TpkCWwrmpYgMESo","2cLQaiqKJtdrMW8gA","34XyRvwrKEz28h5HM","3JHMZbw6Ybw6CPpMp","3cbpf5NaiqFRLocDD","3rXAgvuZuMewLMmS3","4M7LHNsndiDDoYBnR","4PCkR9QSMDcnTsGcd","4Qj3AdLavbzSFHJun","4tnusnsz5WektoLvh","4zJHhmXsP6CZ89TzR","5MpqyLB8F2davtdfy","5XmPyBfeEvRDFjQxw","5aa6bdb46f3db6da06000078","5de743946a9377de0100009f","5de894e62c5eb2df010000a8","5de9e63d9f521ddf010000d4","5deb383229c0bddf01000073","5dec89f912c53edf01000090","5deddc850e26d5de010000d0","5def2d93e83f7dde010000a2","5df07e86e49680f2010000bb","5df1d0051070c8ef010000b5","5df322403b310cde010000d4","5df4728d95416ade01000082","5df5c46e64d1d0df01000066","5df71661d581f5df010000a4","5df8685ab99bcff30100007e","5df9b9cea344e2ef010000a3","5dfb0b17a8dc13ee01000090","5dfc5e8cecededde01000075","5dfdae0ecdf7c8de0100006b","5dff0172e25b1bde01000081","5e00513063155bde01000067","5e01a2d6b81d0dde01000069","5e02f3f30aba75df0100004e","5e0445a1705486df01000070","5e05976ed36daddf0100005e","5e06e7342eda19df0100006b","5e083a53f26b53de01000044","5e098b1a83a3f3de01000052","5e0adce2ee633cde01000070","5e0c2edd024014de0100003a","5e0d7ffda04810df0100008a","5e0ed0b6b68da9de01000064","5e1022982ef76bdf01000069","5e1174b07da100de0100010e","5e12c5df70e2c4f201000051","5e141573d64f35df0100005e","5e1568ce1e2528de01000075","5e16bab2dc7739de010000ca","5e180c339a3d0bde01000091","5e195c749eca7cdf01000059","5e1aaf6f9da435de01000061","5e1c009dbadffbde0100008d","5e1d53a0e32442de010000b9","5e1ea21fbedb58de0100007f","5e1f6dcbe8f5ddde010001a7","5e1ff45714d3c9de0100008b","5e2144503a0076de0100005a","5e229788a9b15cde01000079","5e2506682e79a1f20100008c","5e268c51d297d6de0100006a","5e27dd5668d625de01000099","5e292de577c165de01000049","5e2a7eb6881468de0100004e","5e2bd135dbdf7bdf0100004a","5e2d220a4e7fefde01000051","5e2e73408e08f0de01000074","5e2fc4bbe075a2d
f0100006f","5e3115b2714ff7de01000050","5e3267d65633c9de01000071","5e33ba477bf3ecde01000082","5e350acc8c7375de01000057","5e365c9b4b25bbf20100008f","5e37adf0e84c4cf20100009f","5e38ff2fdc5b8ade0100007a","5e3a643ab291e7df01000024","5e3ba26369c38bde01000086","5e3cf41bad8243de0100007d","5e3e45e7018e1dde01000044","5e3f9732cecf86de0100013d","5e40e9c7fd6934df010000a9","5e4239c6ac099bde0100007c","5e438c05639a35de0100008f","5e446222084293df01000012","5e446efe084293df010000f0","5e44de3aab9cedde01000095","5e462e527f6322df01000077","5e477fdb27a0c8de01000069","5e48d1dcd8606ddf01000055","5e4a23d2d2dbc8de01000078","5e4b74a3a6c158df01000086","5e4cc5dc0cc7d3de0100005b","5e4e1791d116fbde0100009f","5e4f6b36a01931de0100006a","5e50ba5fe3d144df01000048","5e520bcfbba759e801000051","5e535d8545815bf20100006e","5e54aef5929495df01000088","5e560076819fabdf01000065","5e57520c18f14bdf01000067","5e5833be1f3fc8de010000ad","5e58a389aadc3adf01000079","5e5921cb9bb6d2df010001bb","5e5935b4e60e02de01000112","5e59362ee60e02de0100011c","5e593e19e60e02de010001b2","5e59534af5822adf01000143","5e59f4e56ef27ddf01000069","5e5b465d74a3e7df010000a7","5e5c98039933dade01000060","5e5de97f863279df01000090","5e5e4ae6d3955dde01000192","5e5f3ad89771b7de010000a2","5e608c731fc211de0100007f","5e61de0a0c95fdde0100006d","5e632f86716092de0100005f","5e648115e89ef4df01000050","5e65c480d92058de010000f6","5e6716ce511133df01000103","5e6867d6149172de010001dc","5e6901a58457bade01000228","5e69b8f123ebccde0100025b","5e6b0a6ad43119df0100000d","5nQfdBxq7KvcwZP9o","63MkeDCNpCZ4xssoD","64spuKvNJCT3aviq9","6HzGWAEoWkRqHpdjG","6WQ6JPYPAQyLBQRNd","6o3q8oBYeYiFwXL2a","6ubHGtRit8RABKw7Y","6vK2NdfCEfWSvKnDA","7HhHePLRJKGB3kBrd","7wK4pkZH6wggAzgQr","863dropDEquMxTCiH","8Z5uyf9PnbHciosNg","8k3fha2gmdGNpoZPk","8xaAoPWJ5foQmcb74","BQtnmXwD852T3QAhG","BZtPcwebBGFJkGgnd","BauJ9QFnez5kjX7Ha","Baud4v9K4Z7pyXERN","CGQ4XNPXx24BaCD8x","CToSjfW8undphvqSC","D94M3cLRFfSWmYDor","DBRu4P7HXzgmfDJrW","EGQqkqMPdk95vYTTN","ETC97ymK2H3jfkp5q","EfBf2N7N4y5YfeAL9","F5k
tRFvTaDnZjj2A8","FMZRkadGwoo4erGXH","FutcmSS6iySoqmcyu","GC3ESQdqaLPtJ66N4","GC7xio9fNNvDh86xd","GLPXcuzwskk34ARcT","GXcxSCGXKyreEEggx","GZHaazRD88ZMJeQnk","GoG6JqboTcitzbf9M","GsMb4N4hDrqKf2hnH","HFeWfm5WnZTT8pnQD","Hh66omsP9joNwTgPL","Hhz9DHyqCZJkFTxmP","J42SJNuxphpHBtfd6","JSy3P5KaDYjgZ4uv8","JZzGKmvrSWDP7tiXR","K2WdeKrv7Sskqy7wc","KHzSeKiwivaCDfCsr","L2EErzRkXkgo8fw4d","MYauYMeXChHDgdTf2","PFDpj4AiHd6omJA44","PHqCexMSuM4nPEQxh","PS5oS73H9qaXDJpCk","PTStsjBRJTxTcbY3f","PYkMRKRh5Dz5hPq8j","Pf6vXNt4HiQMat8RB","PuGdX6yXPtt3htBpC","QsxKLrARWMgAJSsXt","R4zxD2hsQuk3EQ3Fj","R7myyt4pva8kYZagc","R8j8M9wY9q6ayGnSJ","Rk8sktqfTtga56wow","SMihS7kYndho3ASnT","ShvHbm8Pmh6fTY5X6","SvgecvneoWBEMkhA3","TLbEtdiZtCi8oM8yy","W2YaeDjAs8Dza8ZBt","W4owS3iF3uchp2LfA","WFoDSHHRdCWSEA9WG","WK9tdkuXbFq66vy9o","WpQFwqKLrSuLHrikh","XbKk7KT4i7zJgtqix","XhE8t8H7dpYYrwK3w","XpXp2FaKYaJtTfqiS","Y54mw2dgoEdip5xkN","YWJ6WKsLLzFJ742ER","YfjNjsHy4j2Mnv7Nk","YhJL7jdpQ8MatMFNn","ZnCLfqsBmAZvCWpr6","a3Dc2EXgLcj3xXMHF","afXJ9pvM5Mf5BJx83","avLcLadtR6kA2sQPA","axLnupWrPSKDEZY2b","b3rqk6uf5jauRjQqj","b6TxHMatZ9d83hrdM","bEjW6TADWN4XiCX52","bgT6nr8xYCDEMwbHy","bgjMHANwYDK2zKSiN","bkaYmdbuXtSfFkPLT","cFbQYiKBuZLyZ3fWm","cFrBBZbxKrgKccgdP","cJzEjtbdJxeCi2nTo","d64nQxohnsLajRQFk","dHpcMeRxpmxJRcREM","dQ5MeibqFvJfsE8Ye","e9imNi7DM7qASNKse","esHNP2upQGCyJWXAa","etQCABxLHx2dkZxcp","f9X4KvtJprKvL4WSa","faav2ZnwDeBTesycS","fm7Qxe9dMjij2rjnD","frtSh9tSy6oxYW3J3","g2EhKwTBpeKGjeBkE","g4ZFQwGEnHbG8sSDy","gjKhegeXXCPcn3DzM","gpZASzBbGSvPHuik9","gqgKaKNNta5gzrsmd","hveiSbSddmfyFTujr","hyX7REsvTYX36dufL","i3GYvt5MjAiEtknzo","i4eRrFKWNihy3zmQb","iDEb9EzSC2nRPDKR6","iQ5MFX5vAdjD9HvTu","iT5XaDr8k4ZrpiSAD","iaXZos4mKpFTbyY6d","idqfABFy38Rx4Q8Hk","iqN8j7hW3oH5SHv3W","isEfwdbJnm3zH7Ccz","j24MYZZPwJ2dRzi3Y","jQQtqgyqXNWFo8FY3","jTJAr6sLwZAKJMK4i","jWc3FfkQdaPfaZkqy","jasJsFMvgzHJ9QBQG","jd2kuxTzWSw4eeS85","jeMawkfjP9ffWTJxQ","jvBJxCposvFXANrTZ","jx3RpF7e37c9xgboP","k4LtFwugzxaqmwhii","k8bfJqqwy4PF5W33H","kZk3DSqq7WAvLEudX","m2R
XghL6kKwXwwWbJ","m4mQHYXYG5N2Sw5C7","mG7RtktngDaqwvAPk","mgGYf728HmSoNgyZw","mnihmtwSKkFd2fPv7","n8q6NBDKxXkLc9Xpo","nFsZW6z5n5gTEDJvA","nWWyjjLdJMNhkJBeG","ngAkg46TkXE6nMA7G","oNkw2JfNQ7XcKuT92","oc9mQxK6GhzM6Sq7T","orsgoD5dFCtQJ6axK","pDbKk9JdfJ7XYNwR7","pQWbfkuYRhqC8do4x","pTQvF8KsbK27NFax4","q27gDsrzq5k36dKLH","q6SmZuoaPsLj3HQzH","r2dMcXBtDbY7YexDB","rxzDvRDoeNZ23kNsX","s59JYNTg84MsAMroM","szGdHiDjrWtX7SjjX","tL8kdjBv7wPP6goxZ","tjCfkb2eN6NWbxJqz","tjjThCsZ5yqyJKL9X","uLL7d7bYL9EJf9GdW","uSYyoQd3NzzwENXWt","uaFTDWw5KZ62HF8af","udTjQmC4bYCHZiEP9","uf7CQqT6ctkmBYHzn","uqTfAbFD4yrnrto68","ureTakRLhorfNgKsd","uwB96o63GXbzdTXeG","uyZvJDaWooatHuGJi","voccnn45PCnaTpWei","w9dbjheuRF8idEBhD","wFR4nDPwhwgTm7mBr","wJRFNWr359ghy6YGm","x5FBa5mkMyQrQqRiL","xDLn9CQWkFhPcYQZv","xPLafGGigneFehZzZ","xbWS5CyTHbRJs2Sti","yWnNsH77PibRrEcHG","ypxAHExitq3eTsnYg","z2nTxrYsgNoELTG6o","z4W58kgj4cQKjZ5q6","zG3kbvL5cYfyNeuhW","zS5nkbbuS2yvrvjdZ"],"author_short":["Wannipurage, D.","Marru, S.","Piece, M.","Abeysinghe, E.","Pamidighantam, S.","Christie, M.","Shenoy, G.","Dhamnaskar, A.","Jayathilaka, L."],"bibdata":{"title":"Implementing a Flexible, Fault Tolerant Job Management System for Science Gateways","type":"inproceedings","year":"2019","pages":"1-8","websites":"http://dl.acm.org/citation.cfm?doid=3332186.3332233","publisher":"ACM Press","city":"New York, New York, USA","id":"81ea3925-59cb-30fb-8332-2af82619daec","created":"2019-10-01T17:21:29.424Z","accessed":"2019-09-12","file_attached":"true","profile_id":"42d295c0-0737-38d6-8b43-508cab6ea85d","last_modified":"2020-05-11T14:43:46.062Z","read":false,"starred":false,"authored":"true","confirmed":"true","hidden":false,"citation_key":"Wannipurage2019","private_publication":false,"abstract":"This paper summarizes our experiences evaluating and deploying a new task execution management system within the open source Apache Airavata framework for science gateways. 
We base our choices on our operational requirements and experiences running Airavata software as a multi-tenanted production service for multiple gateway clients. Our considerations include integrating semi-independent components, making major upgrades to those components while retaining the system's overall functionality, and choosing between integrating third party and in-house developed components. While we focus on Apache Airavata as the platform for evaluation, our results should be of general interest. After considering the options of extensions to our previous, in-house job management system using Apache Kafka or replacing it with Kubernetes, we ultimately chose Apache Helix, primarily for its ability to execute multiple tasks coupled into directed acyclic graphs. We have integrated this approach into Apache Airavata and have tested extensively over several months with many thousands of jobs, both from our internal throughput testing and operational tests with early adopter science gateway clients. 
The new system has proven to be at least as reliable as the previous system with the advantages that we now have simplified maintenance, do not need to support an in-house system that required extensive developer training to modify, and can support more sophisticated job execution scenarios.","bibtype":"inproceedings","author":"Wannipurage, Dimuthu and Marru, Suresh and Piece, Marlon and Abeysinghe, Eroma and Pamidighantam, Sudhakar and Christie, Marcus and Shenoy, Gourav and Dhamnaskar, Ajinkya and Jayathilaka, Lahiru","doi":"10.1145/3332186.3332233","booktitle":"Proceedings of the Practice and Experience in Advanced Research Computing on Rise of the Machines (learning) - PEARC '19","bibtex":"@inproceedings{\n title = {Implementing a Flexible, Fault Tolerant Job Management System for Science Gateways},\n type = {inproceedings},\n year = {2019},\n pages = {1-8},\n websites = {http://dl.acm.org/citation.cfm?doid=3332186.3332233},\n publisher = {ACM Press},\n city = {New York, New York, USA},\n id = {81ea3925-59cb-30fb-8332-2af82619daec},\n created = {2019-10-01T17:21:29.424Z},\n accessed = {2019-09-12},\n file_attached = {true},\n profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d},\n last_modified = {2020-05-11T14:43:46.062Z},\n read = {false},\n starred = {false},\n authored = {true},\n confirmed = {true},\n hidden = {false},\n citation_key = {Wannipurage2019},\n private_publication = {false},\n abstract = {This paper summarizes our experiences evaluating and deploying a new task execution management system within the open source Apache Airavata framework for science gateways. We base our choices on our operational requirements and experiences running Airavata software as a multi-tenanted production service for multiple gateway clients. 
Our considerations include integrating semi-independent components, making major upgrades to those components while retaining the system's overall functionality, and choosing between integrating third party and in-house developed components. While we focus on Apache Airavata as the platform for evaluation, our results should be of general interest. After considering the options of extensions to our previous, in-house job management system using Apache Kafka or replacing it with Kubernetes, we ultimately chose Apache Helix, primarily for its ability to execute multiple tasks coupled into directed acyclic graphs. We have integrated this approach into Apache Airavata and have tested extensively over several months with many thousands of jobs, both from our internal throughput testing and operational tests with early adopter science gateway clients. The new system has proven to be at least as reliable as the previous system with the advantages that we now have simplified maintenance, do not need to support an in-house system that required extensive developer training to modify, and can support more sophisticated job execution scenarios.},\n bibtype = {inproceedings},\n author = {Wannipurage, Dimuthu and Marru, Suresh and Piece, Marlon and Abeysinghe, Eroma and Pamidighantam, Sudhakar and Christie, Marcus and Shenoy, Gourav and Dhamnaskar, Ajinkya and Jayathilaka, Lahiru},\n doi = {10.1145/3332186.3332233},\n booktitle = {Proceedings of the Practice and Experience in Advanced Research Computing on Rise of the Machines (learning) - PEARC '19}\n}","author_short":["Wannipurage, D.","Marru, S.","Piece, M.","Abeysinghe, E.","Pamidighantam, S.","Christie, M.","Shenoy, G.","Dhamnaskar, A.","Jayathilaka, 
L."],"urls":{"Paper":"https://bibbase.org/service/mendeley/42d295c0-0737-38d6-8b43-508cab6ea85d/file/55631bf4-357b-5a3f-5b2f-a306c37a7462/Wannipurage_et_al___2019___Implementing_a_Flexible_Fault_Tolerant_Job_Management_System_for_Science_Gateways3.pdf.pdf","Website":"http://dl.acm.org/citation.cfm?doid=3332186.3332233"},"biburl":"https://bibbase.org/service/mendeley/42d295c0-0737-38d6-8b43-508cab6ea85d","bibbaseid":"wannipurage-marru-piece-abeysinghe-pamidighantam-christie-shenoy-dhamnaskar-etal-implementingaflexiblefaulttolerantjobmanagementsystemforsciencegateways-2019","role":"author","metadata":{"authorlinks":{"pierce, m":"https://bibbase.org/service/mendeley/42d295c0-0737-38d6-8b43-508cab6ea85d/group/0e433c5b-85c4-32aa-851c-c145aac9f80f"}},"downloads":0},"bibtype":"inproceedings","creationDate":"2019-09-13T04:29:10.738Z","downloads":0,"keywords":[],"search_terms":["implementing","flexible","fault","tolerant","job","management","system","science","gateways","wannipurage","marru","piece","abeysinghe","pamidighantam","christie","shenoy","dhamnaskar","jayathilaka"],"title":"Implementing a Flexible, Fault Tolerant Job Management System for Science Gateways","year":2019,"biburl":"https://bibbase.org/service/mendeley/42d295c0-0737-38d6-8b43-508cab6ea85d","dataSources":["zgahneP4uAjKbudrQ","ya2CyA73rpZseyrZ8","2252seNhipfTmjEBQ"]}