ProvErr: System level statistical fault diagnosis using dependency model

ProvErr: System level statistical fault diagnosis using dependency model. Chen, P. & Plale, B. In Proceedings - 2015 IEEE/ACM 15th International Symposium on Cluster, Cloud, and Grid Computing, CCGrid 2015, 2015.
doi abstract bibtex

© 2015 IEEE. Large-scale distributed systems are difficult to debug in the event of failure. Yet rapid fault diagnosis that pinpoints failures to the component level is critical to fast recovery. We introduce a statistical approach to fault diagnosis that utilizes a dependency graph of execution to automatically discover the most probable fault cause(s) at a component level (either software or hardware resource). This approach leverages engineers' high level understanding of the system and requires a very small amount of information compared to existing methods. It also utilizes dependency information to eliminate redundant causes while retaining co-causes. Experiments using Apache Pig show that our approach has good, robust performance for diagnosing software bugs and resource shortages, and scales nearly linearly as system size increases.

@comment{
 Conference paper by Chen and Plale in the CCGrid 2015 proceedings.
 The entry key Chen2015a mirrors the value stored in the citation_key field.
 Fields such as id, created, profile_id, last_modified, read, starred,
 authored, confirmed, hidden, folder_uuids, private_publication and bibtype
 are exported bookkeeping metadata; they are not standard bibliographic
 fields and are ignored by standard styles.
 ProvErr is braced in the title so sentence-casing styles keep its capitals.
}
@inproceedings{Chen2015a,
 title = {{ProvErr}: System level statistical fault diagnosis using dependency model},
 type = {inproceedings},
 year = {2015},
 id = {d97bfbf5-2463-3752-9dd8-376e89c033c6},
 created = {2019-10-01T17:20:59.446Z},
 file_attached = {false},
 profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d},
 last_modified = {2019-10-01T17:24:04.831Z},
 read = {false},
 starred = {false},
 authored = {true},
 confirmed = {true},
 hidden = {false},
 citation_key = {Chen2015a},
 folder_uuids = {73f994b4-a3be-4035-a6dd-3802077ce863},
 private_publication = {false},
 abstract = {© 2015 IEEE. Large-scale distributed systems are difficult to debug in the event of failure. Yet rapid fault diagnosis that pinpoints failures to the component level is critical to fast recovery. We introduce a statistical approach to fault diagnosis that utilizes a dependency graph of execution to automatically discover the most probable fault cause(s) at a component level (either software or hardware resource). This approach leverages engineers' high level understanding of the system and requires a very small amount of information compared to existing methods. It also utilizes dependency information to eliminate redundant causes while retaining co-causes. Experiments using Apache Pig show that our approach has good, robust performance for diagnosing software bugs and resource shortages, and scales nearly linearly as system size increases.},
 bibtype = {inproceedings},
 author = {Chen, P. and Plale, B. A.},
 doi = {10.1109/CCGrid.2015.86},
 booktitle = {Proceedings - 2015 IEEE/ACM 15th International Symposium on Cluster, Cloud, and Grid Computing, CCGrid 2015}
}

Downloads: 0

{"_id":"8CDgTga5A8WDufnMz","bibbaseid":"chen-plale-proverrsystemlevelstatisticalfaultdiagnosisusingdependencymodel-2015","downloads":0,"creationDate":"2018-03-12T19:10:27.442Z","title":"ProvErr: System level statistical fault diagnosis using dependency model","author_short":["Chen, P.","Plale, B."],"year":2015,"bibtype":"inproceedings","biburl":"https://bibbase.org/service/mendeley/42d295c0-0737-38d6-8b43-508cab6ea85d","bibdata":{"title":"ProvErr: System level statistical fault diagnosis using dependency model","type":"inproceedings","year":"2015","id":"d97bfbf5-2463-3752-9dd8-376e89c033c6","created":"2019-10-01T17:20:59.446Z","file_attached":false,"profile_id":"42d295c0-0737-38d6-8b43-508cab6ea85d","last_modified":"2019-10-01T17:24:04.831Z","read":false,"starred":false,"authored":"true","confirmed":"true","hidden":false,"citation_key":"Chen2015a","folder_uuids":"73f994b4-a3be-4035-a6dd-3802077ce863","private_publication":false,"abstract":"© 2015 IEEE. Large-scale distributed systems are difficult to debug in the event of failure. Yet rapid fault diagnosis that pinpoints failures to the component level is critical to fast recovery. We introduce a statistical approach to fault diagnosis that utilizes a dependency graph of execution to automatically discover the most probable fault cause(s) at a component level (either software or hardware resource). This approach leverages engineers' high level understanding of the system and requires a very small amount of information compared to existing methods. It also utilizes dependency information to eliminate redundant causes while retaining co-causes. Experiments using Apache Pig show that our approach has good, robust performance for diagnosing software bugs and resource shortages, and scales nearly linearly as system size increases.","bibtype":"inproceedings","author":"Chen, P. 
and Plale, B.A.","doi":"10.1109/CCGrid.2015.86","booktitle":"Proceedings - 2015 IEEE/ACM 15th International Symposium on Cluster, Cloud, and Grid Computing, CCGrid 2015","bibtex":"@inproceedings{\n title = {ProvErr: System level statistical fault diagnosis using dependency model},\n type = {inproceedings},\n year = {2015},\n id = {d97bfbf5-2463-3752-9dd8-376e89c033c6},\n created = {2019-10-01T17:20:59.446Z},\n file_attached = {false},\n profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d},\n last_modified = {2019-10-01T17:24:04.831Z},\n read = {false},\n starred = {false},\n authored = {true},\n confirmed = {true},\n hidden = {false},\n citation_key = {Chen2015a},\n folder_uuids = {73f994b4-a3be-4035-a6dd-3802077ce863},\n private_publication = {false},\n abstract = {© 2015 IEEE. Large-scale distributed systems are difficult to debug in the event of failure. Yet rapid fault diagnosis that pinpoints failures to the component level is critical to fast recovery. We introduce a statistical approach to fault diagnosis that utilizes a dependency graph of execution to automatically discover the most probable fault cause(s) at a component level (either software or hardware resource). This approach leverages engineers' high level understanding of the system and requires a very small amount of information compared to existing methods. It also utilizes dependency information to eliminate redundant causes while retaining co-causes. Experiments using Apache Pig show that our approach has good, robust performance for diagnosing software bugs and resource shortages, and scales nearly linearly as system size increases.},\n bibtype = {inproceedings},\n author = {Chen, P. 
and Plale, B.A.},\n doi = {10.1109/CCGrid.2015.86},\n booktitle = {Proceedings - 2015 IEEE/ACM 15th International Symposium on Cluster, Cloud, and Grid Computing, CCGrid 2015}\n}","author_short":["Chen, P.","Plale, B."],"biburl":"https://bibbase.org/service/mendeley/42d295c0-0737-38d6-8b43-508cab6ea85d","bibbaseid":"chen-plale-proverrsystemlevelstatisticalfaultdiagnosisusingdependencymodel-2015","role":"author","urls":{},"metadata":{"authorlinks":{}},"downloads":0},"search_terms":["proverr","system","level","statistical","fault","diagnosis","using","dependency","model","chen","plale"],"keywords":[],"authorIDs":[],"dataSources":["zgahneP4uAjKbudrQ","ya2CyA73rpZseyrZ8","2252seNhipfTmjEBQ"]}