ProvErr: System level statistical fault diagnosis using dependency model. Chen, P. & Plale, B. In Proceedings - 2015 IEEE/ACM 15th International Symposium on Cluster, Cloud, and Grid Computing, CCGrid 2015, 2015.
doi  abstract   bibtex   
© 2015 IEEE. Large-scale distributed systems are difficult to debug in the event of failure. Yet rapid fault diagnosis that pinpoints failures to the component level is critical to fast recovery. We introduce a statistical approach to fault diagnosis that utilizes a dependency graph of execution to automatically discover the most probable fault cause(s) at a component level (either software or hardware resource). This approach leverages engineers' high level understanding of the system and requires a very small amount of information compared to existing methods. It also utilizes dependency information to eliminate redundant causes while retaining co-causes. Experiments using Apache Pig show that our approach has good, robust performance for diagnosing software bugs and resource shortages, and scales nearly linearly as system size increases.
@comment{Conference paper in the CCGrid 2015 proceedings. The fields id, created, file_attached, profile_id, last_modified, read, starred, authored, confirmed, hidden, citation_key, folder_uuids, private_publication and bibtype are not standard BibTeX fields; they appear to be reference-manager export metadata and are ignored by standard styles.}
@inproceedings{Chen2015a,
 title = {{ProvErr}: System level statistical fault diagnosis using dependency model},
 type = {inproceedings},
 year = {2015},
 id = {d97bfbf5-2463-3752-9dd8-376e89c033c6},
 created = {2019-10-01T17:20:59.446Z},
 file_attached = {false},
 profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d},
 last_modified = {2019-10-01T17:24:04.831Z},
 read = {false},
 starred = {false},
 authored = {true},
 confirmed = {true},
 hidden = {false},
 citation_key = {Chen2015a},
 folder_uuids = {73f994b4-a3be-4035-a6dd-3802077ce863},
 private_publication = {false},
 abstract = {© 2015 IEEE. Large-scale distributed systems are difficult to debug in the event of failure. Yet rapid fault diagnosis that pinpoints failures to the component level is critical to fast recovery. We introduce a statistical approach to fault diagnosis that utilizes a dependency graph of execution to automatically discover the most probable fault cause(s) at a component level (either software or hardware resource). This approach leverages engineers' high level understanding of the system and requires a very small amount of information compared to existing methods. It also utilizes dependency information to eliminate redundant causes while retaining co-causes. Experiments using Apache Pig show that our approach has good, robust performance for diagnosing software bugs and resource shortages, and scales nearly linearly as system size increases.},
 bibtype = {inproceedings},
 author = {Chen, P. and Plale, B. A.},
 doi = {10.1109/CCGrid.2015.86},
 booktitle = {Proceedings - 2015 IEEE/ACM 15th International Symposium on Cluster, Cloud, and Grid Computing, CCGrid 2015}
}

Downloads: 0