ProvErr: System level statistical fault diagnosis using dependency model. Chen, P. & Plale, B. In Proceedings - 2015 IEEE/ACM 15th International Symposium on Cluster, Cloud, and Grid Computing, CCGrid 2015, 2015. doi abstract bibtex © 2015 IEEE. Large-scale distributed systems are difficult to debug in the event of failure. Yet rapid fault diagnosis that pinpoints failures to the component level is critical to fast recovery. We introduce a statistical approach to fault diagnosis that utilizes a dependency graph of execution to automatically discover the most probable fault cause(s) at a component level (either software or hardware resource). This approach leverages engineers' high level understanding of the system and requires a very small amount of information compared to existing methods. It also utilizes dependency information to eliminate redundant causes while retaining co-causes. Experiments using Apache Pig show that our approach has good, robust performance for diagnosing software bugs and resource shortages, and scales nearly linearly as system size increases.
@comment{
  Conference paper record. The citation key is taken from the entry's own
  citation_key field. Bibliographic fields come first; the remaining fields
  (id, created, profile_id, ...) are export metadata that standard
  bibliography styles ignore.
}
@inproceedings{Chen2015a,
  author = {Chen, P. and Plale, B. A.},
  title = {{ProvErr}: System level statistical fault diagnosis using dependency model},
  booktitle = {Proceedings - 2015 {IEEE/ACM} 15th International Symposium on Cluster, Cloud, and Grid Computing, {CCGrid} 2015},
  year = {2015},
  doi = {10.1109/CCGrid.2015.86},
  abstract = {© 2015 IEEE. Large-scale distributed systems are difficult to debug in the event of failure. Yet rapid fault diagnosis that pinpoints failures to the component level is critical to fast recovery. We introduce a statistical approach to fault diagnosis that utilizes a dependency graph of execution to automatically discover the most probable fault cause(s) at a component level (either software or hardware resource). This approach leverages engineers' high level understanding of the system and requires a very small amount of information compared to existing methods. It also utilizes dependency information to eliminate redundant causes while retaining co-causes. Experiments using Apache Pig show that our approach has good, robust performance for diagnosing software bugs and resource shortages, and scales nearly linearly as system size increases.},
  type = {inproceedings},
  bibtype = {inproceedings},
  citation_key = {Chen2015a},
  id = {d97bfbf5-2463-3752-9dd8-376e89c033c6},
  profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d},
  folder_uuids = {73f994b4-a3be-4035-a6dd-3802077ce863},
  created = {2019-10-01T17:20:59.446Z},
  last_modified = {2019-10-01T17:24:04.831Z},
  file_attached = {false},
  read = {false},
  starred = {false},
  authored = {true},
  confirmed = {true},
  hidden = {false},
  private_publication = {false}
}