Milieu: Lightweight and configurable big data provenance for science. Cheah, Y., Canon, R., Plale, B., & Ramakrishnan, L. In Proceedings - 2013 IEEE International Congress on Big Data, BigData 2013, 2013.
doi  abstract   bibtex   
The volume and complexity of data produced and analyzed in scientific collaborations is growing exponentially. It is important to track scientific data-intensive analysis workflows to provide context and reproducibility as data is transformed in these collaborations. Provenance addresses this need and aids scientists by providing the lineage or history of how data is generated, used and modified. Provenance has traditionally been collected at the workflow level often making it hard to capture relevant information about resource characteristics and is difficult for users to easily incorporate in existing workflows. In this paper, we describe Milieu, a framework focused on the collection of provenance for scientific experiments in High Performance Computing systems. Our approach collects provenance in a minimally intrusive way without significantly impacting the performance of the execution of scientific workflows. We also provide fidelity to our provenance collection by allowing users to specify three levels of provenance collection. We evaluate our framework on systems at the National Energy Research Scientific Computing Center (NERSC) and show that the overhead is less than the variation already experienced by these applications in these shared environments. © 2013 IEEE.
@comment{Conference paper entry. The citation key is taken from the entry's own citation_key field; fields after doi are reference-manager metadata that standard styles ignore.}
@inproceedings{Cheah2013,
 author              = {Cheah, Y.-W. and Canon, R. and Plale, B. and Ramakrishnan, L.},
 title               = {{Milieu}: Lightweight and configurable big data provenance for science},
 booktitle           = {Proceedings - 2013 {IEEE} International Congress on Big Data, BigData 2013},
 year                = {2013},
 doi                 = {10.1109/BigData.Congress.2013.16},
 abstract            = {The volume and complexity of data produced and analyzed in scientific collaborations is growing exponentially. It is important to track scientific data-intensive analysis workflows to provide context and reproducibility as data is transformed in these collaborations. Provenance addresses this need and aids scientists by providing the lineage or history of how data is generated, used and modified. Provenance has traditionally been collected at the workflow level often making it hard to capture relevant information about resource characteristics and is difficult for users to easily incorporate in existing workflows. In this paper, we describe Milieu, a framework focused on the collection of provenance for scientific experiments in High Performance Computing systems. Our approach collects provenance in a minimally intrusive way without significantly impacting the performance of the execution of scientific workflows. We also provide fidelity to our provenance collection by allowing users to specify three levels of provenance collection. We evaluate our framework on systems at the National Energy Research Scientific Computing Center (NERSC) and show that the overhead is less than the variation already experienced by these applications in these shared environments. © 2013 IEEE.},
 type                = {inproceedings},
 bibtype             = {inproceedings},
 citation_key        = {Cheah2013},
 id                  = {a77cbe47-f359-3e85-987b-c571e4971ab9},
 profile_id          = {42d295c0-0737-38d6-8b43-508cab6ea85d},
 folder_uuids        = {73f994b4-a3be-4035-a6dd-3802077ce863},
 created             = {2019-10-01T17:20:45.360Z},
 last_modified       = {2019-10-01T17:23:14.244Z},
 file_attached       = {false},
 read                = {false},
 starred             = {false},
 authored            = {true},
 confirmed           = {true},
 hidden              = {false},
 private_publication = {false},
}

Downloads: 0