Crossing analytics systems: A case for integrated provenance in data lakes. Suriarachchi, I. & Plale, B. A. In Proceedings of the 2016 IEEE 12th International Conference on e-Science, e-Science 2016, 2017.
doi  abstract   bibtex   
© 2016 IEEE. The volumes of data in Big Data, their variety and unstructured nature, have had researchers looking beyond the data warehouse. The data warehouse, among other features, requires mapping data to a schema upon ingest, an approach seen as inflexible for the massive variety of Big Data. The Data Lake is emerging as an alternate solution for storing data of widely divergent types and scales. Designed for high flexibility, the Data Lake follows a schema-on-read philosophy and data transformations are assumed to be performed within the Data Lake. During its lifecycle in a Data Lake, a data product may undergo numerous transformations performed by any number of Big Data processing engines leading to questions of traceability. In this paper we argue that provenance contributes to easier data management and traceability within a Data Lake infrastructure. We discuss the challenges in provenance integration in a Data Lake and propose a reference architecture to overcome the challenges. We evaluate our architecture through a prototype implementation built using our distributed provenance collection tools.
@comment{Conference paper entry. The citation key below is taken from this entry's own citation_key field; the non-standard fields (id, created, profile_id, ...) are reference-manager export metadata.}
@inproceedings{Suriarachchi2017,
 title = {Crossing analytics systems: A case for integrated provenance in data lakes},
 type = {inproceedings},
 year = {2017},
 id = {74645681-d99d-3b28-8d18-64c16a8cda43},
 created = {2018-03-05T18:20:23.177Z},
 file_attached = {false},
 profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d},
 group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643},
 last_modified = {2018-09-28T15:32:32.664Z},
 read = {true},
 starred = {false},
 authored = {false},
 confirmed = {true},
 hidden = {false},
 citation_key = {Suriarachchi2017},
 private_publication = {false},
 abstract = {© 2016 IEEE. The volumes of data in Big Data, their variety and unstructured nature, have had researchers looking beyond the data warehouse. The data warehouse, among other features, requires mapping data to a schema upon ingest, an approach seen as inflexible for the massive variety of Big Data. The Data Lake is emerging as an alternate solution for storing data of widely divergent types and scales. Designed for high flexibility, the Data Lake follows a schema-on-read philosophy and data transformations are assumed to be performed within the Data Lake. During its lifecycle in a Data Lake, a data product may undergo numerous transformations performed by any number of Big Data processing engines leading to questions of traceability. In this paper we argue that provenance contributes to easier data management and traceability within a Data Lake infrastructure. We discuss the challenges in provenance integration in a Data Lake and propose a reference architecture to overcome the challenges. We evaluate our architecture through a prototype implementation built using our distributed provenance collection tools.},
 bibtype = {inproceedings},
 author = {Suriarachchi, Isuru and Plale, Beth A.},
 doi = {10.1109/eScience.2016.7870919},
 booktitle = {Proceedings of the 2016 {IEEE} 12th International Conference on e-Science, e-Science 2016}
}

Downloads: 0