SamzaSQL: Scalable fast data management with streaming SQL. Pathirage, M., Hyde, J., Pan, Y., & Plale, B. In Proceedings - 2016 IEEE 30th International Parallel and Distributed Processing Symposium, IPDPS 2016, 2016.
doi  abstract   bibtex   
© 2016 IEEE. As the data-driven economy evolves, enterprises have come to realize a competitive advantage in being able to act on high volume, high velocity streams of data. Technologies such as distributed message queues and streaming processing platforms that can scale to thousands of data stream partitions on commodity hardware are a response. However, the programming API provided by these systems is often low-level, requiring substantial custom code that adds to the programmer learning curve and maintenance overhead. Additionally, these systems often lack SQL querying capabilities that have proven popular on Big Data systems like Hive, Impala or Presto. We define a minimal set of extensions to standard SQL for data stream querying and manipulation. These extensions are prototyped in SamzaSQL, a new tool for streaming SQL that compiles streaming SQL into physical plans that are executed on Samza, an open-source distributed stream processing framework. We compare the performance of streaming SQL queries against native Samza applications and discuss usability improvements. SamzaSQL is a part of the open source Apache Samza project and will be available for general use.
Conference paper record for SamzaSQL; the citation key is taken from the exporter's own citation_key field.
@inproceedings{Pathirage2016,
 author = {Pathirage, M. and Hyde, J. and Pan, Y. and Plale, B.},
 title = {{SamzaSQL}: Scalable Fast Data Management with Streaming {SQL}},
 booktitle = {Proceedings - 2016 {IEEE} 30th International Parallel and Distributed Processing Symposium, {IPDPS} 2016},
 year = {2016},
 doi = {10.1109/IPDPSW.2016.141},
 internal-note = {NOTE(review): the DOI prefix IPDPSW suggests the Workshops volume, while booktitle names the main symposium proceedings -- confirm against the publisher record},
 abstract = {© 2016 IEEE. As the data-driven economy evolves, enterprises have come to realize a competitive advantage in being able to act on high volume, high velocity streams of data. Technologies such as distributed message queues and streaming processing platforms that can scale to thousands of data stream partitions on commodity hardware are a response. However, the programming API provided by these systems is often low-level, requiring substantial custom code that adds to the programmer learning curve and maintenance overhead. Additionally, these systems often lack SQL querying capabilities that have proven popular on Big Data systems like Hive, Impala or Presto. We define a minimal set of extensions to standard SQL for data stream querying and manipulation. These extensions are prototyped in SamzaSQL, a new tool for streaming SQL that compiles streaming SQL into physical plans that are executed on Samza, an open-source distributed stream processing framework. We compare the performance of streaming SQL queries against native Samza applications and discuss usability improvements. SamzaSQL is a part of the open source Apache Samza project and will be available for general use.},
 type = {inproceedings},
 bibtype = {inproceedings},
 citation_key = {Pathirage2016},
 id = {3a755f75-a3e8-3cd5-b566-9337e3304e92},
 profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d},
 group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643},
 created = {2018-03-05T18:20:25.766Z},
 last_modified = {2018-03-05T18:20:25.766Z},
 file_attached = {false},
 read = {false},
 starred = {false},
 authored = {false},
 confirmed = {false},
 hidden = {false},
 private_publication = {false}
}

Downloads: 0