A Computational Notebook Approach to Large-scale Text Analysis

A Computational Notebook Approach to Large-scale Text Analysis. Ruan, G., Gniady, T., Kloster, D., Wernert, E., & Tuna, E. In Proceedings of the Practice and Experience on Advanced Research Computing (PEARC '18), pages 1–8, 2018. ACM Press.

Website doi abstract bibtex

Large-scale text analysis algorithms are important to many fields as they interrogate reams of textual data to extract evidence, correlations, and trends not readily discoverable by a human reader. Unfortunately, there is often an expertise mismatch between computational researchers who have the technical and programming skills necessary to develop workflows at scale and domain scholars who have knowledge of the literary, historical, scientific, or social factors that can affect data as it is manipulated. Our work focuses on the use of scalable computational notebooks as a model to bridge the accessibility gap for domain scholars, putting the power of HPC resources directly in the hands of the researchers who have scholarly questions. The computational notebook approach offers many benefits, including: fine-grained control through modularized functions, interactive analysis that puts the "human in the loop", scalable analysis that leverages Spark-as-a-Service, and complexity hiding interfaces that minimize the need for HPC expertise. In addition, the notebook approach makes it easy to share, reproduce, and sustain research workflows. We illustrate the applicability of our approach with usage scenarios on HPC systems as well as within a restricted computing environment to access sensitive, in-copyright data, and demonstrate the usefulness of the notebook approach with three examples from three different domains and data sources. These sources include historical topic trends in ten thousand scientific articles, sentiment analysis of tweets, and literary analysis of the copyrighted works of Kurt Vonnegut using non-consumptive techniques.

@comment{
  Conference paper (PEARC '18). The citation key is copied from the entry's
  own citation_key field so the entry can be parsed and cited. Fields such as
  id, created, profile_id, read, starred, etc. are not standard BibTeX fields;
  they look like reference-manager export metadata (TODO confirm they are
  still needed) and are kept unchanged. Standard styles ignore unknown fields.
}
@inproceedings{Ruan:2018:CNA:3219104.3219153,
  author              = {Ruan, Guangchen and Gniady, Tassie and Kloster, David and Wernert, Eric and Tuna, Esen},
  title               = {A Computational Notebook Approach to Large-scale Text Analysis},
  booktitle           = {Proceedings of the Practice and Experience on Advanced Research Computing - PEARC '18},
  series              = {PEARC '18},
  year                = {2018},
  pages               = {1--8},
  publisher           = {ACM Press},
  address             = {New York, New York, USA},
  doi                 = {10.1145/3219104.3219153},
  websites            = {http://doi.acm.org/10.1145/3219104.3219153,http://dl.acm.org/citation.cfm?doid=3219104.3219153},
  keywords            = {HPC,Spark,computational notebook,interactive analysis,scalability,text analysis},
  abstract            = {Large-scale text analysis algorithms are important to many fields as they interrogate reams of textual data to extract evidence, correlations, and trends not readily discoverable by a human reader. Unfortunately, there is often an expertise mismatch between computational researchers who have the technical and programming skills necessary to develop workflows at scale and domain scholars who have knowledge of the literary, historical, scientific, or social factors that can affect data as it is manipulated. Our work focuses on the use of scalable computational notebooks as a model to bridge the accessibility gap for domain scholars, putting the power of HPC resources directly in the hands of the researchers who have scholarly questions. The computational notebook approach offers many benefits, including: fine-grained control through modularized functions, interactive analysis that puts the ``human in the loop'', scalable analysis that leverages Spark-as-a-Service, and complexity hiding interfaces that minimize the need for HPC expertise. In addition, the notebook approach makes it easy to share, reproduce, and sustain research workflows. We illustrate the applicability of our approach with usage scenarios on HPC systems as well as within a restricted computing environment to access sensitive, in-copyright data, and demonstrate the usefulness of the notebook approach with three examples from three different domains and data sources. These sources include historical topic trends in ten thousand scientific articles, sentiment analysis of tweets, and literary analysis of the copyrighted works of Kurt Vonnegut using non-consumptive techniques.},
  type                = {inproceedings},
  bibtype             = {inproceedings},
  source_type         = {inproceedings},
  citation_key        = {Ruan:2018:CNA:3219104.3219153},
  id                  = {eaa29482-e732-3483-96e7-45c85711cd2e},
  profile_id          = {42d295c0-0737-38d6-8b43-508cab6ea85d},
  created             = {2019-10-01T17:20:43.081Z},
  last_modified       = {2019-10-01T17:20:43.081Z},
  file_attached       = {false},
  read                = {false},
  starred             = {false},
  authored            = {true},
  confirmed           = {true},
  hidden              = {false},
  private_publication = {false},
}

Downloads: 0

{"_id":"eJCGmWTQShcEdNRwP","bibbaseid":"ruan-gniady-kloster-wernert-tuna-acomputationalnotebookapproachtolargescaletextanalysis-2018","authorIDs":[],"author_short":["Ruan, G.","Gniady, T.","Kloster, D.","Wernert, E.","Tuna, E."],"bibdata":{"title":"A Computational Notebook Approach to Large-scale Text Analysis","type":"inproceedings","year":"2018","keywords":"HPC,Spark,computational notebook,interactive analysis,scalability,text analysis","pages":"1-8","websites":"http://doi.acm.org/10.1145/3219104.3219153,http://dl.acm.org/citation.cfm?doid=3219104.3219153","publisher":"ACM Press","city":"New York, New York, USA","series":"PEARC '18","id":"eaa29482-e732-3483-96e7-45c85711cd2e","created":"2019-10-01T17:20:43.081Z","file_attached":false,"profile_id":"42d295c0-0737-38d6-8b43-508cab6ea85d","last_modified":"2019-10-01T17:20:43.081Z","read":false,"starred":false,"authored":"true","confirmed":"true","hidden":false,"citation_key":"Ruan:2018:CNA:3219104.3219153","source_type":"inproceedings","private_publication":false,"abstract":"Large-scale text analysis algorithms are important to many fields as they interrogate reams of textual data to extract evidence, correlations, and trends not readily discoverable by a human reader. Unfortunately, there is often an expertise mismatch between computational researchers who have the technical and programming skills necessary to develop workflows at scale and domain scholars who have knowledge of the literary, historical, scientific, or social factors that can affect data as it is manipulated. Our work focuses on the use of scalable computational notebooks as a model to bridge the accessibility gap for domain scholars, putting the power of HPC resources directly in the hands of the researchers who have scholarly questions. 
The computational notebook approach offers many benefits, including: fine-grained control through modularized functions, interactive analysis that puts the \"human in the loop\", scalable analysis that leverages Spark-as-a-Service, and complexity hiding interfaces that minimize the need for HPC expertise. In addition, the notebook approach makes it easy to share, reproduce, and sustain research workflows. We illustrate the applicability of our approach with usage scenarios on HPC systems as well as within a restricted computing environment to access sensitive, in-copyright data, and demonstrate the usefulness of the notebook approach with three examples from three different domains and data sources. These sources include historical topic trends in ten thousand scientific articles, sentiment analysis of tweets, and literary analysis of the copyrighted works of Kurt Vonnegut using non-consumptive techniques.","bibtype":"inproceedings","author":"Ruan, Guangchen and Gniady, Tassie and Kloster, David and Wernert, Eric and Tuna, Esen","doi":"10.1145/3219104.3219153","booktitle":"Proceedings of the Practice and Experience on Advanced Research Computing - PEARC '18","bibtex":"@inproceedings{\n title = {A Computational Notebook Approach to Large-scale Text Analysis},\n type = {inproceedings},\n year = {2018},\n keywords = {HPC,Spark,computational notebook,interactive analysis,scalability,text analysis},\n pages = {1-8},\n websites = {http://doi.acm.org/10.1145/3219104.3219153,http://dl.acm.org/citation.cfm?doid=3219104.3219153},\n publisher = {ACM Press},\n city = {New York, New York, USA},\n series = {PEARC '18},\n id = {eaa29482-e732-3483-96e7-45c85711cd2e},\n created = {2019-10-01T17:20:43.081Z},\n file_attached = {false},\n profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d},\n last_modified = {2019-10-01T17:20:43.081Z},\n read = {false},\n starred = {false},\n authored = {true},\n confirmed = {true},\n hidden = {false},\n citation_key = 
{Ruan:2018:CNA:3219104.3219153},\n source_type = {inproceedings},\n private_publication = {false},\n abstract = {Large-scale text analysis algorithms are important to many fields as they interrogate reams of textual data to extract evidence, correlations, and trends not readily discoverable by a human reader. Unfortunately, there is often an expertise mismatch between computational researchers who have the technical and programming skills necessary to develop workflows at scale and domain scholars who have knowledge of the literary, historical, scientific, or social factors that can affect data as it is manipulated. Our work focuses on the use of scalable computational notebooks as a model to bridge the accessibility gap for domain scholars, putting the power of HPC resources directly in the hands of the researchers who have scholarly questions. The computational notebook approach offers many benefits, including: fine-grained control through modularized functions, interactive analysis that puts the \"human in the loop\", scalable analysis that leverages Spark-as-a-Service, and complexity hiding interfaces that minimize the need for HPC expertise. In addition, the notebook approach makes it easy to share, reproduce, and sustain research workflows. We illustrate the applicability of our approach with usage scenarios on HPC systems as well as within a restricted computing environment to access sensitive, in-copyright data, and demonstrate the usefulness of the notebook approach with three examples from three different domains and data sources. 
These sources include historical topic trends in ten thousand scientific articles, sentiment analysis of tweets, and literary analysis of the copyrighted works of Kurt Vonnegut using non-consumptive techniques.},\n bibtype = {inproceedings},\n author = {Ruan, Guangchen and Gniady, Tassie and Kloster, David and Wernert, Eric and Tuna, Esen},\n doi = {10.1145/3219104.3219153},\n booktitle = {Proceedings of the Practice and Experience on Advanced Research Computing - PEARC '18}\n}","author_short":["Ruan, G.","Gniady, T.","Kloster, D.","Wernert, E.","Tuna, E."],"urls":{"Website":"http://doi.acm.org/10.1145/3219104.3219153,http://dl.acm.org/citation.cfm?doid=3219104.3219153"},"biburl":"https://bibbase.org/service/mendeley/42d295c0-0737-38d6-8b43-508cab6ea85d","bibbaseid":"ruan-gniady-kloster-wernert-tuna-acomputationalnotebookapproachtolargescaletextanalysis-2018","role":"author","keyword":["HPC","Spark","computational notebook","interactive analysis","scalability","text analysis"],"metadata":{"authorlinks":{}},"downloads":0},"bibtype":"inproceedings","creationDate":"2019-09-12T20:14:24.017Z","downloads":0,"keywords":["hpc","spark","computational notebook","interactive analysis","scalability","text analysis"],"search_terms":["computational","notebook","approach","large","scale","text","analysis","ruan","gniady","kloster","wernert","tuna"],"title":"A Computational Notebook Approach to Large-scale Text Analysis","year":2018,"biburl":"https://bibbase.org/service/mendeley/42d295c0-0737-38d6-8b43-508cab6ea85d","dataSources":["zgahneP4uAjKbudrQ","ya2CyA73rpZseyrZ8","2252seNhipfTmjEBQ"]}