<script src="https://bibbase.org/service/mendeley/42d295c0-0737-38d6-8b43-508cab6ea85d/group/9d761a94-2f2d-31ce-a8c3-50aa6d668643?jsonp=1"></script>
<?php
// Fetch the server-rendered BibBase publication list for this Mendeley group
// and emit it inline in the page (the "PHP" embedding option from BibBase).
$url = "https://bibbase.org/service/mendeley/42d295c0-0737-38d6-8b43-508cab6ea85d/group/9d761a94-2f2d-31ce-a8c3-50aa6d668643";

// file_get_contents() returns false on failure (DNS error, timeout, non-2xx
// response without a stream context); the original printed the result
// unchecked, which silently renders an empty publication list on any
// transient error. Check explicitly before output.
$contents = file_get_contents($url);

if ($contents === false) {
    // Surface the failure to the page instead of emitting nothing.
    echo "Unable to retrieve the publication list from BibBase.";
} else {
    // echo, not print_r: the payload is a plain HTML string, and print_r
    // adds no value over echo for scalars.
    echo $contents;
}
?>
<iframe src="https://bibbase.org/service/mendeley/42d295c0-0737-38d6-8b43-508cab6ea85d/group/9d761a94-2f2d-31ce-a8c3-50aa6d668643"></iframe>
For more details see the documentation.
To the site owner:
Action required! Mendeley is changing its API. In order to keep using Mendeley with BibBase past April 14th, you need to:
@inproceedings{ title = {What college students say, and what they do: Aligning self-regulated learning theory with behavioral logs}, type = {inproceedings}, year = {2020}, keywords = {LMS,Self-regulated learning,Self-reports,Trace data}, pages = {534-543}, websites = {https://dl.acm.org/doi/10.1145/3375462.3375516}, month = {3}, publisher = {Association for Computing Machinery}, day = {23}, city = {New York, NY, USA}, id = {47098c0f-9073-3ee4-a0ec-143f3b0cbf13}, created = {2020-04-21T22:56:55.588Z}, accessed = {2020-04-21}, file_attached = {true}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2020-04-21T23:27:24.145Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, abstract = {A central concern in learning analytics specifically and educational research more generally is the alignment of robust, coherent measures to well-developed conceptual and theoretical frameworks. Capturing and representing processes of learning remains an ongoing challenge in all areas of educational inquiry and presents substantive considerations on the nature of learning, knowledge, and assessment & measurement that have been continuously refined in various areas of education and pedagogical practice. Learning analytics as a still developing method of inquiry has yet to substantively navigate the alignment of measurement, capture, and representation of learning to theoretical frameworks despite being used to identify various practical concerns such as at risk students. This study seeks to address these concerns by comparing behavioral measurements from learning management systems to established measurements of components of learning as understood through self-regulated learning frameworks. 
Using several prominent and robustly supported self-reported survey measures designed to identify dimensions of self-regulated learning, as well as typical behavioral features extracted from a learning management system, we conducted descriptive and exploratory analyses on the relational structures of these data. With the exception of learners' selfreported time management strategies and level of motivation, the current results indicate that behavioral measures were not well correlated with survey measurements. Possibilities and recommendations for learning analytics as measurements for selfregulated learning are discussed. © 2020 Association for Computing Machinery.}, bibtype = {inproceedings}, author = {Quick, Joshua and Motz, Benjamin and Israel, Jamie and Kaetzel, Jason}, doi = {10.1145/3375462.3375516}, booktitle = {International Conference on Learning Analytics & Knowledge} }
@article{ title = {Self‐regulated studying behavior, and the social norms that influence it}, type = {article}, year = {2020}, pages = {10-21}, volume = {50}, websites = {https://onlinelibrary.wiley.com/doi/abs/10.1111/jasp.12637}, month = {1}, publisher = {Blackwell Publishing Ltd}, day = {6}, id = {953fa787-5f08-3693-9d79-2c1c52477979}, created = {2020-04-21T23:02:33.535Z}, accessed = {2020-04-21}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2020-04-21T23:02:33.596Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, abstract = {Teachers commonly use injunctive norms when telling students what they should be doing. But researchers find that sometimes descriptive norms, information about what others are actually doing, are more powerful influencers of behavior. In the present work, we examine which norm is more effective at increasing self-regulated studying and performance in an online college course across two semesters. To do this, we randomly assigned 751 undergraduate Introductory Psychology students to receive email messages at the start of every content unit that either contained descriptive norms, injunctive norms, information about the course, or a no message control. We found that injunctive norms increased study behaviors aimed at fulfilling course requirements (completion of assigned activities), but did not improve learning outcomes. Descriptive norms increased behaviors aimed at improving knowledge (ungraded practice with activities after they were due), and improved performance. These results suggest that norms more effectively influence behavior when there is a match, or a sense of fit, between the goal of the behavior (fulfilling course requirements vs. learning) and the pull of a stated norm (social approval vs. efficacy). 
We discuss these implications with respect to students' motivations for self-regulated studying behavior in contemporary learning environments, and the overall goals of education.}, bibtype = {article}, author = {Eyink, Julie R. and Motz, Benjamin A. and Heltzel, Gordon and Liddell, Torrin M.}, doi = {10.1111/jasp.12637}, journal = {Journal of Applied Social Psychology}, number = {1} }
@article{ title = {Intelligent systems for geosciences: An essential research agenda}, type = {article}, year = {2019}, pages = {76-84}, volume = {62}, month = {1}, publisher = {Association for Computing Machinery}, day = {1}, id = {93528b3f-15e7-3a5d-8a59-0d56846f9a6f}, created = {2019-08-15T21:20:24.954Z}, accessed = {2019-08-15}, file_attached = {true}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2019-08-20T16:07:56.110Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {A research agenda for intelligent systems that will result in fundamental new capabilities for understanding the Earth system.}, bibtype = {article}, author = {Gil, Yolanda and Pierce, Suzanne A. and Babaie, Hassan and Banerjee, Arindam and Borne, Kirk and Bust, Gary and Cheatham, Michelle and Ebert-Uphoff, Imme and Gomes, Carla and Hill, Mary and Horel, John and Hsu, Leslie and Kinter, Jim and Knoblock, Craig and Krum, David and Kumar, Vipin and Lermusiaux, Pierre and Liu, Yan and North, Chris and Pankratius, Victor and Peters, Shanan and Plale, Beth and Pope, Allen and Ravela, Sai and Restrepo, Juan and Ridley, Aaron and Samet, Hanan and Shekhar, Shashi}, doi = {10.1145/3192335}, journal = {Communications of the ACM}, number = {1} }
@techreport{ title = {The Pervasive Technology Institute at 20: Two decades of success and counting}, type = {techreport}, year = {2019}, websites = {http://hdl.handle.net/2022/22607}, id = {2fbb25b6-79bb-32c5-bb8b-1314b428b250}, created = {2019-08-20T15:54:17.230Z}, accessed = {2019-08-20}, file_attached = {true}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2019-10-01T17:56:36.710Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, bibtype = {techreport}, author = {Stewart, Craig A. and Welch, Von and Doak, Thomas G. and Miller, Therese and Plale, Beth and Walsh, John A. and Link, Matthew R. and Snapp-Childs, Winona}, doi = {10.5967/QDF0-S837} }
@article{ title = {A High-Frequency Mobile Phone Data Collection Approach for Research in Social-Environmental Systems: Applications in Climate Variability and Food Security in Sub-Saharan Africa}, type = {article}, year = {2019}, websites = {https://www.sciencedirect.com/science/article/pii/S1364815218303207?via%3Dihub}, month = {5}, publisher = {Elsevier}, day = {20}, id = {1aa6346f-7f1d-36b1-a787-314f1b1b62ca}, created = {2019-08-27T21:24:05.621Z}, accessed = {2019-05-20}, file_attached = {false}, profile_id = {0523a5c3-9e2c-38fe-8400-1789f459ee03}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2019-08-27T21:24:05.621Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Collecting high-frequency social-environmental data about farming practices in sub-Saharan Africa can provide new insight into environmental changes that farmers face and how they respond within smallholder agro-ecosystems. Traditional data collection methods such as agricultural censuses are costly and not useful for understanding intra-annual and real-time decisions. Short-message service (SMS) has the potential to transform the nature of data collection in coupled social-ecological systems. We present a system for collecting, managing, and synthesizing weekly data from farmers, including data infrastructure for management of big and heterogeneous datasets; probabilistic data quality assessment tools; and visualization and analysis tools such as mapping and regression techniques. We discuss limitations of collecting social-environmental data via SMS and data integration challenges that arise when linking these data with other social and environmental data. In combination with high-frequency environmental data, such data will help ameliorate issues of scale mismatch and build resilience in environmental systems.}, bibtype = {article}, author = {Giroux, Stacey A. 
and Kouper, Inna and Estes, Lyndon D. and Schumacher, Jacob and Waldman, Kurt and Greenshields, Joel T. and Dickinson, Stephanie L. and Caylor, Kelly K. and Evans, Tom P.}, doi = {10.1016/J.ENVSOFT.2019.05.011}, journal = {Environmental Modelling & Software} }
@article{ title = {Safe Open Science for Restricted Data}, type = {article}, year = {2019}, pages = {50-60}, volume = {3}, websites = {https://content.sciendo.com/view/journals/dim/3/1/article-p50.xml}, id = {1861ec22-676e-3d54-9018-0e35e5014974}, created = {2019-08-27T21:24:05.622Z}, accessed = {2019-06-16}, file_attached = {false}, profile_id = {0523a5c3-9e2c-38fe-8400-1789f459ee03}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2019-08-27T21:24:05.622Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Plale, Beth A and Dickson, Eleanor and Kouper, Inna and Liyanage, Samitha and Ma, Yu and McDonald, Robert H. and Walsh, John A and Withana, Sachith}, doi = {10.2478/dim-2019-0005}, journal = {Data and Information Management}, number = {1} }
@article{ title = {ManyClasses 1: Assessing the generalizable effect of immediate versus delayed feedback across many college classes}, type = {article}, year = {2019}, keywords = {Cognitive Psychology,Educational Psychology,Social and Behavioral Sciences}, publisher = {PsyArXiv}, id = {b66b0972-9f47-3c29-be0c-7e6c2ede9f68}, created = {2020-04-21T23:07:22.691Z}, accessed = {2020-04-21}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2020-04-21T23:10:33.714Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, abstract = {Psychology researchers have long attempted to identify educational practices that improve student learning. However, experimental research on these practices is often conducted in laboratory contexts or in a single class, threatening the external validity of the results. In this paper, we establish an experimental paradigm for evaluating the benefits of recommended practices across a variety of authentic educational contexts – a model we call ManyClasses. The core feature is that researchers examine the same research question and measure the same experimental effect across many classes spanning a range of topics, institutions, teacher implementations, and student populations. We report the first ManyClasses study, which examined how the timing of feedback on class assignments, either immediate or delayed by a few days, affected subsequent performance on class assessments. Across XX classes, [summarize effect of feedback timing, including key moderators]. 
More broadly, these findings provide evidence regarding the feasibility of conducting within-class randomized experiments across a range of naturally occurring learning environments.}, bibtype = {article}, author = {Fyfe, Emily and Leeuw, Joshua de and Carvalho, Paulo and Goldstone, Robert and Motz, Benjamin}, doi = {10.31234/OSF.IO/4MVYH}, journal = {Advances in Methods and Practices in Psychological Science} }
@inproceedings{ title = {The validity and utility of activity logs as a measure of student engagement}, type = {inproceedings}, year = {2019}, keywords = {LMS,Student engagement,Trace data,Web logs}, pages = {300-309}, websites = {http://dl.acm.org/citation.cfm?doid=3303772.3303789}, month = {3}, publisher = {Association for Computing Machinery}, day = {4}, city = {New York, New York, USA}, id = {00e4a61a-7762-3028-be9d-43e0a415be9f}, created = {2020-04-21T23:21:05.782Z}, accessed = {2020-04-21}, file_attached = {true}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2020-04-21T23:25:35.319Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, abstract = {Learning management system (LMS) web logs provide granular, near-real-time records of student behavior as learners interact with online course materials in digital learning environments. However, it remains unclear whether LMS activity indeed reflects behavioral properties of student engagement, and it also remains unclear how to deal with variability in LMS usage across a diversity of courses. In this study, we evaluate whether instructors' subjective ratings of their students' engagement are related to features of LMS activity for 9,021 students enrolled in 473 for-credit courses. We find that estimators derived from LMS web logs are closely related to instructor ratings of engagement, however, we also observe that there is not a single generic relationship between activity and engagement, and what constitutes the behavioral components of “engagement” will be contingent on course structure. However, for many of these courses, modeled engagement scores are comparable to instructors' ratings in their sensitivity for predicting academic performance. 
As long as they are tuned to the differences between courses, activity indices from LMS web logs can provide a valid and useful proxy measure of student engagement.}, bibtype = {inproceedings}, author = {Motz, Benjamin and Quick, Joshua and Schroeder, Noah and Zook, Jordon and Gunkel, Matthew}, doi = {10.1145/3303772.3303789}, booktitle = {Proceedings of the 9th International Conference on Learning Analytics & Knowledge} }
@inproceedings{ title = {Transparency by Design in eScience Research}, type = {inproceedings}, year = {2019}, pages = {428-431}, month = {3}, publisher = {Institute of Electrical and Electronics Engineers (IEEE)}, day = {20}, id = {fabf7581-3c69-32b2-9ebd-734e492cd74e}, created = {2020-04-21T23:50:40.171Z}, accessed = {2020-04-21}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2020-04-21T23:52:20.401Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, abstract = {Both the landscape of eScience research and the environment in which the research is conducted are undergoing change. Transparency by design in eScience is proposed as a term to describe transparency in eScience practices, processes, methodologies, and research results. We break down different aspects of transparency and urge the eScience community towards a renewed commitment to scientific rigor because of the important role that we as scientists have to improve society and protect the good will that society has bestowed on science.}, bibtype = {inproceedings}, author = {Plale, Beth}, doi = {10.1109/escience.2019.00055}, booktitle = {International Conference on eScience (eScience)} }
@article{ title = {Rice Galaxy: an open resource for plant science}, type = {article}, year = {2019}, pages = {1-14}, volume = {8}, websites = {http://orcid.org/0000-0001-8512-144X}, id = {acb75a15-7240-389f-b686-cd37a7da90da}, created = {2020-04-22T00:00:15.272Z}, accessed = {2020-04-21}, file_attached = {true}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2020-04-22T00:00:15.388Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, abstract = {Background: Rice molecular genetics, breeding, genetic diversity, and allied research (such as rice-pathogen interaction) have adopted sequencing technologies and high-density genotyping platforms for genome variation analysis and gene discovery. Germplasm collections representing rice diversity, improved varieties, and elite breeding materials are accessible through rice gene banks for use in research and breeding, with many having genome sequences and high-density genotype data available. Combining phenotypic and genotypic information on these accessions enables genome-wide association analysis, which is driving quantitative trait loci discovery and molecular marker development. Comparative sequence analyses across quantitative trait loci regions facilitate the discovery of novel alleles. Analyses involving DNA sequences and large genotyping matrices for thousands of samples, however, pose a challenge to non−computer savvy rice researchers. Findings: The Rice Galaxy resource has shared datasets that include high-density genotypes from the 3,000 Rice Genomes project and sequences with corresponding annotations from 9 published rice genomes. 
The Rice Galaxy web server and deployment installer includes tools for designing single-nucleotide polymorphism assays, analyzing genome-wide association studies, population diversity, rice−bacterial pathogen diagnostics, and a suite of published genomic prediction methods.}, bibtype = {article}, author = {Juanillas, Venice and Dereeper, Alexis and Beaume, Nicolas and Droc, Gaetan and Dizon, Joshua and Mendoza, John Robert and Perdon, Jon Peter and Mansueto, Locedie and Triplett, Lindsay and Lang, Jillian and Zhou, Gabriel and Ratharanjan, Kunalan and Plale, Beth and Haga, Jason and Leach, Jan E and Ruiz, Manuel and Thomson, Michael and Alexandrov, Nickolai and Larmande, Pierre and Kretzschmar, Tobias and Mauleon, Ramil P}, doi = {10.1093/gigascience/giz028} }
@article{ title = {Cyberinfrastructure Requirements to Enhance Multi-messenger Astrophysics}, type = {article}, year = {2019}, websites = {http://arxiv.org/abs/1903.04590}, month = {3}, day = {11}, id = {59dc711a-b9b1-3470-bb15-03ad5f6706f8}, created = {2020-04-22T20:39:42.171Z}, accessed = {2020-04-22}, file_attached = {true}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2020-04-23T05:38:23.224Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, abstract = {The identification of the electromagnetic counterpart of the gravitational wave event, GW170817, and discovery of neutrinos and gamma-rays from TXS 0506+056 heralded the new era of multi-messenger astrophysics. As the number of multi-messenger events rapidly grow over the next decade, the cyberinfrastructure requirements to handle the increase in data rates, data volume, need for event follow up, and analysis across the different messengers will also explosively grow. The cyberinfrastructure requirements to enhance multi-messenger astrophysics will both be a major challenge and opportunity for astronomers, physicists, computer scientists and cyberinfrastructure specialists. Here we outline some of these requirements and argue for a distributed cyberinfrastructure institute for multi-messenger astrophysics to meet these challenges.}, bibtype = {article}, author = {Chang, Philip and Allen, Gabrielle and Anderson, Warren and Bianco, Federica B. and Bloom, Joshua S. and Brady, Patrick R. and Brazier, Adam and Cenko, S. Bradley and Couch, Sean M. and DeYoung, Tyce and Deelman, Ewa and Etienne, Zachariah B and Foley, Ryan J. and Fox, Derek B and Golkhou, V. Zach and Grant, Darren R and Hanna, Chad and Holley-Bockelmann, Kelly and Howell, D. Andrew and Huerta, E. A. and Johnson, Margaret W. G. and Juric, Mario and Kaplan, David L. and Katz, Daniel S. 
and Keivani, Azadeh and Kerzendorf, Wolfgang and Kopper, Claudio and Lam, Michael T. and Lehner, Luis and Marka, Zsuzsa and Marka, Szabolcs and Nabrzyski, Jarek and Narayan, Gautham and O'Shea, Brian W. and Petravick, Donald and Quick, Rob and Street, Rachel A. and Taboada, Ignacio and Timmes, Frank and Turk, Matthew J. and Weltman, Amanda and Zhang, Zhao} }
@article{ title = {Rice Galaxy: an open resource for plant science}, type = {article}, year = {2018}, pages = {358754}, publisher = {Cold Spring Harbor Laboratory}, id = {ca765817-74d1-3a6a-b96c-b6496f453f45}, created = {2018-07-12T19:57:02.259Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2019-08-27T21:24:06.225Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, source_type = {JOUR}, private_publication = {false}, bibtype = {article}, author = {Juanillas, Venice Margarette J and Dereeper, Alexis and Beaume, Nicolas and Droc, Gaetan and Dizon, Joshua and Mendoza, John Robert and Perdon, Jon Peter and Mansueto, Locedie and Triplett, Lindsay and Lang, Jillian}, journal = {bioRxiv} }
@inproceedings{ title = {Restricted data types used in secure computing environments}, type = {inproceedings}, year = {2018}, id = {e01ca5d0-5d7e-3f40-973b-18f40487f47d}, created = {2019-03-15T00:18:15.993Z}, file_attached = {false}, profile_id = {0523a5c3-9e2c-38fe-8400-1789f459ee03}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2019-08-27T21:24:06.208Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kouper2018a}, private_publication = {false}, bibtype = {inproceedings}, author = {Kouper, Inna and Mitchell, Erik}, booktitle = {HathiTrust Research Center UnCamp 2018} }
@inproceedings{ title = {Data Capsule Appliance for Restricted Data in Libraries}, type = {inproceedings}, year = {2018}, websites = {https://www.tacc.utexas.edu/documents/1084364/1627230/06_Plale-DC-IMLS-CMD18.pdf/792ac021-b8b8-432d-aceb-4a8d8c9a6dac}, city = {Fort Worth, TX}, id = {8f2c09b5-8ef3-388f-8741-cc78db0e8d1e}, created = {2019-03-15T00:18:15.995Z}, file_attached = {false}, profile_id = {0523a5c3-9e2c-38fe-8400-1789f459ee03}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2019-03-15T00:18:15.995Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Withana2018}, private_publication = {false}, bibtype = {inproceedings}, author = {Withana, Sachith and Kouper, Inna and Plale, Beth A.}, booktitle = {Workshop on Cyberinfrastructure and Machine Learning for Digital Libraries and Archives, in conjunction with Joint Conference on Digital Libraries 2018} }
@inproceedings{ title = {Narrative visualization}, type = {inproceedings}, year = {2018}, websites = {https://www.researchgate.net/publication/325271444_Narrative_Visualization}, city = {Ames, Iowa}, id = {f0812d45-3842-36d8-89cf-64bb445eeab0}, created = {2019-03-15T00:18:16.020Z}, file_attached = {false}, profile_id = {0523a5c3-9e2c-38fe-8400-1789f459ee03}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2019-03-15T00:18:16.020Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kouper2018}, private_publication = {false}, bibtype = {inproceedings}, author = {Kouper, Inna}, booktitle = {2018 Midwest Big Data Summer School} }
@inproceedings{ title = {Subject headings and beyond: Mapping the HathiTrust Digital Library content for wider use}, type = {inproceedings}, year = {2018}, websites = {https://osf.io/ak9u8/}, city = {Berkeley, CA}, id = {5b5cea19-2a01-3469-837c-670e2247d8e4}, created = {2019-03-15T00:18:16.025Z}, file_attached = {false}, profile_id = {0523a5c3-9e2c-38fe-8400-1789f459ee03}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2019-08-27T21:24:06.202Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Edelblute2018}, private_publication = {false}, bibtype = {inproceedings}, author = {Edelblute, Trevor and Zoss, Angela and Kouper, Inna}, booktitle = {HathiTrust Research Center UnCamp 2018} }
@inproceedings{ title = {Big provenance stream processing for data intensive computations}, type = {inproceedings}, year = {2018}, keywords = {Big Data,Big Provenance,Stream Processing}, pages = {245-255}, month = {12}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, day = {24}, id = {b4d438a8-4c7b-338f-a0f3-1d6fdb68e7c0}, created = {2020-04-21T23:45:55.171Z}, accessed = {2020-04-21}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2020-04-21T23:45:55.252Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, abstract = {In the business and research landscape of today, data analysis consumes public and proprietary data from numerous sources, and utilizes any one or more of popular data-parallel frameworks such as Hadoop, Spark and Flink. In the Data Lake setting these frameworks co-exist. Our earlier work has shown that data provenance in Data Lakes can aid with both traceability and management. The sheer volume of fine-grained provenance generated in a multi-framework application motivates the need for on-the-fly provenance processing. We introduce a new parallel stream processing algorithm that reduces fine-grained provenance while preserving backward and forward provenance. The algorithm is resilient to provenance events arriving out-of-order. It is evaluated using several strategies for partitioning a provenance stream. The evaluation shows that the parallel algorithm performs well in processing out-of-order provenance streams, with good scalability and accuracy.}, bibtype = {inproceedings}, author = {Suriarachchi, Isuru and Withana, Sachith and Plale, Beth}, doi = {10.1109/eScience.2018.00039}, booktitle = {Proceedings - IEEE 14th International Conference on eScience, e-Science 2018} }
@inproceedings{ title = {Provenance Enriched PID Kernel Information as OAI-ORE Map Replacement for SEAD Research Objects}, type = {inproceedings}, year = {2017}, id = {a9ebbfcc-90f9-3663-859d-014c25489efa}, created = {2018-03-05T18:20:21.288Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-06-26T14:25:20.226Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kouper2017}, private_publication = {false}, abstract = {© 2017 IEEE. PIDs and PID Kernel Information, activities of the Research Data Alliance, have the potential to expand the utility and benefit of data provenance. The poster describes such expansion and outlines a study of the trade-offs of replacing the Research Object (RO) and OAI-ORE map solution of the SEAD publishing services with the PID Kernel Information approach.}, bibtype = {inproceedings}, author = {Kouper, I. and Luo, Y. and Suriarachchi, I. and Plale, B.}, doi = {10.1109/JCDL.2017.7991612}, booktitle = {Proceedings of the ACM/IEEE Joint Conference on Digital Libraries} }
@inproceedings{ title = {Low Latency Stream Processing: Twitter Heron with Infiniband and Omni-Path}, type = {inproceedings}, year = {2017}, id = {f1799452-aee7-3afb-bada-e4c67d145d59}, created = {2018-03-05T18:20:21.443Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-06-26T14:25:20.563Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kamburugamuve2017a}, source_type = {JOUR}, private_publication = {false}, bibtype = {inproceedings}, author = {Kamburugamuve, Supun and Ramasamy, Karthik and Swany, Martin and Fox, Geoffrey}, booktitle = {Proceedings of Strata Data Conference} }
@article{ title = {Identification and characterization of information-networks in long-tail data collections}, type = {article}, year = {2017}, volume = {94}, id = {52d331a3-496c-3e6c-baac-42a4eef1ab4a}, created = {2018-03-05T18:20:21.706Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-06-26T14:25:20.443Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Elag2017}, private_publication = {false}, abstract = {© 2017 Elsevier Ltd Scientists' ability to synthesize and reuse long-tail scientific data lags far behind their ability to collect and produce these data. Many Earth Science Cyberinfrastructures enable sharing and publishing their data over the web using metadata standards. While profiling data attributes advances the Linked Data approach, it has become clear that building information-networks among distributed data silos is essential to increase their integration and reusability. In this research, we developed a Long-Tail Information-Network (LTIN) model, which uses a metadata-driven approach to build semantic information-networks among datasets published over the web and aggregate them around environmental events. The model identifies and characterizes the spatial and temporal contextual association links and dependencies among datasets. This paper presents the design and application of the LTIN model, and an evaluation of its performance. The model capabilities were demonstrated by inferring the information-network of a stream discharge located at the downstream end of the Illinois River.}, bibtype = {article}, author = {Elag, M.M. and Kumar, P. and Marini, L. and Myers, J.D. and Hedstrom, M. and Plale, B.A.}, doi = {10.1016/j.envsoft.2017.03.032}, journal = {Environmental Modelling and Software} }
@inproceedings{ title = {Towards Publishing Secure Capsule-Based Analysis}, type = {inproceedings}, year = {2017}, id = {f7a838d4-9a25-3f80-92d3-431c489c3f67}, created = {2018-03-05T18:20:22.620Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-06-26T14:25:20.747Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Murdock2017}, private_publication = {false}, abstract = {© 2017 IEEE. Computational engagement with the HathiTrust Digital Library (HTDL) is confounded by the in- copyright status and licensing restrictions on the majority of the content. Because of these limitations, computational analysis on the HTDL must either be carried out in a secure environment or on derivative datasets. The HathiTrust Research Center (HTRC) Data Capsule service provides researchers with a secure environment through which they invoke tools that create, analyze, and export non-consumptive datasets. These derivative datasets, so long as they do not reproduce the full-text of the original work, are a transformative work protected by Fair Use provisions of United States Copyright Law, and can be published for reuse by other researchers, as the HTRC Extracted Features Dataset has been. Secure environments and derivative datasets enable researchers to engage with restricted data from focused studies of a few dozen volumes to large- scale experiments on millions of volumes. This paper describes advances in the Capsule service through a case study of how the HTRC Data Capsule service has advanced our activities on provenance, workflows, worksets, and non-consumptive exports through a topic modeling example. We also discuss the potential applications of this Capsule-based model to other digital libraries wrestling with research access and copyright restrictions.}, bibtype = {inproceedings}, author = {Murdock, J. and Jett, J. 
and Cole, T. and Ma, Y. and Downie, J.S. and Plale, B.}, doi = {10.1109/JCDL.2017.7991585}, booktitle = {Proceedings of the ACM/IEEE Joint Conference on Digital Libraries} }
@inproceedings{ title = {A hybrid approach to population construction for agricultural agent-based simulation}, type = {inproceedings}, year = {2017}, id = {051ca9e3-6149-3cef-9552-b03091af5da6}, created = {2018-03-05T18:20:22.771Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-06-26T14:25:20.392Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Chen2017}, private_publication = {false}, abstract = {© 2016 IEEE. An Agent Based Model (ABM) is a powerful tool for its ability to represent heterogeneous agents which through their interactions can reveal emergent phenomena. For this to occur though, the set of agents in an ABM has to accurately model a real world population to reflect its heterogeneity. But when studying human behavior in less well developed settings, the availability of the real population data can be limited, making it impossible to create agents directly from the real population. In this paper, we propose a hybrid method to deal with this data scarcity: we first use the available real population data as the baseline to preserve the true heterogeneity, and fill in the missing characteristics based on survey and remote sensing datasets; then for the remaining undetermined agent characteristics, we use the Microbial Genetic Algorithm to search for a set of values that can optimize the replicative validity of the model to match data observed from real world. We apply our method to the creation of a synthetic population of household agents for the simulation of agricultural decision making processes in rural Zambia. 
The result shows that the synthetic population created from the farmer register can correctly reflect the marginal distributions and the randomness of survey data; and can minimize the difference between the distribution of simulated yield and that of the observed yield in Post Harvest Survey (PHS).}, bibtype = {inproceedings}, author = {Chen, P. and Evans, T. and Frisby, M. and Izquierdo, E. and Plale, B.}, doi = {10.1109/eScience.2016.7870914}, booktitle = {Proceedings of the 2016 IEEE 12th International Conference on e-Science, e-Science 2016} }
@inproceedings{ title = {Crossing analytics systems: A case for integrated provenance in data lakes}, type = {inproceedings}, year = {2017}, id = {74645681-d99d-3b28-8d18-64c16a8cda43}, created = {2018-03-05T18:20:23.177Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-09-28T15:32:32.664Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Suriarachchi2017}, private_publication = {false}, abstract = {© 2016 IEEE. The volumes of data in Big Data, their variety and unstructured nature, have had researchers looking beyond the data warehouse. The data warehouse, among other features, requires mapping data to a schema upon ingest, an approach seen as inflexible for the massive variety of Big Data. The Data Lake is emerging as an alternate solution for storing data of widely divergent types and scales. Designed for high flexibility, the Data Lake follows a schema-on-read philosophy and data transformations are assumed to be performed within the Data Lake. During its lifecycle in a Data Lake, a data product may undergo numerous transformations performed by any number of Big Data processing engines leading to questions of traceability. In this paper we argue that provenance contributes to easier data management and traceability within a Data Lake infrastructure. We discuss the challenges in provenance integration in a Data Lake and propose a reference architecture to overcome the challenges. We evaluate our architecture through a prototype implementation built using our distributed provenance collection tools.}, bibtype = {inproceedings}, author = {Suriarachchi, Isuru and Plale, Beth A.}, doi = {10.1109/eScience.2016.7870919}, booktitle = {Proceedings of the 2016 IEEE 12th International Conference on e-Science, e-Science 2016} }
@inbook{ type = {inbook}, year = {2017}, pages = {91-111}, publisher = {Elsevier}, id = {678babcd-3ca3-308b-a30d-e72dd58923d5}, created = {2018-03-05T18:20:23.440Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-06-26T14:25:20.776Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2017}, source_type = {CHAP}, private_publication = {false}, bibtype = {inbook}, author = {Plale, Beth and Kouper, Inna}, chapter = {The Centrality of Data: Data Lifecycle and Data Pipelines}, title = {Data Analytics for Intelligent Transportation Systems} }
@techreport{ title = {Indiana University Pervasive Technology Institute}, type = {techreport}, year = {2017}, city = {Bloomington, IN}, id = {05045f7d-6bbe-362b-b647-b12861e67840}, created = {2018-03-05T18:20:24.132Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-06-26T14:25:20.243Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Stewart2017a}, source_type = {misc}, private_publication = {false}, bibtype = {techreport}, author = {Stewart, C A and Welch, V and Plale, B and Fox, G and Pierce, M and Sterling, T}, doi = {10.5072/FK2154N14D} }
@inproceedings{ title = {Enhancing access to digital media: The language application grid in the HTRC Data Capsule}, type = {inproceedings}, year = {2017}, volume = {Part F1287}, id = {6a5a56bd-8c98-354e-9dec-4f5b5fea1d9c}, created = {2018-03-05T18:20:25.652Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-06-26T14:25:20.353Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Pustejovsky2017}, private_publication = {false}, abstract = {© 2017 held by the owner/author(s). The project "Workset Creation for Scholarly Analysis and Data Capsules" is building an infrastructure where researchers have access to text processing tools that can then be used on a copyrighted set of digital data. The infrastructure is built on (1) the HathiTrust Research Center (HTRC) Data Capsule services that can be used to access the HathiTrust Digital Library and (2) the language processing services of the Language Application (LAPPS Grid). The main thrust of the work presented here is the integration of the LAPPS Grid workflow infrastructure with the secure data access computing environment provided by the Data Capsules.}, bibtype = {inproceedings}, author = {Pustejovsky, J. and Verhagen, M. and Rim, K. and Ma, Y. and Ran, L. and Liyanage, S. and Murdock, J. and McDonald, R.H. and Plale, B.}, doi = {10.1145/3093338.3104171}, booktitle = {ACM International Conference Proceeding Series} }
@techreport{ title = {Pervasive Technology Institute Annual Report: Research Innovations and Advanced Cyberinfrastructure Services in Support of IU Strategic Goals During FY 2017}, type = {techreport}, year = {2017}, websites = {http://hdl.handle.net/2022/21809}, id = {b500ce80-bc36-39f9-a791-cc0d81cd7512}, created = {2018-03-05T18:20:25.928Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2020-09-10T00:01:46.564Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Stewart2017}, source_type = {RPRT}, private_publication = {false}, bibtype = {techreport}, author = {Stewart, Craig and Plale, Beth and Welch, Von and Pierce, Marlon and Fox, Geoffrey C and Doak, Thomas G and Hancock, David Y and Henschel, Robert and Link, Matthew R and Miller, Therese} }
@article{ title = {Offloading Collective Operations to Programmable Logic}, type = {article}, year = {2017}, volume = {37}, id = {1a0e7eb7-df4a-32d2-bfd9-618c224059fc}, created = {2018-03-05T18:20:26.238Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-06-26T14:25:20.461Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Arap2017}, private_publication = {false}, abstract = {© 1981-2012 IEEE. The authors describe their architecture and implementation for offloading collective operations to programmable logic in the communication substrate. Collective operations are widely used in parallel processing. Their design and implementation strategies affect the performance of many high-performance computing applications that utilize them. Collectives are central to the message passing interface (MPI) programming model. The programmable logic provided by field-programmable gate arrays (FPGAS) is a powerful option for creating task-specific logic to aid applications. The authors' approach is applicable in scenarios where there is programmable logic in the communication pipeline and can be used to accelerate various network-based operations. In this article, the authors present a general collective offloading framework for use in applications using MPI. They evaluate their approach on the Xilinx Zynq system on a chip and an FPGA-based network interface card called the NetFPGA. Results are presented both from microbenchmarks and a benchmark scientific application using MPI.}, bibtype = {article}, author = {Arap, O. and Brasilino, L.R.B. and Kissel, E. and Shroyer, A. and Swany, M.}, doi = {10.1109/MM.2017.3711654}, journal = {IEEE Micro}, number = {5} }
@inproceedings{ title = {Low latency stream processing: Apache Heron with Infiniband & Intel Omni-Path}, type = {inproceedings}, year = {2017}, id = {40409953-28cc-3f7f-a2ea-ece6db81668c}, created = {2018-03-05T18:20:27.217Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-06-26T14:25:20.603Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kamburugamuve2017}, source_type = {JOUR}, private_publication = {false}, bibtype = {inproceedings}, author = {Kamburugamuve, Supun and Ramasamy, Karthik and Swany, Martin and Fox, Geoffrey}, doi = {10.1145/3147213.3147232}, booktitle = {UCC} }
@inproceedings{ title = {OSiRIS: a distributed Ceph deployment using software defined networking for multi-institutional research}, type = {inproceedings}, year = {2017}, pages = {62045}, volume = {898}, issue = {6}, publisher = {IOP Publishing}, id = {ae70b543-7bab-3406-8764-a1ff9cc5911c}, created = {2018-03-05T18:20:27.431Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-06-26T14:25:20.563Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {McKee2017}, source_type = {CONF}, private_publication = {false}, bibtype = {inproceedings}, author = {McKee, Shawn and Kissel, Ezra and Meekhof, Benjeman and Swany, Martin and Miller, Charles and Gregorowicz, Michael}, booktitle = {Journal of Physics: Conference Series} }
@article{ title = {Pacific Rim Applications and Grid Middleware Assembly (PRAGMA): International clouds for data science}, type = {article}, year = {2017}, volume = {29}, id = {afa4a466-54fa-38d0-8ae3-f5c9d14cad0d}, created = {2018-03-05T18:20:28.058Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-06-26T14:25:20.348Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2017a}, private_publication = {false}, bibtype = {article}, author = {Plale, B. and Chen, M.}, doi = {10.1002/cpe.4140}, journal = {Concurrency Computation}, number = {13} }
@article{ title = {Mining lake time series using symbolic representation}, type = {article}, year = {2017}, volume = {39}, id = {2979cfed-1c18-3689-a692-f12e74e5a67c}, created = {2018-03-05T18:20:29.779Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T14:06:36.584Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Ruan2017}, private_publication = {false}, abstract = {© 2017 Elsevier B.V. Sensor networks deployed in lakes and reservoirs, when combined with simulation models and expert knowledge from the global community, are creating deeper understanding of the ecological dynamics of lakes. However, the amount of data and the complex patterns in the data demand substantial compute resources and efficient data mining algorithms, both of which are beyond the realm of traditional limnological research. This paper uniquely adapts methods from computer science for application to data intensive ecological questions, in order to provide ecologists with approachable methodology to facilitate knowledge discovery in lake ecology. We apply a state-of-the-art time series mining technique based on symbolic representation (SAX) to high-frequency time series of phycocyanin (PHYCO) and chlorophyll (CHLORO) fluorescence, both of which are indicators of algal biomass in lakes, as well as model predictions of algal biomass (MODEL). We use data mining techniques to demonstrate that MODEL predicts PHYCO better than it predicts CHLORO. All time series have high redundancy, resulting in a relatively small subset of unique patterns. However, MODEL is much less complex than either PHYCO or CHLORO and fails to reproduce high biomass periods indicative of algal blooms. 
We develop a set of tools in R to enable motif discovery and anomaly detection within a single lake time series, and relationship study among multiple lake time series through distance metrics, clustering and classification. Furthermore, to improve computation times, we provision web services to launch R tools remotely on high performance computing (HPC) resources. Comprehensive experimental results on observational and simulated lake data demonstrate the effectiveness of our approach.}, bibtype = {article}, author = {Ruan, G. and Hanson, P. C. and Dugan, H.A. and Plale, Beth A.}, doi = {10.1016/j.ecoinf.2017.03.001}, journal = {Ecological Informatics} }
@inproceedings{ title = {Data Capsule Appliance for Research Analysis of Restricted and Sensitive Data in Academic Libraries}, type = {inproceedings}, year = {2017}, websites = {https://www.cni.org/topics/special-collections/data-capsule-appliance-for-research-analysis-of-restricted-and-sensitive-data-in-academic-libraries}, city = {Washington, DC}, id = {888e07f2-4fef-39e9-89e9-9ed921ee5d01}, created = {2018-04-23T14:06:36.056Z}, accessed = {2018-01-12}, file_attached = {false}, profile_id = {0523a5c3-9e2c-38fe-8400-1789f459ee03}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-05-14T00:25:21.318Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {McDonald2017}, private_publication = {false}, bibtype = {inproceedings}, author = {McDonald, R. H. and Mitchell, Erik and Unsworth, John and Kouper, Inna}, booktitle = {Coalition of Networked Information Fall Meeting CNI-2017} }
@article{ title = {One network metric datastore to track them all: the OSG network metric service}, type = {article}, year = {2017}, pages = {082044}, volume = {898}, id = {1f859c27-e21c-3a09-b5b0-4033a5cf0d9e}, created = {2020-04-22T20:18:25.303Z}, accessed = {2020-04-22}, file_attached = {true}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2020-04-22T20:36:21.499Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, abstract = {The Open Science Grid (OSG) relies upon the network as a critical part of the distributed infrastructures it enables. In 2012, OSG added a new focus area in networking with a goal of becoming the primary source of network information for its members and collaborators. This includes gathering, organizing, and providing network metrics to guarantee effective network usage and prompt detection and resolution of any network issues, including connection failures, congestion, and traffic routing. In September of 2015, this service was deployed into the OSG production environment. We will report on the creation, implementation, testing, and deployment of the OSG Networking Service. Starting from organizing the deployment of perfSONAR toolkits within OSG and its partners, to the challenges of orchestrating regular testing between sites, to reliably gathering the resulting network metrics and making them available for users, virtual organizations, and higher level services, all aspects of implementation will be reviewed. In particular, several higher-level services were developed to bring the OSG network service to its full potential. 
These include a web-based mesh configuration system, which allows central scheduling and management of all the network tests performed by the instances; a set of probes to continually gather metrics from the remote instances and publish it to different sources; a central network datastore (esmond), which provides interfaces to access the network monitoring information in close to real time and historically (up to a year) giving the state of the tests; and a perfSONAR infrastructure monitor system, ensuring the current perfSONAR instances are correctly configured and operating as intended. We will also describe the challenges we encountered in ongoing operations of the network service and how we have evolved our procedures to address those challenges. Finally we will describe our plans for future extensions and improvements to the service.}, bibtype = {article}, author = {Quick, Robert and Babik, Marian and Fajardo, Edgar M and Gross, Kyle and Hayashi, Soichi and Krenz, Marina and Lee, Thomas and Mckee, Shawn and Pipes, Christopher and Teige, Scott}, doi = {10.1088/1742-6596/898/8/082044}, journal = {Journal of Physics: Conf. Series} }
@inproceedings{ title = {Network-managed virtual global address space for message-driven runtimes}, type = {inproceedings}, year = {2016}, id = {2505c12f-55fb-3e97-93c4-88c18b342f65}, created = {2018-03-05T18:20:21.642Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:21.642Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Kulkarni2016}, private_publication = {false}, abstract = {Copyright © 2016 by the Association for Computing Machinery, Inc. (ACM). Maintaining a scalable high-performance virtual global address space using distributed memory hardware has proven to be challenging. In this paper we evaluate a new approach for such an active global address space that leverages the capabilities of the network fabric to manage addressing, rather than software at the endpoint hosts. We describe our overall approach, design alternatives, and present initial experimental results that demonstrate the effectiveness and limitations of existing network hardware.}, bibtype = {inproceedings}, author = {Kulkarni, A. and Dalessandro, L. and Kissel, E. and Lumsdaine, A. and Sterling, T. and Swany, M.}, doi = {10.1145/2907294.2907320}, booktitle = {HPDC 2016 - Proceedings of the 25th ACM International Symposium on High-Performance Parallel and Distributed Computing} }
@inproceedings{ title = {A multi-tenant fair share approach to full-text search engine}, type = {inproceedings}, year = {2016}, pages = {45-50}, publisher = {IEEE Press}, id = {0acac974-ec92-3a8c-b36a-d40234025ac2}, created = {2018-03-05T18:20:21.918Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:21.918Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Peng2016}, source_type = {CONF}, private_publication = {false}, abstract = {© 2016 IEEE. Full text search engines underly the search of major content providers, Google, Bing and Yahoo. Open source search engines, such as Solr and ElasticSearch, are highly scalable and widely used in a Software-as-a-Service (SaaS) manner, in which multiple tenants share a single resource for improved resource utilization and lower management cost. Sharing of a full text search engine can exhibit unfairness in the form of performance interference. We propose a multi-tenancy solution that provides fair share of resource usage of a SaaS hosted search engine. It includes a revised deficit round robin technique for admission control, query resource usage estimation and a deadlock breaking mechanism. Experimental results show that our approach works well for both monolithic and distributed search engines.}, bibtype = {inproceedings}, author = {Peng, Zong and Plale, Beth}, doi = {10.1109/DataCloud.2016.010}, booktitle = {Proceedings of the 7th International Workshop on Data-Intensive Computing in the Cloud} }
@inproceedings{ title = {Rapid Monitoring of Drought Impacts on Small-Scale Farms in Africa through Integration of Farmer SMS data and Environmental Sensors}, type = {inproceedings}, year = {2016}, id = {57947046-75ba-3eb0-9efb-3efa6b530323}, created = {2018-03-05T18:20:22.029Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:22.029Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Evans2016}, source_type = {CONF}, private_publication = {false}, bibtype = {inproceedings}, author = {Evans, T P and Caylor, K K and Estes, L D and Plale, B A and Attari, S and Waldman, K}, booktitle = {AGU Fall Meeting Abstracts} }
@book{ title = {Provenance as essential infrastructure for Data Lakes}, type = {book}, year = {2016}, source = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, volume = {9672}, id = {524dcbbf-4028-34df-b095-7746557f8ace}, created = {2018-03-05T18:20:22.187Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:22.187Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Suriarachchi2016b}, private_publication = {false}, abstract = {© Springer International Publishing Switzerland 2016. The Data Lake is emerging as a Big Data storage and management solution which can store any type of data at scale and execute data transformations for analysis. Higher flexibility in storage increases the risk of Data Lakes becoming data swamps. In this paper we show how provenance contributes to data management within a Data Lake infrastructure. We study provenance integration challenges and propose a reference architecture for provenance usage in a Data Lake. Finally we discuss the applicability of our tools in the proposed architecture.}, bibtype = {book}, author = {Suriarachchi, I. and Plale, B.}, doi = {10.1007/978-3-319-40593-3_16} }
@techreport{ title = {Pervasive Technology Institute Annual Report: Research Innovations and Advanced Cyberinfrastructure Services in Support of IU Strategic Goals During FY 2016}, type = {techreport}, year = {2016}, id = {66b7318c-d8f6-3e27-b724-b025fceca8d1}, created = {2018-03-05T18:20:22.556Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:22.556Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Stewart2016}, source_type = {RPRT}, private_publication = {false}, bibtype = {techreport}, author = {Stewart, Craig A and Plale, Beth and Welch, Von and Fox, Geoffrey C and Link, Matthew R and Miller, Therese and Wernert, Eric A and Boyles, Michael J and Fulton, Ben and Hancock, David Y} }
@inproceedings{ title = {Offloading collective operations to programmable logic on a Zynq cluster}, type = {inproceedings}, year = {2016}, id = {4e217229-dbd2-33d9-9a40-702eeafc6cd5}, created = {2018-03-05T18:20:22.738Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:22.738Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Arap2016}, private_publication = {false}, abstract = {© 2016 IEEE. This paper describes our architecture and implementation for offloading collective operations to programmable logic in the communication substrate. Collective operations - operations that involve communication between groups of co-operating processes - are widely used in parallel processing. The design and implementation strategies of collective operations plays a significant role in their performance and thus affects the performance of many high performance computing applications that utilize them. Collectives are central to the widely used Message Passing Interface (MPI) programming model. The programmable logic provided by FPGAs is a powerful option for creating task-specific logic to aid applications. While our work is evaluated on the Xilinx Zynq SoC, it is generally applicable in scenarios where there is programmable logic in the communication pipeline, including FPGAs on network interface cards like the NetFPGA or new systems like Intel's Xeon with on-die Altera FPGA resources. In this paper we have adapted and generalized our previous work in offloading collective operations to the NetFPGA. Here we present a general collective offloading framework for use in applications using the Message Passing Interface (MPI). The implementation is realized on the Xilinx Zynq reference platform, the Zedboard, using an Ethernet daughter card called EthernetFMC. 
Results from microbenchmarks are presented as well as from some scientific applications using MPI.}, bibtype = {inproceedings}, author = {Arap, O. and Swany, M.}, doi = {10.1109/HOTI.2016.024}, booktitle = {Proceedings - 2016 IEEE 24th Annual Symposium on High-Performance Interconnects, HOTI 2016} }
@inproceedings{ title = {KVLight: A Lightweight Key-Value Store for Distributed Access in Cloud}, type = {inproceedings}, year = {2016}, id = {677447d3-18bc-30af-a0c1-187131b13418}, created = {2018-03-05T18:20:22.856Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:22.856Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Zeng2016}, private_publication = {false}, abstract = {© 2016 IEEE. Key-value stores (KVS) are finding use in Big Data applications as the store offers a flexible data model, scalability in number of distributed nodes, and high availability. In a cloud environment, a distributed KVS is often deployed over the local file system of the nodes in a cluster of virtual machines (VMs). Parallel file system (PFS) offers an alternate approach to disk storage, however a distributed key value store running over a parallel file system can experience overheads due to its unawareness of the PFS. Additionally, distributed KVS requires persistent running services which is not cost effective under the pay-as-you-go model of cloud computing because resources have to be held even under periods of no workload. We propose KVLight, a lightweight KVS that runs over PFS. It is lightweight in the sense that it shifts the responsibility of reliable data storage to the PFS and focuses on performance. Specifically, KVLight is built on an embedded KVS for high performance but uses novel data structures to support concurrent writes, giving capability that embedded KVSs are not currently designed for. Furthermore, it allows on-demand access without running persistent services in front of the file system. Empirical results show that KVLight outperforms Cassandra and Voldemort, two state-of-the-art KVSs, under both synthetic and realistic workloads.}, bibtype = {inproceedings}, author = {Zeng, J. 
and Plale, B.}, doi = {10.1109/CCGrid.2016.55}, booktitle = {Proceedings - 2016 16th IEEE/ACM International Symposium on Cluster, Cloud, and Grid Computing, CCGrid 2016} }
@inproceedings{ title = {Bongo: A BGP speaker built for defending against bad routes}, type = {inproceedings}, year = {2016}, id = {ef7c51da-99fc-3cc9-bddc-0015d0c7d891}, created = {2018-03-05T18:20:24.190Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:24.190Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Benton2016a}, private_publication = {false}, abstract = {© 2016 IEEE. Hijacks, outages, route leaks, and AS path spoofing are cases where network operators may want to influence the way routes are accepted and propagated from BGP neighbors in ways not supported by traditional BGP speakers. In this paper, we introduce Bongo, a software-based BGP speaker than can selectively filter out or extend the path of BGP updates received from other peers based on arbitrary operator-defined policies. Additionally, we show how the modularity of this system makes it easy to integrate with existing routers as well as other network devices such as OpenFlow switches or firewalls.}, bibtype = {inproceedings}, author = {Benton, K. and Camp, L.J. and Swany, M.}, doi = {10.1109/MILCOM.2016.7795416}, booktitle = {Proceedings - IEEE Military Communications Conference MILCOM} }
@inproceedings{ title = {Horme: Random access big data analytics}, type = {inproceedings}, year = {2016}, id = {42514b76-cb81-39ef-bf6d-bf3e950c6cdd}, created = {2018-03-05T18:20:25.036Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:25.036Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Ruan2016}, private_publication = {false}, abstract = {© 2016 IEEE. MapReduce is a parallel framework which has been widely adopted for conducting large-scale data analytics. In cases where analysis of multiple millions of books must be analyzed using federally funded high performance computing (HPC) resources, the framework fails to port directly. We propose a solution that builds off of MapReduce for use on a HPC system that preserves the key-value semantics of map-reduce while supporting the random access of query access for subsetting Big Data datasets, and at same time hosting the service using the storage medium found in HPC architectures (parallel file systems) for reduced latencies. Experimental results demonstrate Horme's good performance in the HPC setting, with up to 41.4% faster than NoSQL based solution in random access scenario.}, bibtype = {inproceedings}, author = {Ruan, G. and Plale, B.}, doi = {10.1109/CLUSTER.2016.27}, booktitle = {Proceedings - IEEE International Conference on Cluster Computing, ICCC} }
@inproceedings{ title = {SamzaSQL: Scalable fast data management with streaming SQL}, type = {inproceedings}, year = {2016}, id = {3a755f75-a3e8-3cd5-b566-9337e3304e92}, created = {2018-03-05T18:20:25.766Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:25.766Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Pathirage2016}, private_publication = {false}, abstract = {© 2016 IEEE. As the data-driven economy evolves, enterprises have come to realize a competitive advantage in being able to act on high volume, high velocity streams of data. Technologies such as distributed message queues and streaming processing platforms that can scale to thousands of data stream partitions on commodity hardware are a response. However, the programming API provided by these systems is often low-level, requiring substantial custom code that adds to the programmer learning curve and maintenance overhead. Additionally, these systems often lack SQL querying capabilities that have proven popular on Big Data systems like Hive, Impala or Presto. We define a minimal set of extensions to standard SQL for data stream querying and manipulation. These extensions are prototyped in SamzaSQL, a new tool for streaming SQL that compiles streaming SQL into physical plans that are executed on Samza, an open-source distributed stream processing framework. We compare the performance of streaming SQL queries against native Samza applications and discuss usability improvements. SamzaSQL is a part of the open source Apache Samza project and will be available for general use.}, bibtype = {inproceedings}, author = {Pathirage, M. and Hyde, J. and Pan, Y. and Plale, B.}, doi = {10.1109/IPDPSW.2016.141}, booktitle = {Proceedings - 2016 IEEE 30th International Parallel and Distributed Processing Symposium, IPDPS 2016} }
@article{ title = {Argus: A Multi-tenancy NoSQL store with workload-aware resource reservation}, type = {article}, year = {2016}, volume = {58}, id = {d5c15f8a-5860-3550-8842-24f76ecdd25a}, created = {2018-03-05T18:20:26.062Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:26.062Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Zeng2016a}, private_publication = {false}, abstract = {© 2016 Elsevier B.V. Multi-tenancy in cloud hosted NoSQL data stores is favored by cloud providers as it allows more effective resource sharing amongst different tenants thus lowering operating costs. A NoSQL provider will often present to each tenant a dedicated view of the store but then behind the scenes consolidate tenant access into a shared instance. This multi-tenancy approach with tenant data and workloads coexisting in the same infrastructure, under certain conditions can lead to performance degradation of one tenant caused by another as we show experimentally. This paper introduces Argus, a NoSQL store equipped with resource reservation to prevent performance interference across tenants in a multi-tenancy environment. Cache reservation is enforced through partitioning the cache space and disk reservation enforced through scheduling requests to a Distributed File System (DFS). We model the reservation on various workloads as a constrained optimization problem and use the stochastic hill climbing algorithm to find a near-optimum plan for different resource reservations. Empirical results show that Argus is able to prevent interference, adapt to dynamic workloads, and outperform A-Cache, another interference preventing NoSQL solution.}, bibtype = {article}, author = {Zeng, J. and Plale, B.}, doi = {10.1016/j.parco.2016.06.003}, journal = {Parallel Computing} }
@inproceedings{ title = {Photon: Remote memory access middleware for high-performance runtime systems}, type = {inproceedings}, year = {2016}, id = {47594965-f5e5-39bc-8cf7-de6d5395fdb8}, created = {2018-03-05T18:20:27.476Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:27.476Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Kissel2016}, private_publication = {false}, abstract = {© 2016 IEEE. We introduce the Photon RDMA middleware library that enables consistent remote memory access semantics over a number of network interconnect technologies. A primary goal of Photon is to expose a lightweight and flexible network abstraction that minimizes communication and message handling overheads for high-performance applications and runtime systems, in particular those that require the manipulation of objects within a global address space. Both one-sided and rendezvous communication models are supported and asynchronous network progress is exposed at a fine granularity. Photon implements a novel communication pattern called put-with-completion (PWC) that optimizes a completion notification path with variable size data for realizing active message-driven computation. The results of our performance evaluation show that our PWC model is comparable, and often improves upon, existing one-sided RDMA libraries in message latency and throughput metrics.}, bibtype = {inproceedings}, author = {Kissel, E. and Swany, M.}, doi = {10.1109/IPDPSW.2016.120}, booktitle = {Proceedings - 2016 IEEE 30th International Parallel and Distributed Processing Symposium, IPDPS 2016} }
@article{ title = {Filtering Source-Spoofed IP Traffic Using Feasible Path Reverse Path Forwarding with SDN}, type = {article}, year = {2016}, pages = {441}, volume = {5}, publisher = {IACSIT Press}, id = {b34d4b3b-dea0-322b-af2e-110ad9c6de6b}, created = {2018-03-05T18:20:27.480Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:27.480Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Benton2016}, source_type = {JOUR}, private_publication = {false}, bibtype = {article}, author = {Benton, Kevin and Camp, L Jean and Kelley, Tim and Swany, Martin}, journal = {International Journal of Computer and Communication Engineering}, number = {6} }
@article{ title = {Big Data at Scale for Digital Humanities: An Architecture for the}, type = {article}, year = {2016}, pages = {345}, publisher = {IGI Global}, id = {83240d35-dc57-3260-97c6-9b6e712d56be}, created = {2018-03-05T18:20:28.413Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:28.413Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kowalczyk2016}, source_type = {JOUR}, private_publication = {false}, bibtype = {article}, author = {Kowalczyk, Stacy T and Sun, Yiming and Peng, Zong and Plale, Beth and Willis, Craig and Zeng, Jiaan and Pathirage, Milinda and Liyanage, Samitha and Todd, Aaron and Ruan, Guangchen}, journal = {Big Data: Concepts, Methodologies, Tools, and Applications: Concepts, Methodologies, Tools, and Applications} }
@book{ title = {Analysis of memory constrained live provenance}, type = {book}, year = {2016}, source = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, volume = {9672}, id = {b5446728-4788-3362-a7bd-3fac51fd5e24}, created = {2018-03-05T18:20:28.577Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:28.577Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Chen2016}, private_publication = {false}, abstract = {© Springer International Publishing Switzerland 2016. We conjecture that meaningful analysis of large-scale provenance can be preserved by analyzing provenance data in limited memory while the data is still in motion; that the provenance needs not be fully resident before analysis can occur. As a proof of concept, this paper defines a stream model for reasoning about provenance data in motion for Big Data provenance.We propose a novel streaming algorithm for the backward provenance query, and apply it to the live provenance captured from agent-based simulations. The performance test demonstrates high throughput, low latency and good scalability, in a distributed stream processing framework built on Apache Kafka and Spark Streaming.}, bibtype = {book}, author = {Chen, P. and Evans, T. and Plale, B.}, doi = {10.1007/978-3-319-40593-3_4} }
@misc{ title = {Evaluating Collectives in Networks of Multicore/Two-level Reduction}, type = {misc}, year = {2016}, id = {f414f699-3935-33f7-bd40-7ad6e35cd14d}, created = {2018-03-05T18:20:28.916Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:28.916Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Wickramasinghe2016}, source_type = {JOUR}, private_publication = {false}, bibtype = {misc}, author = {Wickramasinghe, U S and D’Alessandro, Luke and Lumsdaine, Andrew and Kissel, Ezra and Swany, Martin and Newton, Ryan} }
@techreport{ title = {Provenance as Essential Infrastructure for Data Lakes [Preprint, forthcoming in IPAW 2016]}, type = {techreport}, year = {2016}, id = {8db5f1db-59c7-3c3b-88be-8a9efe54b396}, created = {2018-03-05T18:20:29.325Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:29.325Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Suriarachchi2016a}, source_type = {JOUR}, private_publication = {false}, bibtype = {techreport}, author = {Suriarachchi, Isuru and Plale, Beth} }
@inproceedings{ title = {Crossing analytics systems: A case for integrated provenance in data lakes}, type = {inproceedings}, year = {2016}, pages = {349-354}, publisher = {IEEE}, id = {38732ad8-66c3-3651-aea3-bbc7572c20e6}, created = {2018-03-05T18:20:29.392Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:29.392Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Suriarachchi2016}, source_type = {CONF}, private_publication = {false}, bibtype = {inproceedings}, author = {Suriarachchi, Isuru and Plale, Beth}, booktitle = {e-Science (e-Science), 2016 IEEE 12th International Conference on} }
@inproceedings{ title = {Crossing analytics systems: A case for integrated provenance in data lakes}, type = {inproceedings}, year = {2016}, pages = {349-354}, websites = {http://ieeexplore.ieee.org/document/7870919/}, month = {10}, publisher = {IEEE}, id = {5d762368-530a-3083-ba39-5345b9687db2}, created = {2018-09-05T14:13:46.408Z}, accessed = {2018-09-05}, file_attached = {false}, profile_id = {0523a5c3-9e2c-38fe-8400-1789f459ee03}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-09-05T14:13:46.408Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, bibtype = {inproceedings}, author = {Suriarachchi, Isuru and Plale, Beth}, doi = {10.1109/eScience.2016.7870919}, booktitle = {2016 IEEE 12th International Conference on e-Science (e-Science)} }
@techreport{ title = {The Data Capsule for Non-Consumptive Research}, type = {techreport}, year = {2015}, id = {7dc9eb5a-ebf1-3c27-a21b-611dd7cd4a54}, created = {2018-03-05T18:20:23.088Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:23.088Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2015}, source_type = {RPRT}, private_publication = {false}, bibtype = {techreport}, author = {Plale, Beth and Prakash, Atul and McDonald, Robert} }
@techreport{ title = {Software in Science: a Report of Outcomes of the 2014 National Science Foundation Software Infrastructure for Sustained Innovation (SI2) Meeting}, type = {techreport}, year = {2015}, id = {58e9a68f-bd23-3755-9bc4-5aeab41a4091}, created = {2018-03-05T18:20:23.200Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:23.200Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2015a}, source_type = {RPRT}, private_publication = {false}, bibtype = {techreport}, author = {Plale, Beth and Jones, Matt and Thain, Douglas} }
@article{ title = {Komadu: A capture and visualization system for scientific data provenance}, type = {article}, year = {2015}, volume = {3}, publisher = {Ubiquity Press}, id = {2e560715-b56b-3f1b-8b0a-86a24d41a8d6}, created = {2018-03-05T18:20:23.756Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:23.756Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Suriarachchi2015}, source_type = {JOUR}, private_publication = {false}, bibtype = {article}, author = {Suriarachchi, Isuru and Zhou, Quan and Plale, Beth}, journal = {Journal of Open Research Software}, number = {1} }
@inproceedings{ title = {Parallel and quantitative sequential pattern mining for large-scale interval-based temporal data}, type = {inproceedings}, year = {2015}, id = {d5f1bad9-17f5-3abf-a766-5148951a9db5}, created = {2018-03-05T18:20:23.969Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:23.969Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Ruan2015}, private_publication = {false}, abstract = {© 2014 IEEE. Mining frequent subsequences of patterns, or sequential pattern mining, has wide application in customer shopping sequence analysis, web log stream analysis, multi-modal behavioral studies, to name a few. To detect unknown, anomalous, and unexpected patterns from large-scale interval-based temporal data without complete a priori knowledge is challenging. In this paper, we present a framework - PESMiner which allows parallel and quantitative mining of sequential patterns at scale. Whereas most existing sequential mining algorithms can only find sequential orders of temporal events, our work presents a novel interactive temporal data mining algorithm capable of extracting precise temporal properties of sequential patterns. Furthermore, our work provides a unified parallel solution that scales our algorithms to larger temporal data sets by exploiting iterative MapReduce tasks. Comprehensive performance evaluations demonstrate that PESMiner significantly outperforms existing interval-based mining algorithms in terms of both quality (i.e. accuracy, precision, and recall) and scalability.}, bibtype = {inproceedings}, author = {Ruan, G. and Zhang, H. and Plale, B.}, doi = {10.1109/BigData.2014.7004410}, booktitle = {Proceedings - 2014 IEEE International Conference on Big Data, IEEE Big Data 2014} }
@inproceedings{ title = {Towards building a lightweight key-value store on parallel file system}, type = {inproceedings}, year = {2015}, volume = {2015-Octob}, id = {2ee4e722-34df-30ee-b5ad-859d150fc2f2}, created = {2018-03-05T18:20:24.174Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:24.174Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Zeng2015a}, private_publication = {false}, abstract = {© 2015 IEEE. As data grows in number and size, big data applications begin to revolutionize the underlying storage system. On one hand, key-value store has prevailed as the back-end storage for big data applications owing to its schema-less data model, high scalability, etc. On the other hand, parallel file system shared by multiple nodes offers large-capacity, high-throughput, as well as high-bandwidth access and is used widely in high performance computing (HPC) and cloud computing environments. In this paper, we explore the opportunity of building a lightweight key-value store that supports concurrent access over a parallel file system. The key-value store proposed relies on the sharing nature of parallel file system to provide distributed access. Instead of organizing a cluster of nodes with long running services to delegate the access, our key-value store simply embeds itself into applications and requires no long running services nor communication between nodes. Such a design not only simplifies the structure of a distributed key-value store but also avoids overhead introduced by having running services around the file system. We implemented a prototype of this system and compared it against Cassandra, a state-of-the-art key-value store. Preliminary results are promising.}, bibtype = {inproceedings}, author = {Zeng, J. 
and Plale, B.}, doi = {10.1109/CLUSTER.2015.100}, booktitle = {Proceedings - IEEE International Conference on Cluster Computing, ICCC} }
@inproceedings{ title = {Using phoebus data transfer accelerator in cloud environments}, type = {inproceedings}, year = {2015}, volume = {2015-Septe}, id = {6a6a479e-04a4-3150-91bc-ed4154bdf01f}, created = {2018-03-05T18:20:24.514Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:24.514Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Zhang2015a}, private_publication = {false}, abstract = {© 2015 IEEE. The quality of data exchange in cloud computing applications relies on the connection performance between user clients and their cloud storage providers, and is often dependent on the wide area network (WAN) properties among data centers. For certain classes of applications, it can be crucial to provide an end-to-end solution that accelerates large data transfers and improves overall user experience.}, bibtype = {inproceedings}, author = {Zhang, M. and Kissel, E. and Swany, M.}, doi = {10.1109/ICC.2015.7248346}, booktitle = {IEEE International Conference on Communications} }
@inproceedings{ title = {New Approaches to Capture High Frequency Agricultural Dynamics in Africa through Mobile Phones}, type = {inproceedings}, year = {2015}, id = {3bfdd183-14a8-3dd4-b5d1-a7d1b932d6cc}, created = {2018-03-05T18:20:24.608Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:24.608Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Evans2015}, source_type = {CONF}, private_publication = {false}, bibtype = {inproceedings}, author = {Evans, T P and Attari, S and Plale, B A and Caylor, K K and Estes, L D and Sheffield, J}, booktitle = {AGU Fall Meeting Abstracts} }
@inproceedings{ title = {Big data provenance analysis and visualization}, type = {inproceedings}, year = {2015}, id = {017b6c00-2a9a-38a1-949f-4c78c02be380}, created = {2018-03-05T18:20:24.680Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:24.680Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Chen2015}, private_publication = {false}, abstract = {© 2015 IEEE. Provenance captured from E-Science experimentation is often large and complex, for instance, from agent-based simulations that have tens of thousands of heterogeneous components interacting over extended time periods. The subject of study of my dissertation is the use of E-Science provenance at scale. My initial research studied the visualization of large provenance graphs and proposed an abstract representation of provenance that supports useful data mining. Recent work involves analyzing large provenance data generated from agent-based simulations on a single machine. In continuation, I propose stream processing techniques to support the continuous and real-time analysis of data provenance, which is captured from agent based simulations on HPC and thus has unprecedented volume and complexity.}, bibtype = {inproceedings}, author = {Chen, P. and Plale, B.A.}, doi = {10.1109/CCGrid.2015.85}, booktitle = {Proceedings - 2015 IEEE/ACM 15th International Symposium on Cluster, Cloud, and Grid Computing, CCGrid 2015} }
@inproceedings{ title = {ProvErr: System level statistical fault diagnosis using dependency model}, type = {inproceedings}, year = {2015}, id = {8ade6e5a-af5c-328c-8bbb-63ca79a3e3c1}, created = {2018-03-05T18:20:25.018Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:25.018Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Chen2015a}, private_publication = {false}, abstract = {© 2015 IEEE. Large-scale distributed systems are difficult to debug in the event of failure. Yet rapid fault diagnosis that pinpoints failures to the component level is critical to fast recovery. We introduce a statistical approach to fault diagnosis that utilizes a dependency graph of execution to automatically discover the most probable fault cause(s) at a component level (either software or hardware resource). This approach leverages engineers' high level understanding of the system and requires a very small amount of information compared to existing methods. It also utilizes dependency information to eliminate redundant causes while retaining co-causes. Experiments using Apache Pig show that our approach has good, robust performance for diagnosing software bugs and resource shortages, and scales nearly linearly as system size increases.}, bibtype = {inproceedings}, author = {Chen, P. and Plale, B.A.}, doi = {10.1109/CCGrid.2015.86}, booktitle = {Proceedings - 2015 IEEE/ACM 15th International Symposium on Cluster, Cloud, and Grid Computing, CCGrid 2015} }
@inproceedings{ title = {Trust threads: minimal provenance and data publication and reuse}, type = {inproceedings}, year = {2015}, publisher = {Colorado State University. Libraries}, id = {b679c212-7218-39ed-837c-5598026fd24c}, created = {2018-03-05T18:20:25.362Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:25.362Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2015b}, source_type = {CONF}, private_publication = {false}, bibtype = {inproceedings}, author = {Plale, Beth}, booktitle = {National Data Integrity Conference-2015} }
@inproceedings{ title = {HELM: Conflict-free active measurement scheduling for shared network resource management}, type = {inproceedings}, year = {2015}, id = {97d56a38-ebdf-367b-b3fa-17db2a0db130}, created = {2018-03-05T18:20:25.974Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:25.974Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Zhang2015}, private_publication = {false}, abstract = {© 2015 IEEE. Network resource measurement is a key functionality for large scale network management. Intelligent, network-aware applications may benefit from access to detailed representations of network resources, including multi-layer topologies and real-time traffic measurement, and shared resources may obtain better overall utilization by identifying performance bottlenecks. In this study, we describe a network measurement framework, which includes a network topology analysis mechanism as well as agent tools for running active probes and collecting data from end hosts. The system includes a centralized coordinator, which abstracts network elements into annotated network graphs and applies scheduling algorithms to calculate conflict free measurement probes over shared links. Our evaluation integrated perfSONAR services into our framework and included deployment scenarios on research and education networks such as Internet2 and ESnet. The data presented in this study offers compelling evidence that supports a method by which to measure the performance of real world networks.}, bibtype = {inproceedings}, author = {Zhang, M. and Swany, M. and Yavanamanda, A. and Kissel, E.}, doi = {10.1109/INM.2015.7140283}, booktitle = {Proceedings of the 2015 IFIP/IEEE International Symposium on Integrated Network Management, IM 2015} }
@inproceedings{ title = {Filtering IP source spoofing using feasible path reverse path forwarding with SDN}, type = {inproceedings}, year = {2015}, id = {06b63eda-4427-33e4-838d-f26b306e5539}, created = {2018-03-05T18:20:27.473Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:27.473Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Benton2015}, private_publication = {false}, abstract = {© 2015 IEEE. Source IP spoofing is still an endemic challenge despite best practices documents being published more than 13 years ago that would prevent it if all ISPs abided by them. We argue that these approaches failed to gain widespread adoption due to fundamental incentive misalignment. We then propose an SDN-based solution designed to be placed at Internet exchange points by ISPs with the incentives to filter spoofed traffic. This solution doesn't replace existing routers and it runs on low-cost, high-speed OpenFlow switches with graceful failure strategies to make adoption simple.}, bibtype = {inproceedings}, author = {Benton, K. and Camp, L.J. and Kelley, T. and Swany, M.}, doi = {10.1109/CNS.2015.7346909}, booktitle = {2015 IEEE Conference on Communications and Network Security, CNS 2015} }
@inproceedings{ title = {Adaptive Recursive Doubling Algorithm for Collective Communication}, type = {inproceedings}, year = {2015}, id = {351744a4-22e2-393b-8df9-49e8ef2c7bf6}, created = {2018-03-05T18:20:27.758Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:27.758Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Arap2015}, private_publication = {false}, abstract = {© 2015 IEEE. Process arrival times at MPI collective operations differ significantly. Addressing this fact with special handling for popular collective communication algorithms can yield performance improvements. The recursive doubling algorithm is one of the most efficient techniques for implementing collectives in MPI, especially for short messages and when the number of participating processes is a power of two. In the recursive doubling algorithm, all the processes must complete a given step before the algorithm continues to the next step. In this paper, we present a recursive doubling algorithm that makes use of available data and removes the requirement for each process to arrive at each step before proceeding. Our approach makes use of the multicast feature of the underlying network and progress tagging of messages, describing the currently available partial results. Our approach could be implemented in any parallel execution environment that supports multicasting. Our prototype implementation is based upon a network interface card with an FPGA, the Net FPGA. The Net FPGA provides hardware level programmability to offload processing, precise and controlled timing for accounting for packet and algorithm behavior, allowing classification of skew scenarios. 
Our algorithm provides up to 10% saving in synchronization delay in the presence of skew and up to 37% saving in number of messages generated, and up to 32% saving in reduction operations performed in MPI Allreduce.}, bibtype = {inproceedings}, author = {Arap, O. and Swany, M. and Brown, G. and Himebaugh, B.}, doi = {10.1109/IPDPSW.2015.82}, booktitle = {Proceedings - 2015 IEEE 29th International Parallel and Distributed Processing Symposium Workshops, IPDPSW 2015} }
@article{ title = {Research challenges in future multi-domain network performance measurement and monitoring}, type = {article}, year = {2015}, volume = {45}, id = {a9ae251e-f0fc-3267-b62f-552f7c9125e3}, created = {2018-03-05T18:20:27.860Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:27.860Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Calyam2015}, private_publication = {false}, abstract = {The perfSONAR-based Multi-domain Network Performance Measurement and Monitoring Workshop was held on February 20-21, 2014 in Arlington, VA. The goal of the workshop was to review the state of the perfSONAR effort and catalyze future directions by cross-fertilizing ideas, and distilling common themes among the diverse perfSONAR stakeholders that include: network operators and managers, end-users and network researchers. The timing and organization for the second workshop is significant because there are an increasing number of groups within NSF supported data-intensive computing and networking programs that are dealing with measurement, monitoring and troubleshooting of multi-domain issues. These groups are forming explicit measurement federations using perfSONAR to address a wide range of issues. In addition, the emergence and wide-adoption of new paradigms such as software-defined networking are taking shape to aid in traffic management needs of scientific communities and network operators. Consequently, there are new challenges that need to be addressed for extensible and programmable instrumentation, measurement data analysis, visualization and middleware security features in perfSONAR. 
This report summarizes the workshop efforts to bring together diverse groups for delivering targeted short/long talks, sharing latest advances, and identifying gaps that exist in the community for solving end-to-end performance problems in an effective, scalable fashion.}, bibtype = {article}, author = {Calyam, P. and Swany, M.}, journal = {Computer Communication Review}, number = {3} }
@book{ title = {Provenance and Annotation of Data and Processes: 5th International Provenance and Annotation Workshop, IPAW 2014, Cologne, Germany, June 9-13, 2014. Revised Selected Papers}, type = {book}, year = {2015}, volume = {8628}, publisher = {Springer}, id = {5700acd4-d1e2-3f87-94fe-7dcfca772551}, created = {2018-03-05T18:20:28.133Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:28.133Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Ludascher2015}, source_type = {BOOK}, private_publication = {false}, bibtype = {book}, author = {Ludäscher, Bertram and Plale, Beth} }
@book{ title = {Regenerating and quantifying quality of benchmarking data using static and dynamic provenance}, type = {book}, year = {2015}, source = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, volume = {8628}, id = {71e9072b-27f6-321b-9374-31ba9adb4be4}, created = {2018-03-05T18:20:28.549Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:28.549Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Ghoshal2015}, private_publication = {false}, abstract = {© Springer International Publishing Switzerland 2015. Application benchmarks are critical to establishing the performance of a new system or library. But benchmarking a system can be tricky and reproducing a benchmark result even trickier. Provenance can help. Referencing benchmarks and their results on similar platforms for collective comparison and evaluation requires capturing provenance related to the process of benchmark execution, programs involved and results generated. In this paper we define a formal model of benchmark applications and required provenance, describe an implementation of the model that employs compile time (static) and runtime provenance capture, and quantify data quality in the context of benchmarks. Our results show that through a mix of compile time and runtime provenance capture, we can enable higher quality benchmark regeneration.}, bibtype = {book}, author = {Ghoshal, D. and Chauhan, A. and Plale, B.}, doi = {10.1007/978-3-319-16462-55} }
@inproceedings{ title = {Workload-aware resource reservation for multi-tenant NoSQL}, type = {inproceedings}, year = {2015}, volume = {2015-Octob}, id = {2b7f90dc-045a-3448-9401-9d25824a59be}, created = {2018-03-05T18:20:28.623Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:28.623Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Zeng2015}, private_publication = {false}, abstract = {© 2015 IEEE. Cloud hosted NoSQL data stores are for economic reasons often shared amongst multiple tenants simultaneously. The NoSQL provider consolidates multiple tenants access into a shared NoSQL instance and provides a dedicated view for each tenant. This multi-tenancy has tenants' data and workloads coexisting in the same node, which under certain conditions can lead to performance degradation of one tenant caused by another. In this paper, we investigate the multi-tenant interference in a common NoSQL store, HBase, and propose a resource reservation framework that reserves resources for prevention and dynamically adjusts the reservations according to tenant resource demands. The framework enforces cache reservation by splitting the cache space and disk reservation by scheduling requests to a distributed file system (DFS). A stochastic hill climbing algorithm is used to find a near-optimum plan for different resources reservations. Empirical results show that the framework can prevent interference and adapt to dynamic workloads under multi-tenancy.}, bibtype = {inproceedings}, author = {Zeng, J. and Plale, B.}, doi = {10.1109/CLUSTER.2015.14}, booktitle = {Proceedings - IEEE International Conference on Cluster Computing, ICCC} }
@inproceedings{ title = {Towards sustainable curation and preservation: The SEAD project's data services approach}, type = {inproceedings}, year = {2015}, id = {7e7a1b91-889c-3665-a623-1d0107b7efae}, created = {2018-03-05T18:20:28.979Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:28.979Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Myers2015}, private_publication = {false}, abstract = {© 2015 IEEE. When the effort to curate and preserve data is made at the end of a project, there is little opportunity to leverage ongoing research work to reduce curation costs or conversely, to leverage curation efforts to improve research productivity. In the Sustainable Environment Actionable Data (SEAD) project, we have envisioned a more active approach to data curation and preservation in which these processes occur in parallel with research and generate sufficient short and long-term return on researcher investments for self-interest to drive their adoption. In this paper, we describe the conceptual framework motivating the SEAD project and the suite of data services we have developed and deployed as an initial implementation of this approach. Use cases in which these services can reduce curation effort and aid ongoing research are highlighted and, based on our experience to date, we identify some key architectural features of our approach as well as open challenges to fully realizing the value of this approach in the broad ecosystem of cyberinfrastructure.}, bibtype = {inproceedings}, author = {Myers, J. and Hedstrom, M. and Akmon, D. and Payette, S. and Plale, B.A. and Kouper, I. and McCaulay, S. and McDonald, R. and Suriarachchi, I. and Varadharaju, A. and Kumar, P. and Elag, M. and Lee, J. and Kooper, R. 
and Marini, L.}, doi = {10.1109/eScience.2015.56}, booktitle = {Proceedings - 11th IEEE International Conference on eScience, eScience 2015} }
@book{ title = {Preface}, type = {book}, year = {2015}, source = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, volume = {8628}, id = {8048536e-10ab-31ae-af52-c95a2ce688f0}, created = {2018-03-05T18:20:29.591Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:29.591Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Ludascher2015a}, private_publication = {false}, bibtype = {book}, author = {Ludäscher, B. and Plale, B.} }
@inproceedings{ title = {Building a Chemical-Protein Interactome on the Open Science Grid}, type = {inproceedings}, year = {2015}, pages = {15-20}, websites = {https://scitech.isi.edu/wordpress/wp-content/papercite-data/pdf/osg-splinter-2015.pdf}, id = {2b57c0cc-7f05-3dbc-952e-a021028f6e94}, created = {2020-04-22T20:56:46.608Z}, accessed = {2020-04-22}, file_attached = {true}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2020-04-22T20:56:46.684Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, abstract = {The Structural Protein-Ligand Interactome (SPLINTER) project predicts the interaction of thousands of small molecules with thousands of proteins. These interactions are predicted using the three-dimensional structure of the bound complex between each pair of protein and compound that is predicted by molecular docking. These docking runs consist of millions of individual short jobs each lasting only minutes. However, computing resources to execute these jobs (which cumulatively take tens of millions of CPU hours) are not readily or easily available in a cost effective manner. By looking to National Cyberinfrastructure resources, and specifically the Open Science Grid (OSG), we have been able to harness CPU power for researchers at the Indiana University School of Medicine to provide a quick and efficient solution to their unmet computing needs. Using the job submission infrastructure provided by the OSG, the docking data and simulation executable was sent to more than 100 universities and research centers worldwide. These opportunistic resources provided millions of CPU hours in a matter of days, greatly reducing docking simulation time for the research group. 
The overall impact of this approach allows researchers to identify small molecule candidates for individual proteins, or new protein targets for existing FDA-approved drugs and biologically active compounds.}, bibtype = {inproceedings}, author = {Quick, Rob and Hayashi, Soichi and Meroueh, Samy and Rynge, Mats and Teige, Scott and Wang, Bo and Xu, David and Sinica, Academia and Taipei, Taiwan}, doi = {https://doi.org/10.22323/1.239.0024}, booktitle = {International Symposium on Grids and Clouds (ISGC)} }
@techreport{ title = {Pervasive Technology Institute annual report: Research innovations and advanced cyberinfrastructure services in support of IU strategic goals during FY 2015}, type = {techreport}, year = {2015}, keywords = {CACR,D2I,DSC,NCGAS,PTI,RT,Technical Report,advanced cyberinfrastructure,engagement,outreach,research,storage,students}, websites = {http://hdl.handle.net/2022/20566}, id = {64f9bc12-50d8-3428-970c-41eaa6000531}, created = {2020-09-10T14:25:38.706Z}, accessed = {2020-09-10}, file_attached = {true}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2020-09-10T16:51:13.761Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, bibtype = {techreport}, author = {Stewart, Craig A.; Plale, Beth; Welch, Von; Link, Matthew R.; Miller, Therese; Wernert, Eric A.; Boyles, Michael J.; Fulton, Ben; Hancock, David Y.; Henschel, Robert; Michael, Scott A.; Pierce, Marlon; Ping, Robert J.; Gniady, Tassie; Fox, Geoffrey C.; Mi, Gary;} }
@techreport{ title = {Indiana University’s advanced cyberinfrastructure in service of IU strategic goals: Activities of the Research Technologies Division of UITS and National Center for Genome Analysis Support – two Pervasive Technology Institute cyberinfrastructure and servi}, type = {techreport}, year = {2015}, keywords = {ABITC,Clinical Affairs Schools,IUSM,NCGAS,PTI,advanced cyberinfrastructure,digital collections,engagement,health sciences,research,storage,students}, websites = {https://scholarworks.iu.edu/dspace/handle/2022/19805}, publisher = {Indiana University}, id = {86d2be61-fe4c-341a-8526-596ef8e9a373}, created = {2020-09-10T17:46:56.193Z}, accessed = {2020-09-10}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2020-09-10T17:46:56.193Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, bibtype = {techreport}, author = {Stewart, Craig A.; and Plale, Beth; and Welch, Von; and Fox, Geoffrey C.; and Link, Matthew R.; and Miller, Therese; and Wernert, Eric A.; and Boyles, Michael J.; and Fulton, Ben; and Hancock, David Y.; Henschel, Robert; and Michael, Scott A.; and Pierce, Marlon; and Ping, Robert J.; and Miksik, Gary; and Gniady, Tassie;} }
@inproceedings{ title = {Fast longest common subsequence with general integer scoring support on GPUs}, type = {inproceedings}, year = {2014}, id = {19807ad1-eaea-3f09-afce-20b81af5bf43}, created = {2018-03-05T18:20:21.387Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:21.387Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Ozsoy2014a}, private_publication = {false}, abstract = {Graphic Processing Units (GPUs) have been gaining popularity among high-performance users. Certain classes of algorithms benefit greatly from the massive parallelism of GPUs. One such class of algorithms is longest common subsequence (LCS). Combined with bit parallelism, recent studies have been able to achieve terascale performance for LCS on GPUs. However, the reported results for the one-to-many matching problem lack correlation with weighted scoring algorithms. In this paper, we describe a novel technique to improve the score significance of the length of LCS algorithm for multiple matching. We extend the bit-vector algorithms for LCS to include integer scoring and parallelize them for hybrid CPU-GPU platforms. We benchmark our algorithm against the well-known sequence alignment algorithm on GPUs, CUDASW++, for accuracy and report performance on three different systems.}, bibtype = {inproceedings}, author = {Ozsoy, A. and Chauhan, A. and Swany, M.}, doi = {10.1145/2560683.2560690}, booktitle = {Proceedings of the 2014 International Workshop on Programming Models and Applications for Multicores and Manycores, PMAM 2014} }
@inproceedings{ title = {HPDC 2014 chairs' message}, type = {inproceedings}, year = {2014}, id = {a45cd6df-160e-3027-b81c-32a8ce7c3e94}, created = {2018-03-05T18:20:21.508Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:21.508Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2014}, private_publication = {false}, bibtype = {inproceedings}, author = {Plale, B. and Cappello, F. and Ripeanu, M. and Xu, D.}, booktitle = {HPDC 2014 - Proceedings of the 23rd International Symposium on High-Performance Parallel and Distributed Computing} }
@inproceedings{ title = {Implementing mpi_barrier with the netfpga}, type = {inproceedings}, year = {2014}, pages = {1}, publisher = {The Steering Committee of The World Congress in Computer Science, Computer Engineering and Applied Computing (WorldComp)}, id = {ae0ef648-d77c-3531-b2bf-c91d48456a4f}, created = {2018-03-05T18:20:22.256Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:22.256Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Arap2014b}, source_type = {CONF}, private_publication = {false}, bibtype = {inproceedings}, author = {Arap, O and Brown, G and Himebaugh, B and Swany, M}, booktitle = {Proceedings of the International Conference on Parallel and Distributed Processing Techniques and Applications (PDPTA)} }
@inbook{ type = {inbook}, year = {2014}, pages = {632-643}, volume = {8632 LNCS}, websites = {https://doi.org/10.1007/978-3-319-09873-9_53}, publisher = {Springer International Publishing}, city = {Cham}, id = {6e20c6ab-9acb-3f39-87af-a1921d0eb79c}, created = {2018-03-05T18:20:22.438Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:22.438Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Arap2014}, source_type = {CHAP}, private_publication = {false}, abstract = {Collective operations play a key role in the performance of many high performance computing applications and are central to the widely used Message Passing Interface (MPI) programming model. In this paper we explore the use of programmable networking devices to accelerate the implementation of collective operations by offloading functionality to the underlying network. In our work we utilize a networked FPGA in conjunction with commercial OpenFlow switches supporting multicast. The union of hardware configurable network interfaces with Software Defined Networking (SDN) provides a significant opportunity to improve the performance of MPI applications that rely heavily on collective operations. The programmable interfaces implement collective operations in hardware using OpenFlow supported multicast. In our 8-node cluster, we observed up to 12% reduction in MPI_Allreduce latency in dynamic schemes employing SDN; and up to 22% reduction in static topologies. 
The results suggest more benefits if our approach is deployed in larger settings with low latency switches.}, bibtype = {inbook}, author = {Arap, Omer and Brown, Geoffrey and Himebaugh, Bryce and Swany, Martin}, editor = {Silva, Fernando and Dutra, Inês and Santos Costa, Vítor}, doi = {10.1007/978-3-319-09873-9_53}, chapter = {Software Defined Multicasting for MPI Collective Operation Offloading with the NetFPGA BT - Euro-Par 2014 Parallel Processing: 20th International Conference, Porto, Portugal, August 25-29, 2014. Proceedings}, title = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@inproceedings{ title = {Characterization of Emergent Data Networks Among Long-Tail Data}, type = {inproceedings}, year = {2014}, volume = {16}, id = {3b504950-eba6-3d6f-a506-3d5896534319}, created = {2018-03-05T18:20:22.874Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:22.874Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Elag2014}, source_type = {CONF}, private_publication = {false}, bibtype = {inproceedings}, author = {Elag, Mostafa and Kumar, Praveen and Hedstrom, Margaret and Myers, James and Plale, Beth and Marini, Luigi and McDonald, Robert}, booktitle = {EGU General Assembly Conference Abstracts} }
@inproceedings{ title = {Parallel and quantitative sequential pattern mining for large-scale interval-based temporal data}, type = {inproceedings}, year = {2014}, pages = {32-39}, publisher = {IEEE}, id = {9c8a1293-f263-333f-ad49-036856015391}, created = {2018-03-05T18:20:22.914Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:22.914Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Ruan2014}, source_type = {CONF}, private_publication = {false}, bibtype = {inproceedings}, author = {Ruan, Guangchen and Zhang, Hui and Plale, Beth}, booktitle = {Big Data (Big Data), 2014 IEEE International Conference on} }
@inproceedings{ title = {Regenerating and Quantifying Quality of Benchmarking Data Using Static and Dynamic Provenance}, type = {inproceedings}, year = {2014}, pages = {56-67}, publisher = {Springer}, id = {559f81b6-5618-3dc6-9291-c7ebad50f625}, created = {2018-03-05T18:20:23.242Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:23.242Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Ghoshal2014a}, source_type = {CONF}, private_publication = {false}, bibtype = {inproceedings}, author = {Ghoshal, Devarshi and Chauhan, Arun and Plale, Beth}, booktitle = {International Provenance and Annotation Workshop} }
@article{ title = {Hierarchical MapReduce: Towards simplified cross-domain data processing}, type = {article}, year = {2014}, volume = {26}, id = {204106c5-b757-3ad9-963a-d7d33fdded64}, created = {2018-03-05T18:20:23.756Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:23.756Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Luo2014}, private_publication = {false}, abstract = {The MapReduce programming model has proven useful for data-driven high throughput applications. However, the conventional MapReduce model limits itself to scheduling jobs within a single cluster. As job sizes become larger, single-cluster solutions grow increasingly inadequate. We present a hierarchical MapReduce framework that utilizes computation resources from multiple clusters simultaneously to run MapReduce job across them. The applications implemented in this framework adopt the Map-Reduce-GlobalReduce model where computations are expressed as three functions: Map, Reduce, and GlobalReduce. Two scheduling algorithms are proposed, one that targets compute-intensive jobs and another data-intensive jobs, evaluated using a life science application, AutoDock, and a simple Grep. Data management is explored through analysis of the Gfarm file system.Copyright © 2012 John Wiley & Sons, Ltd.}, bibtype = {article}, author = {Luo, Y. and Plale, B. and Guo, Z. and Li, W.W. and Qiu, J. and Sun, Y.}, doi = {10.1002/cpe.2929}, journal = {Concurrency Computation Practice and Experience}, number = {4} }
@inproceedings{ title = {Study in usefulness of middleware-only provenance}, type = {inproceedings}, year = {2014}, volume = {1}, id = {d2b65aa3-e1a5-3d25-98ea-527c1962c324}, created = {2018-03-05T18:20:24.315Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:24.315Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Zhou2014}, private_publication = {false}, abstract = {© 2014 IEEE. Data provenance is the lineage of a digital artifact or object. Its capture in workflow-controlled distributed applications is well studied but less is known about quality of provenance captured solely through existing control infrastructures (i.e., middleware frameworks used for high throughput computing). We study completeness of provenance in case where information is only available from the middleware layer. We use WorkQueue to validate our model. Our evaluation shows that provenance captured from a middleware framework is sufficient to represent the existence of output data and trace certain failures independent of the application semantics. We show the method's limitations as well.}, bibtype = {inproceedings}, author = {Zhou, Q. and Ghoshal, D. and Plale, B.}, doi = {10.1109/eScience.2014.49}, booktitle = {Proceedings - 2014 IEEE 10th International Conference on eScience, eScience 2014} }
@article{ title = {Author gender metadata augmentation of hathitrust digital library}, type = {article}, year = {2014}, volume = {51}, id = {ffd5aa08-4685-34c6-a713-af1323936f1d}, created = {2018-03-05T18:20:24.385Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:24.385Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Peng2014}, private_publication = {false}, abstract = {Bibliographic metadata is essential for digital library resource description. Especially as the size and number of bibliographic entities grows, high-quality metadata enables richer forms of digital library access, search, and use. Metadata records can be enriched through automated techniques. For example, a digital humanities scholar might use the gender of a set of authors during their literature analysis. In this study, we undertook to enrich the metadata description of a large-scale digital library, the HathiTrust (HT) digital library, specifically by determining the gender of authors of the public domain portion of the collection. The results are stored to a separate Solr index accessible through the HathiTrust Research Center services. This study, which successfully resolved in 78.9% of the cases the gender of authors in the HT public domain corpus, suggests future research directions in capturing and representing the provenance of the contributing sources to enhance trust, and in machine learning to resolve the remaining names.}, bibtype = {article}, author = {Peng, Z. and Chen, M. and Kowalczyk, S. and Plale, B.}, doi = {10.1002/meet.2014.14505101098}, journal = {Proceedings of the ASIST Annual Meeting}, number = {1} }
@article{ title = {Explicit semantic path mining via Wikipedia knowledge tree}, type = {article}, year = {2014}, volume = {51}, id = {218c1bee-fea9-3e85-8e93-2dc75bdaa9c6}, created = {2018-03-05T18:20:24.867Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:24.867Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Xia2014}, private_publication = {false}, abstract = {While classical bag-of-word (BoG) approaches represent text content in the word level, recent studies show that knowledge-based concept indexation is a promising approach to further enhance the text search and mining performance. In this study, we propose a new knowledge indexation/extraction method, Explicit Semantic Path Mining (ESPM), for knowledge-base text mining. It has roots in a concept-based vector constructing method, Explicit Semantic Analysis (ESA), which has shown success in text mining tasks. For this new method, given an input piece of text, ESPM can efficiently identify the independent and optimized semantic path(s) on a concept map, which is, in this study, the Wikipedia category tree. Unlike earlier studies focusing on BoG based vector space, ESPM is a semantic path mining algorithm, which generates the top down semantic categories of a given text by leveraging the rich link information between Wikipedia categories and articles. Preliminary experiment based on ODP data shows ESPM delivers high quality independent semantic paths from both precision and ranking viewpoints.}, bibtype = {article}, author = {Xia, T. and Chen, M. and Liu, X.}, doi = {10.1002/meet.2014.14505101160}, journal = {Proceedings of the ASIST Annual Meeting}, number = {1} }
@inproceedings{ title = {Cloud computing data capsules for non-consumptiveuse of texts}, type = {inproceedings}, year = {2014}, pages = {9-16}, publisher = {ACM}, id = {40fef2e4-a438-3a3a-9dc3-29f62df075ac}, created = {2018-03-05T18:20:24.917Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:24.917Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Zeng2014}, source_type = {CONF}, private_publication = {false}, abstract = {As digital data sources grow in number and size, they pose an opportunity for computational investigation by means of text mining, natural language processing (NLP), and other text analysis techniques. In this paper we propose a virtual machine (VM) framework and methodology for nonconsumptive text analysis. Using a remote VM model, the VM is configured with software and tooling for text analysis. When completed, the VM is wiped out and resources released for other users to share. Our approach extends the VM by turning it into a data capsules that prevents leakage of copyrighted content in the event that the VM is compromised. The HathiTrust Research Center Data Capsules has seen early use in application against the HathiTrust repository of digitized books from university libraries nationwide. Copyright 2014 ACM.}, bibtype = {inproceedings}, author = {Zeng, Jiaan and Ruan, Guangchen and Crowell, Alexander and Prakash, Atul and Plale, Beth}, doi = {10.1145/2608029.2608031}, booktitle = {Proceedings of the 5th ACM workshop on Scientific cloud computing} }
@inbook{ type = {inbook}, year = {2014}, pages = {257-276}, publisher = {Springer}, id = {f8d0c339-140b-3a8b-bae8-1dc8b00ece07}, created = {2018-03-05T18:20:25.345Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:25.345Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Chakraborty2014}, source_type = {CHAP}, private_publication = {false}, bibtype = {inbook}, author = {Chakraborty, Abhirup and Pathirage, Milinda and Suriarachchi, Isuru and Chandrasekar, Kavitha and Mattocks, Craig and Plale, Beth}, chapter = {Executing Storm Surge Ensembles on PAAS Cloud}, title = {Cloud Computing for Data-Intensive Applications} }
@article{ title = {Provenance quality assessment methodology and framework}, type = {article}, year = {2014}, volume = {5}, id = {c446ba71-80c2-3c79-8efb-62e07a9598df}, created = {2018-03-05T18:20:25.557Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:25.557Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Cheah2014}, private_publication = {false}, abstract = {© 2014 ACM. Data provenance, a form of metadata describing the life cycle of a data product, is crucial in the sharing of research data. Research data, when shared over decades, requires recipients to make a determination of both use and trust. That is, can they use the data? More importantly, can they trust it? Knowing the data are of high quality is one factor to establishing fitness for use and trust. Provenance can be used to assert the quality of the data, but the quality of the provenance must be known as well. We propose a framework for assessing the quality of data provenance. We identify quality issues in data provenance, establish key quality dimensions, and define a framework of analysis. We apply the analysis framework to synthetic and real-world provenance.}, bibtype = {article}, author = {Cheah, Y.-W. and Plale, B.}, doi = {10.1145/2665069}, journal = {Journal of Data and Information Quality}, number = {3} }
@book{ title = {Software Defined Multicasting for MPI Collective Operation Offloading with the NetFPGA}, type = {book}, year = {2014}, source = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, id = {34ffdd4d-9e22-37bc-af5b-0db1b3ed7e64}, created = {2018-03-05T18:20:26.352Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:26.352Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Arap2014c}, private_publication = {false}, bibtype = {book}, author = {Arap, O. and Brown, G. and Himebaugh, B. and Swany, M.}, doi = {10.1007/978-3-319-09873-9_53} }
@inproceedings{ title = {Semantic annotation with RescoredESA: Rescoring concept features generated from explicit semantic analysis}, type = {inproceedings}, year = {2014}, id = {de894e56-0847-3bd5-b069-e17b64ce9084}, created = {2018-03-05T18:20:26.568Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:26.568Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Jiang2014}, private_publication = {false}, abstract = {Copyright 2014 ACM. Concepts have been used extensively in semantic annotating. Explicit Semantic Analysis (ESA) is a concept feature generator, which represents text by a concept-level vector, such as a vector of Wikipedia concepts [3, 4, 8] . It is also considered a human-friendly way to annotate text - it generates concept vector that can be easily interpreted by human. We propose an approach, RescoredESA, based on ESA, according to aspects upon which ESA can enhance: 1) sometimes the output vectors do not assign high scores to concepts relevant to the text; 2) it considers words in the text when representing the text to concept-level vector while not considering the concepts explicitly occurring in the text, which can be an important source for assigning scores to ESA vector dimensions. We evaluate it against the 20 newsgroup classification task, and the result shows a slight enhancement when combining vectors from RescoredESA and bag-of-words.}, bibtype = {inproceedings}, author = {Jiang, Z. and Chen, M. and Liu, X.}, doi = {10.1145/2663712.2666192}, booktitle = {ESAIR 2014 - Proceedings of the 7th International Workshop on Exploiting Semantic Annotations in Information Retrieval, co-located with CIKM 2014} }
@article{ title = {Optimizing LZSS compression on GPGPUs}, type = {article}, year = {2014}, volume = {30}, id = {2a985da6-a7a7-31b8-b32e-ee2db1e31445}, created = {2018-03-05T18:20:26.828Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:26.828Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Ozsoy2014}, private_publication = {false}, abstract = {In this paper, we present an algorithm and provide design improvements needed to port the serial Lempel-Ziv-Storer-Szymanski (LZSS), lossless data compression algorithm, to a parallelized version suitable for general purpose graphic processor units (GPGPU), specifically for NVIDIA's CUDA Framework. The two main stages of the algorithm, substring matching and encoding, are studied in detail to fit into the GPU architecture. We conducted detailed analysis of our performance results and compared them to serial and parallel CPU implementations of LZSS algorithm. We also benchmarked our algorithm in comparison with well known, widely used programs: GZIP and ZLIB. We achieved up to 34× better throughput than the serial CPU implementation of LZSS algorithm and up to 2.21× better than the parallelized version. © 2013 Elsevier B.V. All rights reserved.}, bibtype = {article}, author = {Ozsoy, A. and Swany, M. and Chauhan, A.}, doi = {10.1016/j.future.2013.06.022}, journal = {Future Generation Computer Systems}, number = {1} }
@techreport{ title = {Indiana University Digitization Master Plan}, type = {techreport}, year = {2014}, id = {edb81629-035b-3cc1-9653-ec93adc3b6f2}, created = {2018-03-05T18:20:27.302Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:27.302Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Lewis2014}, source_type = {RPRT}, private_publication = {false}, bibtype = {techreport}, author = {Lewis, David and Plale, Beth} }
@techreport{ title = {Data Management Strategies for Scientific Applications in Cloud Environments}, type = {techreport}, year = {2014}, publisher = {Ernest Orlando Lawrence Berkeley National Laboratory, Berkeley, CA (US)}, id = {e58fb3a4-859f-3f21-94bd-851b7cf87313}, created = {2018-03-05T18:20:28.101Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:28.101Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Ghoshal2014}, source_type = {RPRT}, private_publication = {false}, bibtype = {techreport}, author = {Ghoshal, Devarshi and Hendrix, Valerie and Feller, Eugen and Morin, Christine and Plale, Beth and Ramakrishnan, Lavanya} }
@article{ title = {Synthesis of working group and interest group activity one year into the research data alliance}, type = {article}, year = {2014}, volume = {20}, id = {6acc4082-ba8e-377c-ba3f-ac6ebb319a1e}, created = {2018-03-05T18:20:28.386Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:28.386Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Plale2014a}, private_publication = {false}, abstract = {The Research Data Alliance (RDA) uses Working Groups and Interest Groups to carry out its work. Groups form when a concerned community develops around a topic for which there are well defined issues, common goals, and an opportunity to create a framework for timely action. One year in, RDA has 26 Working Groups and Interest Groups whose activities are focused on overcoming barriers to successful research data sharing, publishing, referencing and archiving, and on developing the infrastructure necessary to support those tasks. © 2014 Beth Plale.}, bibtype = {article}, author = {Plale, B.}, doi = {10.1045/january2014-plale}, journal = {D-Lib Magazine}, number = {1-2} }
@inproceedings{ title = {Multi-tenant fair share in NoSQL data stores}, type = {inproceedings}, year = {2014}, id = {b541ba4c-b2df-35fb-810f-65860170feeb}, created = {2018-03-05T18:20:29.424Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:29.424Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Zeng2014a}, private_publication = {false}, abstract = {© 2014 IEEE. NoSQL data stores see considerable attention today in big data, cloud hosted environments because of their fault tolerance, distribution and high availability. Shared NoSQL data stores are preferred for their ability to serve multiple tenants simultaneously which can improve resource utilization and lower management costs. Fair share in this setting can be a problem in that NoSQL data stores can be weak in preventing interference between tenants. We propose a methodology for multi-tenant fair share in a NoSQL store, in particular Cassandra. The approach uses an extended version of the deficit round robin algorithm to schedule tenant requests, and has local weight adjustment and slow tenant handling to improve the system throughput. Empirical results show that our approach is able to provide fair share for multi-tenancy.}, bibtype = {inproceedings}, author = {Zeng, J. and Plale, B.}, doi = {10.1109/CLUSTER.2014.6968761}, booktitle = {2014 IEEE International Conference on Cluster Computing, CLUSTER 2014} }
@article{ title = {Temporal representation for mining scientific data provenance}, type = {article}, year = {2014}, volume = {36}, id = {d0e2ed58-b203-30ff-932e-1c5b69714fd5}, created = {2018-03-05T18:20:29.451Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:29.451Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Chen2014}, private_publication = {false}, abstract = {Provenance of digital scientific data is a distinct piece of metadata about a data object. It can serve as a "ground-truth" for determining the cause of execution failure for instance, or can explain a particular result to a researcher intending to reuse a data object. Provenance can quickly grow voluminous and be quite feature rich, requiring new structure and concepts that support data mining. We propose a representation of data provenance using logical time that reduces the feature space of the provenance. The temporal representation supports clustering, classification and association rule mining. This paper studies the full utility of the temporal representation through an empirical evaluation and identification of the data mining algorithms that are most effective in application to the proposed representation. The evaluation is carried out against a multi-gigabyte semi-synthetic provenance dataset built from a range of scientific workflows, and against a real one month provenance dataset gathered from a satellite instrument. Through analysis of the results via clustering metrics - purity and Normalized Mutual Information (NMI), we determine that the k-means algorithm gives the best clustering with the proposed temporal representation, while still yielding provenance-useful information. © 2013 Elsevier B.V. All rights reserved.}, bibtype = {article}, author = {Chen, P. and Plale, B. 
and Aktas, M.S.}, doi = {10.1016/j.future.2013.09.032}, journal = {Future Generation Computer Systems} }
@techreport{ title = {2013 annual report on training, education, and outreach activities of the Indiana University Pervasive Technology Institute and affiliated organizations}, type = {techreport}, year = {2014}, keywords = {Technical Report}, websites = {http://hdl.handle.net/2022/17581}, id = {09830637-3262-3885-a2d1-746fe7c9f640}, created = {2020-09-10T21:24:49.259Z}, accessed = {2020-09-10}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2020-09-10T21:24:49.259Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {This report summarizes training, education, and outreach activities for calendar 2013 of PTI and affiliated organizations, including the School of Informatics and Computing, Office of the Vice President for Information Technology, and Maurer School of Law. Reported activities include those led by PTI Research Centers (Center for Applied Cybersecurity Research, Center for Research in Extreme Scale Technologies, Data to Insight Center, Digital Science Center) and Service and Cyberinfrastructure Centers (Research Technologies Division of University Information Technology Services, National Center for Genome Assembly Support)}, bibtype = {techreport}, author = {Ping, Robert J and Miller, Therese and Plale, Beth and Stewart, Craig} }
@inproceedings{ title = {Modeling heterogeneous data resources for social-ecological research: A data-centric perspective}, type = {inproceedings}, year = {2013}, id = {0248a976-3048-3f83-bfe0-ffd3c506f88b}, created = {2018-03-05T18:20:21.678Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:21.678Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Chen2013}, private_publication = {false}, abstract = {Digital repositories are grappling with an influx of scientific data brought about by the well publicized "data deluge" in science, business, and society. One particularly perplexing problem is the long-term archival and reuse of complex data sets. This paper presents an integrated approach to data discovery over heterogeneous data resources in social-ecological systems research. Social-ecological systems data is complex because the research draws from both social and natural sciences. Using a sample set of data resources from the domain, we explore an approach to discovery and representation of this data. Specifically, we develop an ontology-based process of organization and visualization from a data-centric perspective. We define data resources broadly and identify six key categories of resources that include data collected from site visits to shared ecological resources, the structure of research instruments, domain concepts, research designs, publications, theories and models. We identify the underlying relationships and construct an ontology that captures these relationships using semantic web languages. The ontology and a NoSQL data store at the back end store the data resource instances. 
These are integrated into a portal architecture we refer to as the Integrated Visualization of Social-Ecological Resources (IViSER) that allows users to both browse the relationships captured in the ontology and easily visualize the granular details of data resources. Copyright © 2013 by the Association for Computing Machinery, Inc. (ACM).}, bibtype = {inproceedings}, author = {Chen, M. and Pavalanathan, U. and Jensen, S. and Plale, B.}, doi = {10.1145/2467696.2467737}, booktitle = {Proceedings of the ACM/IEEE Joint Conference on Digital Libraries} }
@inproceedings{ title = {The SEAD datanet prototype: Data preservation services for sustainability science}, type = {inproceedings}, year = {2013}, id = {6ea0a15e-f6e1-35e8-bd92-3612f1b4bda5}, created = {2018-03-05T18:20:21.734Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:21.734Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Plale2013f}, private_publication = {false}, abstract = {In this poster we will present the SEAD project [1] and its prototype software and describe how SEAD approaches long-term data preservation and access through multiple partnerships and how it supports sustainability science researchers in their data management, analysis and archival needs. SEAD's initial prototype system currently is being tested by ingesting datasets from the National Center for Earth Surface Dynamics (1.6 terabyte of data containing over 450,000 files) [2] and packaging them for transmission to long-term archival storage. Copyright © 2013 by the Association for Computing Machinery, Inc. (ACM).}, bibtype = {inproceedings}, author = {Plale, B. and McDonald, R.H. and Chandrasekar, K. and Kouper, I. and Light, R. and Konkiel, S.R. and Hedstrom, M. and Myers, J. and Kumar, P.}, doi = {10.1145/2467696.2467762}, booktitle = {Proceedings of the ACM/IEEE Joint Conference on Digital Libraries} }
@inproceedings{ title = {Efficient wide area data transfer protocols for 100 Gbps networks and beyond}, type = {inproceedings}, year = {2013}, id = {42b9f560-d3d8-32e7-8d42-ebfd1a778771}, created = {2018-03-05T18:20:21.833Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:21.833Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Kissel2013}, private_publication = {false}, abstract = {Due to a number of recent technology developments, now is the right time to re-examine the use of TCP for very large data transfers. These developments include the deployment of 100 Gigabit per second (Gbps) network backbones, hosts that can easily manage 40 Gbps, and higher, data transfers, the Science DMZ model, the availability of virtual circuit technology, and wide-area Remote Direct Memory Access (RDMA) protocols. In this paper we show that RDMA works well over wide-area virtual circuits, and uses much less CPU than TCP or UDP. We also characterize the limitations of RDMA in the presence of other traffic, including competing RDMA flows. We conclude that RDMA for Science DMZ to Science DMZ transfers of massive data is a viable and desirable option for high-performance data transfer. Copyright 2013 ACM.}, bibtype = {inproceedings}, author = {Kissel, E. and Swany, M. and Tierney, B. and Pouyoul, E.}, doi = {10.1145/2534695.2534699}, booktitle = {Proc. of NDM 2013: 3rd Int. Workshop on Network-Aware Data Management - Held in Conjunction with SC 2013: The Int. Conference for High Performance Computing, Networking, Storage and Analysis} }
@inproceedings{ title = {Exploiting mapreduce and data compression for data-intensive applications}, type = {inproceedings}, year = {2013}, id = {4b4ccc43-833c-3276-842d-2511d0dd4291}, created = {2018-03-05T18:20:21.880Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:21.880Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Ruan2013}, private_publication = {false}, abstract = {HPC platform shows good success for predominantly computeintensive jobs, however, data intensive jobs still struggle on HPC platform as large amounts of concurrent data movement from I/O nodes to compute nodes can easily saturate the network links. MapReduce, the "moving computation to data" paradigm for many pleasingly parallel applications, assumes that data are resident on local disks and computation is scheduled where the data are located. However, on an HPC machine data must be staged from a broader file system (such as Luster), to HDFS where it can be accessed; this staging can represent a substantial delay in processing. In this paper we look at data compression's effect on reducing bandwidth needs of getting data to the application, as well as its impact on the overall performance of data-intensive applications. Our study examines two types of applications, a 3D-time series caries lesion assessment focusing on large scale medical image dataset, and a HTRC word counting task concerning large scale text analysis running on XSEDE resources. Our extensive experimental results demonstrate significant performance improvement in terms of storage space, data stage-in time, and job execution time. © 2013 by the Association for Computing Machinery, Inc.}, bibtype = {inproceedings}, author = {Ruan, G. and Zhang, H. 
and Plale, B.}, doi = {10.1145/2484762.2484785}, booktitle = {ACM International Conference Proceeding Series} }
@inproceedings{ title = {Provenance from log files: a BigData problem}, type = {inproceedings}, year = {2013}, keywords = {2013,pti}, pages = {290-297}, websites = {http://d2i.indiana.edu/pubs/provenance-log-files-bigdata-problem}, publisher = {ACM}, city = {Genoa, Italy}, id = {8c7dbff5-29b4-39eb-b431-0327f67c1e12}, created = {2018-03-05T18:20:22.049Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:22.049Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Ghoshal2013}, source_type = {proceedings}, private_publication = {false}, abstract = {As new data products of research increasingly become the product or output of complex processes, the lineage of the resulting products takes on greater importance as a description of the processes that contributed to the result. Without adequate description of data products, their reuse is lessened. The act of instrumenting an application for provenance capture is burdensome, however. This paper explores the option of deriving provenance from existing log files, an approach that reduces the instrumentation task substantially but raises questions about sifting through huge amounts of information for what may or may not be complete provenance. In this paper we study the tradeoff of ease of capture and provenance completeness, and show that under some circumstances capture through logs can result in high quality provenance.}, bibtype = {inproceedings}, author = {Ghoshal, Devarshi and Plale, Beth}, doi = {10.1145/2457317.2457366}, booktitle = {Proceedings of the Joint EDBT/ICDT 2013 Workshops} }
@inproceedings{ title = {DEM generation with SAR interferometry based on weighted wavelet phase unwrapping}, type = {inproceedings}, year = {2013}, id = {607a5233-4656-30f8-b4fe-7909216cc3f7}, created = {2018-03-05T18:20:22.231Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:22.231Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Rahnemoonfar2013}, private_publication = {false}, abstract = {Synthetic aperture radar Interferometry (InSAR) is a significant 3D imaging technique to generate a Digital Elevation Model (DEM). The phase difference between the complex SAR images displays an interference fringe pattern from which the elevation of any point in the imaged terrain can be determined. Phase unwrapping is the most critical step in the signal processing of InSAR and especially in DEM generation. In this paper, a least squares weighted wavelet technique is used which overcomes the problem of slow convergence and the less-accurate Gauss-Seidel method. Here, by decomposing a grid to low-frequency and high-frequency components, the problem for a low-frequency component is solved. The technique is applied to ENVISAT ASAR images of Bam area. The experimental results compared with the Statistical-Cost Network Flow approach and the DEM generated from a 1/25000 scale map of the area shows the effectiveness of the proposed method. © 2013 IEEE.}, bibtype = {inproceedings}, author = {Rahnemoonfar, M. and Plale, B.}, doi = {10.1109/COMGEO.2013.14}, booktitle = {Proceedings - 2013 4th International Conference on Computing for Geospatial Research and Application, COM.Geo 2013} }
@article{ title = {Provenance capture and use in a satellite data processing pipeline}, type = {article}, year = {2013}, volume = {51}, id = {bf81b9f4-fc99-30b7-9beb-240363c2edb9}, created = {2018-03-05T18:20:23.272Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:23.272Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Jensen2013}, private_publication = {false}, abstract = {With the interdependencies that exist between data in a scientific processing pipeline, the ability to track the provenance of the scientific process through multiple stages is necessary to determining the usability of the resulting data product. In this paper, we study the capture of provenance from an existing NASA instrument ingest pipeline. Since instrumenting the scientific code for a production system is not feasible, we show how provenance events can be scavenged from log files to generate detailed provenance graphs. Through extensions to the Karma provenance system, which have been implemented on a test instance of the AMSR-E production data pipeline, we determine that when the volume of provenance information is high, provenance graph visualizations provide a good tool for monitoring the ingest pipeline and identifying processing differences in ways not seen before. Two novel uses of provenance that we present in this paper are comparisons between processing runs and forward provenance for viewing downstream dependencies. © 1980-2012 IEEE.}, bibtype = {article}, author = {Jensen, S. and Plale, B. and Aktas, M.S. and Luo, Y. and Chen, P. and Conover, H.}, doi = {10.1109/TGRS.2013.2266929}, journal = {IEEE Transactions on Geoscience and Remote Sensing}, number = {11} }
@inproceedings{ title = {Message from the research track chairs}, type = {inproceedings}, year = {2013}, id = {c0e5d45f-dae5-3b0b-8154-17b1da03331a}, created = {2018-03-05T18:20:23.500Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:23.500Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Plale2013e}, private_publication = {false}, bibtype = {inproceedings}, author = {Plale, B. and Lyu, M.R. and Zhang, J.}, doi = {10.1109/ICWS.2013.6}, booktitle = {Proceedings - IEEE 20th International Conference on Web Services, ICWS 2013} }
@inproceedings{ title = {perfSONAR: On-board diagnostics for big data}, type = {inproceedings}, year = {2013}, id = {c015a463-2807-3e40-b866-3bb98da29586}, created = {2018-03-05T18:20:23.567Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:23.567Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Zurawski2013}, source_type = {CONF}, private_publication = {false}, bibtype = {inproceedings}, author = {Zurawski, J and Balasubramanian, S and Brown, A and Kissel, E and Lake, A and Swany, M and Tierney, B and Zekauskas, M}, booktitle = {IEEE International Conference on Big Data} }
@inproceedings{ title = {Static compiler analysis for workflow provenance}, type = {inproceedings}, year = {2013}, id = {a0ec3456-18c2-32f1-b103-10d7bd6cd8ad}, created = {2018-03-05T18:20:23.718Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:23.718Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Ghoshal2013a}, private_publication = {false}, abstract = {Data provenance is the lineage of an artifact or object. Provenance can provide a basis upon which data can be regenerated, and can be used to determine the quality of both the process and provenance itself. Provenance capture from workflows is comprised of capturing data dependencies as and when a workflow executes. We propose a layered provenance model which identifies and stores provenance at different granularities statically by analyzing the source code of programs. We use this model to capture provenance from both workflows and modules within workflows. This paper contributes a static compile time analysis methodology that includes a logical layered provenance model to convert workflow provenance from black box to white box, where the precise mapping between the inputs and outputs of a task can be known.}, bibtype = {inproceedings}, author = {Ghoshal, D. and Chauhan, A. and Plale, B.}, doi = {10.1145/2534248.2534250}, booktitle = {Proceedings of WORKS 2013: 8th Workshop on Workflows in Support of Large-Scale Science - Held in conjunction with SC 2013: The International Conference for High Performance Computing, Networking, Storage and Analysis} }
@article{ title = {SEAD virtual archive: Building a federation of institutional repositories for long-term data preservation in sustainability science}, type = {article}, year = {2013}, pages = {172-180}, volume = {8}, id = {c4f6c518-e33d-34ef-9aea-5ade0e91b6b6}, created = {2018-03-05T18:20:23.719Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:23.719Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2013d}, source_type = {JOUR}, private_publication = {false}, bibtype = {article}, author = {Plale, Beth and McDonald, Robert H and Chandrasekar, Kavitha and Kouper, Inna and Konkiel, Stacy and Hedstrom, Margaret L and Myers, James and Kumar, Praveen}, journal = {International Journal of Digital Curation}, number = {2} }
@book{ title = {Unmanaged workflows: Their provenance and use}, type = {book}, year = {2013}, source = {Studies in Computational Intelligence}, volume = {426}, id = {2023223d-f2ba-34c2-9ac6-f2912cdd6c7c}, created = {2018-03-05T18:20:24.386Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:24.386Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Aktas2013}, private_publication = {false}, abstract = {Provenance of scientific data will play an increasingly critical role as scientists are encouraged by funding agencies and grand challenge problems to share and preserve scientific data. But it is foolhardy to believe that all human processes, particularly as varied as the scientific discovery process, will be fully automated by a workflow system. Consequently, provenance capture has to be thought of as a problem applied to both human and automated processes. The unmanaged workflow is the full human-driven activity, encompassing tasks whose execution is automated by an orchestration tool, and tasks that are done outside an orchestration tool. In this chapter we discuss the implications of the unmanaged workflow as it affects provenance capture, representation, and use. Illustrations of capture include multiple experiences with unmanaged capture using the Karma tool. Illustrations of use include defining workflows by suggesting additions to workflow designs under construction, reconstructing process traces, and using analysis tools to assess provenance quality. © Springer-Verlag Berlin Heidelberg 2013.}, bibtype = {book}, author = {Aktas, M.S. and Plale, B. and Leake, D. and Mukhi, N.K.}, doi = {10.1007/978-3-642-29931-5-3} }
@inproceedings{ title = {HathiTrust research center: computational access for digital humanities and beyond}, type = {inproceedings}, year = {2013}, keywords = {2013,pti}, websites = {http://d2i.indiana.edu/node/16510}, publisher = {ACM}, city = {Indianapolis, IN}, id = {d723c93d-7b4a-3066-a639-4a86082f1b2f}, created = {2018-03-05T18:20:24.566Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:24.566Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2013}, source_type = {inproceedings}, notes = {<b>From Duplicate 2 (<i>HathiTrust Research Center: Computational Access for Digital Humanities and Beyond</i> - Plale, Beth; McDonald, Robert H; Sun, Yiming; Kouper, Inna; Cobine, Ryan; Downie, J Stephen; Namachchivaya, Beth S; Unsworth, John)<br/></b><br/>Poster}, private_publication = {false}, abstract = {Academic libraries are increasingly looking to provide services that allow their users to work with digital collections in innovative ways, for example, to analyze large volumes of digitized collections. The HathiTrust Research Center (HTRC) is a large collaborative that provides an innovative research infrastructure for dealing with massive amounts of digital texts. In this poster, we report on the technical progress of the HTRC as well as on the efforts to build a user community around our cyberinfrastructure. Copyright © 2013 by the Association for Computing Machinery, Inc. (ACM).}, bibtype = {inproceedings}, author = {Plale, Beth and McDonald, Robert H and Sun, Yiming and Kouper, Inna and Cobine, Ryan and Downie, J Stephen and Namachchivaya, Beth S and Unsworth, John}, doi = {10.1145/2467696.2467767}, booktitle = {Proceedings of the ACM/IEEE Joint Conference on Digital Libraries} }
@inproceedings{ title = {Changing the Curation Equation: A Data Lifecycle Approach to Lowering Costs and Increasing Value}, type = {inproceedings}, year = {2013}, id = {9dabccad-76c0-3756-9b20-e921512ce49c}, created = {2018-03-05T18:20:24.767Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:24.767Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Myers2013}, source_type = {CONF}, private_publication = {false}, bibtype = {inproceedings}, author = {Myers, J and Hedstrom, M and Plale, B A and Kumar, P and McDonald, R and Kooper, R and Marini, L and Kouper, I and Chandrasekar, K}, booktitle = {AGU Fall Meeting Abstracts} }
@article{ title = {Towards Tera-Scale Performance for Longest Common Subsequence Using Graphics Processor}, type = {article}, year = {2013}, id = {0912c85e-2760-3954-a9e4-f783a50921ed}, created = {2018-03-05T18:20:24.982Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:24.982Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Ozsoy2013a}, source_type = {JOUR}, private_publication = {false}, bibtype = {article}, author = {Ozsoy, Adnan and Chauhan, Arun and Swany, Martin}, journal = {IEEE Supercomputing (SC)} }
@inproceedings{ title = {Data Sets, Ensemble Cloud Computing, and the University Library}, type = {inproceedings}, year = {2013}, id = {ea5a28fb-f063-39e4-81b6-e5d3374d2cbc}, created = {2018-03-05T18:20:25.334Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:25.334Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2013a}, source_type = {CONF}, private_publication = {false}, bibtype = {inproceedings}, author = {Plale, B A}, booktitle = {AGU Fall Meeting Abstracts} }
@techreport{ title = {Big Data and HPC: Exploring Role of Research Data Alliance (RDA), a Report On Supercomputing 2013 Birds of a Feather}, type = {techreport}, year = {2013}, id = {f65a6c44-03ac-3626-8d63-56723c1589a3}, created = {2018-03-05T18:20:25.643Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:25.643Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2013b}, source_type = {RPRT}, private_publication = {false}, bibtype = {techreport}, author = {Plale, Beth} }
@inproceedings{ title = {Dependency provenance in agent based modeling}, type = {inproceedings}, year = {2013}, id = {4d089cca-c46c-3be7-afb6-f65a5b537b46}, created = {2018-03-05T18:20:25.736Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:25.736Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Chen2013a}, private_publication = {false}, abstract = {Researchers who use agent-based models (ABM) to model social patterns often focus on the model's aggregate phenomena. However, aggregation of individuals complicates the understanding of agent interactions and the uniqueness of individuals. We develop a method for tracing and capturing the provenance of individuals and their interactions in the NetLogo ABM, and from this create a "dependency provenance slice", which combines a data slice and a program slice to yield insights into the cause-effect relations among system behaviors. To cope with the large volume of fine-grained provenance traces, we propose use-inspired filters to reduce the amount of provenance, and a provenance slicing technique called "non-preprocessing provenance slicing" that directly queries over provenance traces without recovering all provenance entities and dependencies beforehand. We evaluate performance and utility using a well known ecological NetLogo model called "wolf-sheep- predation". Copyright © 2013 by The Institute of Electrical and Electronics Engineers, Inc.}, bibtype = {inproceedings}, author = {Chen, P. and Plale, B. and Evans, T.}, doi = {10.1109/eScience.2013.39}, booktitle = {Proceedings - IEEE 9th International Conference on e-Science, e-Science 2013} }
@inproceedings{ title = {Big Data Opportunities and Challenges for IR, Text Mining and NLP}, type = {inproceedings}, year = {2013}, keywords = {2013,pti}, volume = {Proceeding}, websites = {http://dl.acm.org/citation.cfm?id=2514739}, publisher = {ACM}, id = {6045a757-4dfc-309f-b837-35807c1b125e}, created = {2018-03-05T18:20:26.010Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:26.010Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2013g}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Plale, Beth}, doi = {10.1145/2513549.2514739}, booktitle = {UnstructureNLP '13 Proceedings of the 2013 international workshop on Mining unstructured big data using natural language processing} }
@inproceedings{ title = {Automatic performance evaluation of dewarping methods in large scale digitization of historical documents}, type = {inproceedings}, year = {2013}, id = {b02499c3-3c7a-3f79-b991-a7a12155f79b}, created = {2018-03-05T18:20:26.065Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:26.065Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Rahnemoonfar2013a}, private_publication = {false}, abstract = {Geometric distortions are among the major challenging issues in the analysis of historical document images. Such distortions appear as arbitrary warping, folds and page curl, and have detrimental effects upon recognition (OCR) and readability. While there are many dewarping techniques discussed in the literature, there exists no standard method by which their performance can be evaluated against each other. In particular, there is not any satisfactory method capable of comparing the results of existing dewarping techniques on arbitrary wrapped documents. The existing methods either rely on the visual comparison of the output and input images or depend on the recognition rate of an OCR system. In the case of historical documents, OCR either is not available or does not generate an acceptable result. In this paper, an objective and automatic evaluation methodology for document image dewarping technique is presented. In the first step, all the baselines in the original distorted image as well as dewarped image are modelled precisely and automatically. Then based on the mathematical function of each line, a comprehensive metric which calculates the performance of a dewarping technique is introduced. The presented method does not require user interference in any stage of evaluation and therefore is quite objective. 
Experimental results, applied to two state-of-the art dewarping methods and an industry-standard commercial system, demonstrate the effectiveness of the proposed dewarping evaluation method. Copyright © 2013 by the Association for Computing Machinery, Inc. (ACM).}, bibtype = {inproceedings}, author = {Rahnemoonfar, M. and Plale, B.}, doi = {10.1145/2467696.2467744}, booktitle = {Proceedings of the ACM/IEEE Joint Conference on Digital Libraries} }
@inproceedings{ title = {Standards for graph algorithm primitives}, type = {inproceedings}, year = {2013}, pages = {1-2}, publisher = {IEEE}, id = {2fa08f30-da73-32af-8211-d68b9009c2d0}, created = {2018-03-05T18:20:26.538Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:26.538Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Mattson2013}, source_type = {CONF}, private_publication = {false}, bibtype = {inproceedings}, author = {Mattson, Tim and Bader, David and Berry, Jon and Buluc, Aydin and Dongarra, Jack and Faloutsos, Christos and Feo, John and Gilbert, John and Gonzalez, Joseph and Hendrickson, Bruce}, booktitle = {High Performance Extreme Computing Conference (HPEC), 2013 IEEE} }
@book{ title = {Big data at scale for digital humanities: An architecture for the hathitrust research center}, type = {book}, year = {2013}, source = {Big Data Management, Technologies, and Applications}, id = {b8ae7658-50d5-3e32-974e-96ab17221737}, created = {2018-03-05T18:20:26.753Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:26.753Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Kowalczyk2013}, private_publication = {false}, abstract = {© 2014, IGI Global. All right reserved. Big Data in the humanities is a new phenomenon that is expected to revolutionize the process of humanities research. The HathiTrust Research Center (HTRC) is a cyberinfrastructure to support humanities research on big humanities data. The HathiTrust Research Center has been designed to make the technology serve the researcher to make the content easy to find, to make the research tools efficient and effective, to allow researchers to customize their environment, to allow researchers to combine their own data with that of the HTRC, and to allow researchers to contribute tools. The architecture has multiple layers of abstraction providing a secure, scalable, extendable, and generalizable interface for both human and computational users.}, bibtype = {book}, author = {Kowalczyk, S.T. and Sun, Y. and Peng, Z. and Plale, B. and Todd, A. and Auvil, L. and Willis, C. and Zeng, J. and Pathirage, M. and Liyanage, S. and Ruan, G. and Stephen Downie, J.}, doi = {10.4018/978-1-4666-4699-5.ch011} }
@inproceedings{ title = {Data pipeline in mapreduce}, type = {inproceedings}, year = {2013}, id = {336487ea-b599-3c67-b228-829599138a76}, created = {2018-03-05T18:20:26.921Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:26.921Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Zeng2013}, private_publication = {false}, abstract = {MapReduce is an effective programming model for large scale text and data analysis. Traditional MapReduce implementation, e.g., Hadoop, has the restriction that before any analysis can take place, the entire input dataset must be loaded into the cluster. This can introduce sizable latency when the data set is large, and when it is not possible to load the data once, and process many times - A situation that exists for log files, health records and protected texts for instance. We propose a data pipeline approach to hide data upload latency in MapReduce analysis. Our implementation, which is based on Hadoop MapReduce, is completely transparent to user. It introduces a distributed concurrency queue to coordinate data block allocation and synchronization so as to overlap data upload and execution. The paper overcomes two challenges: A fixed number of maps scheduling and dynamic number of maps scheduling allows for better handling of input data sets of unknown size. We also employ delay scheduler to achieve data locality for data pipeline. The evaluation of the solution on different applications on real world data sets shows that our approach shows performance gains. Copyright © 2013 by The Institute of Electrical and Electronics Engineers, Inc.}, bibtype = {inproceedings}, author = {Zeng, J. and Plale, B.}, doi = {10.1109/eScience.2013.21}, booktitle = {Proceedings - IEEE 9th International Conference on e-Science, e-Science 2013} }
@inproceedings{ title = {Achieving TeraCUPS on Longest Common Subsequence Problem Using GPGPUs}, type = {inproceedings}, year = {2013}, keywords = {2013,pti}, pages = {69-77}, websites = {http://dx.doi.org/10.1109/ICPADS.2013.22}, publisher = {IEEE}, city = {Seoul, South Korea}, id = {0eadd502-d0a9-346f-8e77-0162f1eb72ce}, created = {2018-03-05T18:20:28.266Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:28.266Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Ozsoy2013}, source_type = {inproceedings}, private_publication = {false}, abstract = {In this paper, we describe a novel technique to optimize longest common subsequence (LCS) algorithm for one-to-many matching problem on GPUs by transforming the computation into bit-wise operations and a post-processing step. The former can be highly optimized and achieves more than a trillion operations (cell updates) per second (CUPS)-a first for LCS algorithms. The latter is more efficiently done on CPUs, in a fraction of the bit-wise computation time. The bit-wise step promises to be a foundational step and a fundamentally new approach to developing algorithms for increasingly popular heterogeneous environments that could dramatically increase the applicability of hybrid CPU-GPU environments. © 2013 IEEE.}, bibtype = {inproceedings}, author = {Ozsoy, Adnan and Chauhan, Arun and Swany, Martin}, doi = {10.1109/ICPADS.2013.22}, booktitle = {Parallel and Distributed Systems (ICPADS13), 2013 International Conference on} }
@inproceedings{ title = {Design and Implementation of a Unified Network Information Service}, type = {inproceedings}, year = {2013}, keywords = {2013,pti}, websites = {http://go.iu.edu/nWU}, city = {Santa Clara, CA}, id = {210a5861-0711-39c7-aefc-ef89dccf9f01}, created = {2018-03-05T18:20:28.763Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:28.763Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {El-Hassany2013}, source_type = {inproceedings}, private_publication = {false}, abstract = {A holistic view of the network is key to the successful operation of many distributed, cloud-based, and service-oriented computing architectures. Supporting network-aware applications and application-driven networks requires a detailed representation of network resources, including multi-layer topologies, associated measurement data, and in-the-network service location and availability information. The rapid development of increasingly configurable and dynamic networks has increased the demand for information services that can accurately and efficiently store and expose the state of the network. This work introduces our Unified Network Information Service (UNIS), designed to represent physical and virtual networks and services. We describe the UNIS network data model and its RESTful interface, which provide a common interface to topology, service, and measurement resources. In addition, we describe the security mechanisms built into the UNIS framework. Our analysis of the UNIS implementation shows significant performance and scalability gains over an existing and widely-deployed topology, service registration, and lookup information service architecture. 
© 2013 IEEE.}, bibtype = {inproceedings}, author = {El-Hassany, Ahmed and Kissel, Ezra and Gunter, Dan and Swany, Martin}, doi = {10.1109/SCC.2013.81}, booktitle = {Proceedings - IEEE 10th International Conference on Services Computing, SCC 2013} }
@inproceedings{ title = {Storm surge simulation and load balancing in azure cloud}, type = {inproceedings}, year = {2013}, volume = {45}, issue = {6}, id = {192f8e57-2480-3d76-ac36-eba8e686440e}, created = {2018-03-05T18:20:29.057Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:29.057Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Chakraborty2013}, private_publication = {false}, abstract = {Cloud computing platforms are drawing increasing attention of the scientific research communities. By providing a framework to lease computation resources, cloud computing enables the scientists to carry out large-scale experiments in a cost-effective fashion without incurring high setup and maintenance costs of a large compute system. In this paper, we study the implementation and scalability issues in deploying a particular class of computational science applications. Using Platform-as-a-Service (PAAS) of Windows Azure cloud, we implement a high-throughput Storm-Surge Simulation in both a middleware framework for deploying jobs (in cloud and grid environment) and a MapReduce framework - a data parallel programming model for processing large data sets. We present the detailed techniques to balance the simulation loads while parallelizing the application across a large number of nodes.}, bibtype = {inproceedings}, author = {Chakraborty, A. and Pathirage, M. and Suriarachchi, I. and Chandrasekar, K. and Mattocks, C. and Plale, B.}, booktitle = {Simulation Series} }
@inproceedings{ title = {Milieu: Lightweight and configurable big data provenance for science}, type = {inproceedings}, year = {2013}, id = {ebfc81ea-492c-3be4-a082-1289789c61f0}, created = {2018-03-05T18:20:29.100Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:29.100Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Cheah2013}, private_publication = {false}, abstract = {The volume and complexity of data produced and analyzed in scientific collaborations is growing exponentially. It is important to track scientific data-intensive analysis workflows to provide context and reproducibility as data is transformed in these collaborations. Provenance addresses this need and aids scientists by providing the lineage or history of how data is generated, used and modified. Provenance has traditionally been collected at the workflow level often making it hard to capture relevant information about resource characteristics and is difficult for users to easily incorporate in existing workflows. In this paper, we describe Milieu, a framework focused on the collection of provenance for scientific experiments in High Performance Computing systems. Our approach collects provenance in a minimally intrusive way without significantly impacting the performance of the execution of scientific workflows. We also provide fidelity to our provenance collection by allowing users to specify three levels of provenance collection. We evaluate our framework on systems at the National Energy Research Scientific Computing Center (NERSC) and show that the overhead is less than the variation already experienced by these applications in these shared environments. © 2013 IEEE.}, bibtype = {inproceedings}, author = {Cheah, Y.-W. and Canon, R. and Plale, B. 
and Ramakrishnan, L.}, doi = {10.1109/BigData.Congress.2013.16}, booktitle = {Proceedings - 2013 IEEE International Congress on Big Data, BigData 2013} }
@techreport{ title = {Repository of NSF Funded Publications and Data Sets: "Back of Envelope" 15 year Cost Estimate}, type = {techreport}, year = {2013}, id = {d3bf2900-48e2-392b-a6f7-93228952fa91}, created = {2018-03-05T18:20:29.123Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:29.123Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2013c}, source_type = {RPRT}, private_publication = {false}, bibtype = {techreport}, author = {Plale, Beth and Kouper, Inna and McDonald, Robert and Seiffert, Kurt and Konkiel, Stacy}
@inproceedings{ title = {Hierarchical MapReduce programming model and scheduling algorithms}, type = {inproceedings}, year = {2012}, id = {48db13c5-26bd-3e44-b0b7-38469121f18e}, created = {2018-03-05T18:20:21.155Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:21.155Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Luo2012}, private_publication = {false}, abstract = {We present a Hierarchical MapReduce framework that gathers computation resources from different clusters and runs MapReduce jobs across them. The applications implemented in this framework adopt the Map-Reduce-Global Reduce model where computations are expressed as three functions: Map, Reduce, and Global Reduce. Two scheduling algorithms are introduced: Compute Capacity Aware Scheduling for compute-intensive jobs and Data Location Aware Scheduling for data-intensive jobs. Experimental evaluations using a molecule binding prediction tool, Auto Dock, and grep demonstrate promising results for our framework. © 2012 IEEE.}, bibtype = {inproceedings}, author = {Luo, Y. and Plale, B.}, doi = {10.1109/CCGrid.2012.132}, booktitle = {Proceedings - 12th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing, CCGrid 2012} }
@techreport{ title = {48 Month Program Report}, type = {techreport}, year = {2012}, id = {a2cddcae-7ee9-39f1-9489-3fcfba9c930e}, created = {2018-03-05T18:20:21.999Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:21.999Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {McRobbie2012}, source_type = {JOUR}, private_publication = {false}, bibtype = {techreport}, author = {McRobbie, Michael A and Wheeler, Bradley C and Plale, Beth A and Stewart, Craig A} }
@inproceedings{ title = {Managing the long tail of science: Data and communities}, type = {inproceedings}, year = {2012}, id = {493d35c3-55b3-3708-abea-89327a2357f7}, created = {2018-03-05T18:20:22.372Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:22.372Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Plale2012c}, private_publication = {false}, abstract = {We describe the origins of the long tail of data, and discuss the form it takes for scientific and scholarly data. This has implications and opportunities for managing the long tail that are articulated as a set of questions posed as a starting point for discussions by a panel at XSEDE 2012. © 2012 Author.}, bibtype = {inproceedings}, author = {Plale, B.}, doi = {10.1145/2335755.2335866}, booktitle = {ACM International Conference Proceeding Series} }
@inproceedings{ title = {Evaluating high performance data transfer with RDMA-based protocols in wide-area networks}, type = {inproceedings}, year = {2012}, id = {0d9f68f1-b3db-3a4e-a39e-f52e2ff34e42}, created = {2018-03-05T18:20:22.449Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:22.449Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Kissel2012b}, private_publication = {false}, abstract = {The use of zero-copy RDMA is a promising area of development in support of high-performance data movement over wide-area networks. In particular, the emerging RDMA over Converged Ethernet (RoCE) standard enables the InfiniBand transport for use over existing and widely deployed network infrastructure. In this paper, we evaluate the use of RDMA over Ethernet in two deployment scenarios: 1) a gateway approach that adapts standard application connections to an RDMA-based protocol for transmission over wide-area network paths, and 2) the integration of our RDMA implementation into GridFTP, a popular data transfer tool for distributed computing. We evaluate both approaches over a number of wide-area network conditions emulated using a commercial network emulation device, and we analyze the overhead of our RDMA implementations from a systems perspective. Our results show a significant increase in network utilization and performance when using RDMA over high-latency paths with a reduced CPU and memory I/O footprint on our gateways and end host applications. © 2012 IEEE.}, bibtype = {inproceedings}, author = {Kissel, E. and Swany, M.}, doi = {10.1109/HPCC.2012.113}, booktitle = {Proceedings of the 14th IEEE International Conference on High Performance Computing and Communications, HPCC-2012 - 9th IEEE International Conference on Embedded Software and Systems, ICESS-2012} }
@techreport{ title = {Validating linux network emulation}, type = {techreport}, year = {2012}, publisher = {Technical Report UDEL-2012/004, Dept. of CIS, University of Delaware}, id = {0ce81e52-1375-3958-8cdb-cb280e01258b}, created = {2018-03-05T18:20:22.548Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:22.548Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kissel2012}, source_type = {RPRT}, private_publication = {false}, bibtype = {techreport}, author = {Kissel, Ezra and Swany, Martin} }
@inproceedings{ title = {The challenges and opportunities of workflow systems in environmental research}, type = {inproceedings}, year = {2012}, pages = {1-5}, id = {e5e383f3-c296-37b1-8f19-3f4cfe9b3dab}, created = {2018-03-05T18:20:22.826Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:22.826Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2012a}, source_type = {CONF}, private_publication = {false}, bibtype = {inproceedings}, author = {Plale, Beth}, booktitle = {WIRADA Science Symposium Proceedings, Melbourne, Australia, August} }
@inproceedings{ title = {Efficient data transfer protocols for big data}, type = {inproceedings}, year = {2012}, id = {c6a6dbc1-cfde-3b82-b4d0-eaa0281ad4fd}, created = {2018-03-05T18:20:23.050Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:23.050Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Tierney2012}, private_publication = {false}, abstract = {Data set sizes are growing exponentially, so it is important to use data movement protocols that are the most efficient available. Most data movement tools today rely on TCP over sockets, which limits flows to around 20Gbps on today's hardware. RDMA over Converged Ethernet (RoCE) is a promising new technology for high-performance network data movement with minimal CPU impact over circuit-based infrastructures.We compare the performance of TCP, UDP, UDT, and RoCE over high latency 10Gbps and 40Gbps network paths, and show that RoCE-based data transfers can fill a 40Gbps path using much less CPU than other protocols. We also show that the Linux zero-copy system calls can improve TCP performance considerably, especially on current Intel "Sandy Bridge"-based PCI Express 3.0 (Gen3) hosts. ©2012 IEEE.}, bibtype = {inproceedings}, author = {Tierney, B. and Kissel, E. and Swany, M. and Pouyoul, E.}, doi = {10.1109/eScience.2012.6404462}, booktitle = {2012 IEEE 8th International Conference on E-Science, e-Science 2012} }
@inproceedings{ title = {Provenance analysis: Towards quality provenance}, type = {inproceedings}, year = {2012}, id = {9f2d9f02-0cb2-33dc-94d2-4f5283700477}, created = {2018-03-05T18:20:23.131Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:23.131Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Cheah2012}, private_publication = {false}, abstract = {Data provenance, a key piece of metadata that describes the lifecycle of a data product, is crucial in aiding scientists to better understand and facilitate reproducibility and reuse of scientific results. Provenance collection systems often capture provenance on the fly and the protocol between application and provenance tool may not be reliable. As a result, data provenance can become ambiguous or simply inaccurate. In this paper, we identify likely quality issues in data provenance. We also establish crucial quality dimensions that are especially critical for the evaluation of provenance quality. We analyze synthetic and real-world provenance based on these quality dimensions and summarize our contributions to provenance quality. ©2012 IEEE.}, bibtype = {inproceedings}, author = {Cheah, Y.-W. and Plale, B.}, doi = {10.1109/eScience.2012.6404480}, booktitle = {2012 IEEE 8th International Conference on E-Science, e-Science 2012} }
@inproceedings{ title = {Driving software defined networks with XSP}, type = {inproceedings}, year = {2012}, id = {735b6f71-09e6-3940-8bc7-b7635dd1d4b9}, created = {2018-03-05T18:20:23.306Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:23.306Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Kissel2012d}, private_publication = {false}, abstract = {This paper presents the eXtensible Session Protocol (XSP), which provides a control plane for driving Software Defined Networks (SDNs). The XSP model supports proactive, application-driven configuration of dynamic network resources with support for authentication and authorization, within an extensible protocol framework. We describe XSP application use cases in SDNs using OpenFlow enabled network devices as well as dynamic forwarding rule management that can be implemented on existing router platforms. © 2012 IEEE.}, bibtype = {inproceedings}, author = {Kissel, E. and Fernandes, G. and Jaffee, M. and Swany, M. and Zhang, M.}, doi = {10.1109/ICC.2012.6364805}, booktitle = {IEEE International Conference on Communications} }
@inproceedings{ title = {Scalable integrated performance analysis of multi-gigabit networks}, type = {inproceedings}, year = {2012}, id = {888f0a63-76ff-3ba0-8672-54e6a4b0a75a}, created = {2018-03-05T18:20:23.574Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:23.574Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Kissel2012c}, private_publication = {false}, abstract = {Monitoring and managing multi-gigabit networks requires dynamic adaptation to end-to-end performance characteristics. This paper presents a measurement collection and analysis framework that automates the troubleshooting of end-to-end network bottlenecks. We integrate real-time host, application, and network measurements with a common representation (compatible with perfSONAR) within a flexible and scalable architecture. Our measurement architecture is supported by a light-weight eXtensible Session Protocol (XSP), which enables context-sensitive adaptive measurement collection. We evaluate the ability of our system to analyze and detect bottleneck conditions over a series of high-speed and I/O intensive bulk data transfer experiments and find that the overhead of the system is very low and that we are able to detect and understand a variety of bottlenecks. © 2012 IEEE.}, bibtype = {inproceedings}, author = {Kissel, E. and El-Hassany, A. and Fernandes, G. and Swany, M. and Gunter, D. and Samak, T. and Schopf, J.M.}, doi = {10.1109/NOMS.2012.6212056}, booktitle = {Proceedings of the 2012 IEEE Network Operations and Management Symposium, NOMS 2012} }
@inproceedings{ title = {Effectiveness of hybrid workflow systems for computational science}, type = {inproceedings}, year = {2012}, volume = {9}, id = {10c51492-6f43-37a3-8852-6ee4cceab7e2}, created = {2018-03-05T18:20:23.923Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:23.923Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Plale2012d}, private_publication = {false}, abstract = {The workflow and its supporting systems are integral to computational science. Tailored to loosely coupled, and largely coarse-grained tasks, the workflow replaces the script as a way to automate the multiple steps of a large scale model. Workflow reuse has been at the subworkflow level but this restricts, over the long run, a workflow to running on the system on which it was developed. A scientist wanting to use two workflows developed by two different people and for different workflow systems will need to have access to both workflow systems. The contribution this paper makes is a qualitative and quantitative study of the tradeoffs of a hybrid workflow solution that utilizes multiple workflow systems and solutions to execute a single workflow. Our results indicate that the major tradeoffs are not in performance as much as they are in complexity. © 2012 Published by Elsevier Ltd.}, bibtype = {inproceedings}, author = {Plale, B. and Withana, E.C. and Herath, C. and Chandrasekar, K. and Luo, Y.}, doi = {10.1016/j.procs.2012.04.054}, booktitle = {Procedia Computer Science} }
@article{ title = {SEAD Virtual Archive: Building a Federation of Institutional Repositories for Long Term Data Preservation}, type = {article}, year = {2012}, volume = {8}, publisher = {Digital Curation Centre}, id = {be40aa81-789f-3199-8091-4a91d20c46ca}, created = {2018-03-05T18:20:24.101Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:24.101Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2012}, source_type = {JOUR}, private_publication = {false}, bibtype = {article}, author = {Plale, Beth and McDonald, Robert H and Chandrasekar, Kavitha and Kouper, Inna and Konkiel, Stacy and Hedstrom, Margaret L and Myers, Jim and Kumar, Praveen}, doi = {https://doi.org/10.2218/ijdc.v8i2.281}, journal = {International Journal of Digital Curation}, number = {2} }
@inproceedings{ title = {Temporal representation for scientific data provenance}, type = {inproceedings}, year = {2012}, id = {124ce287-648d-33fb-8b89-a9d9114e8531}, created = {2018-03-05T18:20:24.409Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:24.409Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Chen2012d}, private_publication = {false}, abstract = {Provenance of digital scientific data is an important piece of the metadata of a data object. It can however grow voluminous quickly because the granularity level of capture can be high. It can also be quite feature rich. We propose a representation of the provenance data based on logical time that reduces the feature space. Creating time and frequency domain representations of the provenance, we apply clustering, classification and association rule mining to the abstract representations to determine the usefulness of the temporal representation. We evaluate the temporal representation using an existing 10 GB database of provenance captured from a range of scientific workflows. ©2012 IEEE.}, bibtype = {inproceedings}, author = {Chen, P. and Plale, B. and Aktas, M.S.}, doi = {10.1109/eScience.2012.6404477}, booktitle = {2012 IEEE 8th International Conference on E-Science, e-Science 2012} }
@book{ title = {Second International Workshop on Traceability and Compliance of Semi-Structured Processes (TC4SP 2011)}, type = {book}, year = {2012}, source = {Lecture Notes in Business Information Processing}, volume = {99 LNBIP}, issue = {PART 1}, id = {131e168f-556a-3382-ac09-c558ca239eb4}, created = {2018-03-05T18:20:24.699Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:24.699Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Curbera2012}, private_publication = {false}, bibtype = {book}, author = {Curbera, F. and Leymann, F. and Nezhad, H.R.M. and Plale, B.} }
@article{ title = {The extensible session protocol: A protocol for future internet architectures}, type = {article}, year = {2012}, id = {39991b1c-b160-3d98-bc76-bc780f4825c0}, created = {2018-03-05T18:20:25.180Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:25.180Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kissel2012a}, source_type = {JOUR}, private_publication = {false}, bibtype = {article}, author = {Kissel, Ezra and Swany, Martin}, journal = {Rapport technique} }
@article{ title = {Sigiri: Uniform resource abstraction for grids and clouds}, type = {article}, year = {2012}, volume = {24}, id = {c2e74c00-8199-39b0-ae1c-524f0bbdd624}, created = {2018-03-05T18:20:25.361Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:25.361Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Withana2012}, private_publication = {false}, abstract = {With the maturation of grid computing facilities and recent explosion of cloud computing data centers, midscale computational science has more options than ever before to satisfy computational needs. But heterogeneity brings complexity. We propose a simple abstraction for interaction with heterogeneous resource managers spanning grid and cloud computing and on features that make the tool useful for the midscale physical or natural scientist. Key strengths of the abstraction are its support for multiple standard job specification languages, preservation of direct user interaction with the service, removing the delay that can come through layers of services, and the predictable behavior under heavy loads. Copyright © 2012 John Wiley & Sons, Ltd.}, bibtype = {article}, author = {Withana, E.C. and Plale, B.}, doi = {10.1002/cpe.2823}, journal = {Concurrency Computation Practice and Experience}, number = {18} }
@inproceedings{ title = {Exploiting network parallelism for improving data transfer performance}, type = {inproceedings}, year = {2012}, id = {82f0c19f-65fc-395f-b5ea-237b8170989c}, created = {2018-03-05T18:20:26.479Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:26.479Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Gunter2012}, private_publication = {false}, abstract = {Many scientific applications, including bulk data transfer, can achieve significantly higher performance from virtually loss-free dedicated resources provisioned on shared links, than from opportunistic network use. Research and Education (R&E) backbones, including the Energy Sciences Network and Internet2, provide general-purpose services to allocate dedicated bandwidth. However, in order to fully take advantage of this technology, applications need to move from coarse-grained 'reservation' strategies, to more sophisticated control based on software defined networking (SDN) with technologies such as OpenFlow. We propose here, as one practical step in this direction, using multiple paths for the same application transfer session. This can add bandwidth from 'best effort' and dedicated networks, and can also facilitate performance with applications using multiple 10G NICs over 100G capable paths. © 2012 IEEE.}, bibtype = {inproceedings}, author = {Gunter, D. and Kettimuthu, R. and Kissel, E. and Swany, M. and Yi, J. and Zurawski, J.}, doi = {10.1109/SC.Companion.2012.337}, booktitle = {Proceedings - 2012 SC Companion: High Performance Computing, Networking Storage and Analysis, SCC 2012} }
@inproceedings{ title = {Middleware alternatives for storm surge predictions in Windows Azure}, type = {inproceedings}, year = {2012}, id = {b4ece792-f5bf-3d1a-8e9f-c35ee2721742}, created = {2018-03-05T18:20:26.536Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:26.536Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Chandrasekar2012}, private_publication = {false}, abstract = {Cloud computing is a resource of significant value to computational science, but has proven itself to be not immediately realizable by the researcher. The cloud providers that offer a Platform-as-a-Service (PaaS) platform should, in theory, offer a sound alternative to infrastructure-as-a-service as it could be easier to take advantage of for computational science kinds of problems. The objective of our study is to assess how well the Azure platform as a service can serve a particular class of computational science application. We conduct a performance evaluation using three approaches to executing a high-throughput storm surge application: using Sigiri, a large scale resource abstraction tool, Windows Azure HPC scheduler, and Daytona, an Iterative Map-reduce runtime for Azure. The differences in the approaches including early performance measures for up to 500 instances are discussed. Copyright © 2012 ACM.}, bibtype = {inproceedings}, author = {Chandrasekar, K. and Pathirage, M. and Wijeratne, S. and Mattocks, C. and Plale, B.}, doi = {10.1145/2287036.2287040}, booktitle = {ScienceCloud '12 - 3rd Workshop on Scientific Cloud Computing} }
@inproceedings{ title = {Pipelined parallel LZSS for streaming data compression on GPGPUs}, type = {inproceedings}, year = {2012}, id = {42d49cbe-9d1c-3e6c-acdb-1fc8597adc74}, created = {2018-03-05T18:20:26.667Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:26.667Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Ozsoy2012}, private_publication = {false}, abstract = {In this paper, we present an algorithm and provide design improvements needed to port the serial Lempel-Ziv-Storer-Szymanski (LZSS), lossless data compression algorithm, to a parallelized version suitable for general purpose graphic processor units (GPGPU), specifically for NVIDIA's CUDA Framework. The two main stages of the algorithm, substring matching and encoding, are studied in detail to fit into the GPU architecture. We conducted detailed analysis of our performance results and compared them to serial and parallel CPU implementations of LZSS algorithm. We also benchmarked our algorithm in comparison with well known, widely used programs; GZIP and ZLIB. We achieved up to 34x better throughput than the serial CPU implementation of LZSS algorithm and up to 2.21x better than the parallelized version. © 2012 IEEE.}, bibtype = {inproceedings}, author = {Ozsoy, A. and Swany, M. and Chauhan, A.}, doi = {10.1109/ICPADS.2012.16}, booktitle = {Proceedings of the International Conference on Parallel and Distributed Systems - ICPADS} }
@article{ title = {Mining classifications from social-ecological databases}, type = {article}, year = {2012}, volume = {49}, id = {62d443dc-31c4-3ce5-8ba1-194d74de4be0}, created = {2018-03-05T18:20:26.760Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:26.760Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Jensen2012}, private_publication = {false}, abstract = {Social-ecological research is characteristic of long-tail science, with many region-specific studies of social and ecological phenomena that collectively yield a large volume of highly heterogeneous, small data sets. This variability makes it difficult to determine the applicability of a particular data set for a new research question, hindering the reuse of data that has been often collected through extensive effort. In this paper we present results of automatic classification of socio-ecological data into categories defined by a domain model called the SES Framework. We have applied our methods to the classification of a relational database containing over 18 years of research on forest systems. Our preliminary results suggest that decision tree-based classifiers along with textual features perform well at this task. Furthermore, social-ecological data sets are found to exhibit distinct classification features in that the results are promising even for classes that comprise a relatively small portion of the database.}, bibtype = {article}, author = {Jensen, S. and Chen, M. and Liu, X. and Plale, B. and Leake, D.}, doi = {10.1002/meet.14504901301}, journal = {Proceedings of the ASIST Annual Meeting}, number = {1} }
@inproceedings{ title = {Opening Data in the Long Tail for Community Discovery, Curation and Action Using Active and Social Curation}, type = {inproceedings}, year = {2012}, id = {e256df9d-0fdf-3760-83be-c91808c78550}, created = {2018-03-05T18:20:27.081Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-03-05T18:20:27.081Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Hedstrom2012}, source_type = {CONF}, private_publication = {false}, bibtype = {inproceedings}, author = {Hedstrom, M L and Kumar, P and Myers, J and Plale, B A}, booktitle = {AGU Fall Meeting Abstracts} }
@inproceedings{ title = {Active and Social Data Curation: Reinventing the Business of Community-scale Lifecycle Data Management}, type = {inproceedings}, year = {2012}, id = {7f83dca4-ec54-3d0b-929c-402b2b7be6a2}, created = {2018-03-05T18:20:27.537Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:44.770Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {McDonald2012}, source_type = {CONF}, private_publication = {false}, bibtype = {inproceedings}, author = {McDonald, Robert H. and Kumar, P and Plale, Beth A. and Myers, James D. and Hedstrom, Margaret L.}, booktitle = {AGU Fall Meeting Abstracts} }
@inproceedings{ title = {Visualization of network data provenance}, type = {inproceedings}, year = {2012}, id = {a13323bd-d227-3a94-93d7-18edbefd136e}, created = {2018-03-05T18:20:28.383Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:50.560Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Chen2012a}, private_publication = {false}, abstract = {Visualization facilitates the understanding of scientific data both through exploration and explanation of the visualized data. Provenance also contributes to the understanding of data by containing the contributing factors behind a result. The visualization of provenance, although supported in existing workflow management systems, generally focuses on small (medium) sized provenance data, lacking techniques to deal with big data with high complexity. This paper discusses visualization techniques developed for exploration and explanation of provenance, including layout algorithm, visual style, graph abstraction techniques, and graph matching algorithm, to deal with the high complexity. We demonstrate through application to two extensively analyzed case studies that involved provenance capture and use over three year projects, the first involving provenance of a satellite imagery ingest processing pipeline and the other of provenance in a large-scale computer network testbed. © 2012 IEEE.}, bibtype = {inproceedings}, author = {Chen, P. and Plale, B. and Cheah, Y.-W. and Ghoshal, D. and Jensen, S. and Luo, Y.}, doi = {10.1109/HiPC.2012.6507517}, booktitle = {19th International Conference on High Performance Computing, HiPC 2012} }
@inproceedings{ title = {From metadata to ontology representation: A case of converting severe weather forecast metadata to an ontology}, type = {inproceedings}, year = {2012}, volume = {49}, issue = {1}, id = {78cc639e-790f-32ad-ab60-943382a162e6}, created = {2018-03-05T18:20:28.773Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:49.138Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Chen2012b}, private_publication = {false}, abstract = {With the increasing amount of data generated in geoscience research, it becomes critical to describe data sets in meaningful ways. A large number of described data sets are described using XML metadata, which has proved a useful means of expressing data characteristics. An ontological representation is another way of representing data sets with the benefit of providing rich semantics, convenient linkage to other data sets, and good interoperability with other data. This study represents geoscience data sets as an ontology based on an existing metadata description and on the nature of the data set. It takes the case of Vortex2 data, a regional weather forecast data set collected in Summer 2010, to showcase how forecast data can be represented in ontology by using the existing metadata information. It supplies another type of representation of the data set with added semantics and potential functionalities compared to the previous metadata representation. Copyright © 2012 by American Society for Information Science and Technology.}, bibtype = {inproceedings}, author = {Chen, M. and Plale, B.}, doi = {10.1002/meet.14504901286}, booktitle = {Proceedings of the ASIST Annual Meeting} }
@techreport{ title = {2012 Annual Report on Training, Education, and Outreach Activities of the Indiana University Pervasive Technology Institute and affiliated organizations}, type = {techreport}, year = {2012}, institution = {Pervasive Technology Institute, Indiana University Bloomington}, id = {627d34ff-0f8a-3258-81d5-092112b7adbb}, created = {2018-03-05T18:20:28.852Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:47.143Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {mpps12}, source_type = {techreport}, private_publication = {false}, bibtype = {techreport}, author = {Miller, T and Ping, Robert J and Plale, Beth A. and Stewart, Craig A.} }
@techreport{ title = {HathiTrust Research Center: Computational Research on the HathiTrust Repository}, type = {techreport}, year = {2012}, websites = {https://scholarworks.iu.edu/dspace/handle/2022/14133}, institution = {Indiana University}, id = {00a9f44d-6cf0-3c64-a2ea-7820175e2e69}, created = {2018-03-05T18:20:28.857Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:48.455Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2012b}, source_type = {RPRT}, private_publication = {false}, bibtype = {techreport}, author = {Plale, Beth A and Poole, Marshall Scott and McDonald, Robert H. and Unsworth, John} }
@inproceedings{ title = {Generalized representation and mapping for social-ecological data: Freeing data from the database}, type = {inproceedings}, year = {2012}, id = {a2a96004-e0ea-31ed-9050-31e074785350}, created = {2018-03-05T18:20:29.269Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:45.367Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Jensen2012a}, private_publication = {false}, abstract = {Scientific discovery increasingly requires collaboration between scientific sub-domains that often have different representations for their data. To bridge gaps between varying domain representations, researchers are developing metadata and semantic representations meaningful to broader communities. Through exploiting these representations we propose a logical model and architecture by which cross-domain researchers can more easily discover, use, and eventually archive, data. In this paper we present an architecture, intermediate data model, and methodology for mapping diverse social-ecological data sources stored in relational databases to a common representation, and for classifying textual data using machine learning. The results are visualized through client views that are built against the general logical model, and applied against a longitudinal database from social-ecological research. ©2012 IEEE.}, bibtype = {inproceedings}, author = {Jensen, S. and Plale, B. and Liu, X. and Chen, M. and Leake, D. and England, J.}, doi = {10.1109/eScience.2012.6404486}, booktitle = {2012 IEEE 8th International Conference on E-Science, e-Science 2012} }
@techreport{ title = {Temporal Data Mining of Scientific Data Provenance}, type = {techreport}, year = {2012}, source = {Indiana University Computer Science Technique Report. TR701}, id = {d40357fb-5149-31ec-9a56-6258326fe03d}, created = {2018-03-05T18:20:29.382Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:50.046Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Chen2012}, source_type = {JOUR}, private_publication = {false}, bibtype = {techreport}, author = {Chen, Peng and Plale, Beth and Aktas, Mehmet} }
@inproceedings{ title = {Visualizing large scale scientific data provenance}, type = {inproceedings}, year = {2012}, id = {dfb40118-b9be-3ca4-b04b-4c35056bf258}, created = {2018-03-05T18:20:29.566Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:50.496Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Chen2012c}, private_publication = {false}, abstract = {Visualization increases the understanding of scientific data by facilitating exploration and explanation of the data. Provenance contributes to data understanding by exposing contributing factors that went in to producing a particular research result. However, provenance of scientific data can grow voluminous quickly because of the large amount of (intermediate) data and ever-increasing complexity. While previous research on visualizing provenance data focuses on small to medium sized provenance data, we develop visualization techniques for exploration and explanation of large scale provenance, including layout algorithm, visual style, graph abstraction techniques, graph matching algorithm, and temporal representation technique to deal with the high complexity. © 2012 IEEE.}, bibtype = {inproceedings}, author = {Chen, P. and Plale, B.}, doi = {10.1109/SC.Companion.2012.205}, booktitle = {Proceedings - 2012 SC Companion: High Performance Computing, Networking Storage and Analysis, SCC 2012} }
@inproceedings{ title = {Spatial Data in an Ontology for Research on Forest Resources}, type = {inproceedings}, year = {2011}, pages = {28-30}, id = {65504d43-598f-39a0-9204-9439ed5f182a}, created = {2018-03-05T18:20:21.287Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:51.471Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Jensen2011}, source_type = {JOUR}, private_publication = {false}, bibtype = {inproceedings}, author = {Jensen, Scott and Cox, Michael and Bender, David and Chen, Miao and England, Julie and Plale, Beth and Leake, David}, booktitle = {COSIT'11 Workshop} }
@article{ title = {The Open Provenance Model core specification (v1.1)}, type = {article}, year = {2011}, volume = {27}, id = {060f5e7b-d61a-3970-90dd-da967cc0c2c2}, created = {2018-03-05T18:20:21.634Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:46.846Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Moreau2011}, private_publication = {false}, abstract = {The Open Provenance Model is a model of provenance that is designed to meet the following requirements: (1) Allow provenance information to be exchanged between systems, by means of a compatibility layer based on a shared provenance model. (2) Allow developers to build and share tools that operate on such a provenance model. (3) Define provenance in a precise, technology-agnostic manner. (4) Support a digital representation of provenance for any "thing", whether produced by computer systems or not. (5) Allow multiple levels of description to coexist. (6) Define a core set of rules that identify the valid inferences that can be made on provenance representation. This document contains the specification of the Open Provenance Model (v1.1) resulting from a community effort to achieve inter-operability in the Provenance Challenge series. © 2011 Elsevier B.V. All rights reserved.}, bibtype = {article}, author = {Moreau, L. and Clifford, B. and Freire, J. and Futrelle, J. and Gil, Y. and Groth, P. and Kwasnikowska, N. and Miles, S. and Missier, P. and Myers, J. and Plale, B. and Simmhan, Y. and Stephan, E. and Den Bussche, J.V.}, doi = {10.1016/j.future.2010.07.005}, journal = {Future Generation Computer Systems}, number = {6} }
@book{ title = {Data provenance for preservation of digital geoscience data}, type = {book}, year = {2011}, source = {Special Paper of the Geological Society of America}, volume = {482}, id = {13196106-f958-3aec-beb6-04ac7c07ed0d}, created = {2018-03-05T18:20:22.016Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:51.642Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2011d}, private_publication = {false}, abstract = {The first step in the preservation of digital scientific data is gathering enough information "about" a scientific outcome or data collection so that it can be discovered and used a decade later as easily as it is used at the time. Data provenance, or lineage of a collection, can capture the way in which a particular scientific collection was created, when, and by whom. Tools that automate the collection of provenance can reduce the burden on the researcher, and provenance data can be stored in ways that make the data more amenable to long-term preservation. We discuss the various dimensions of data provenance in data-driven geospatial science with the goal of conveying a good grasp of provenance collection, representation, and use. Our research in data cyberinfrastructure utilizes real-time observational data in on-demand weather forecasts, and we discuss this aspect as well. © 2011 The Geological Society of America. All rights reserved.}, bibtype = {book}, author = {Plale, B. and Cao, B. and Herath, C. and Sun, Y.}, doi = {10.1130/2011.2482(11)} }
@article{ title = {Key Provenance of Earth Science Observational Data Products}, type = {article}, year = {2011}, id = {66920cfa-b19e-3c58-9738-9c51a1f5aa81}, created = {2018-03-05T18:20:22.582Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:46.021Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Aktas2011}, source_type = {JOUR}, private_publication = {false}, bibtype = {article}, author = {Aktas, Mehmet and Plale, Beth and Conover, Helen and Purohit, Prajakta}, journal = {American Geophysical Union}, number = {Fall Meeting 2011} }
@inproceedings{ title = {A noisy 10GB provenance database}, type = {inproceedings}, year = {2011}, pages = {370-381}, volume = {100 LNBIP}, issue = {PART 2}, publisher = {Springer}, id = {8cc73a8b-6904-326b-85e6-597769a8053c}, created = {2018-03-05T18:20:23.477Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:44.479Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Cheah2011}, source_type = {CONF}, private_publication = {false}, abstract = {Provenance of scientific data is a key piece of the metadata record for the data's ongoing discovery and reuse. Provenance collection systems capture provenance on the fly, however, the protocol between application and provenance tool may not be reliable. Consequently, the provenance record can be partial, partitioned, and simply inaccurate. We use a workflow emulator that models faults to construct a large 10GB database of provenance that we know is noisy (that is, has errors). We discuss the process of generating the provenance database, and show early results on the kinds of provenance analysis enabled by the large provenance. © 2012 Springer-Verlag.}, bibtype = {inproceedings}, author = {Cheah, You-Wei Y.-W. and Plale, Beth and Kendall-Morwick, Joey and Leake, David and Ramakrishnan, Lavanya}, doi = {10.1007/978-3-642-28115-0_35}, booktitle = {International Conference on Business Process Management} }
@inproceedings{ title = {Understanding the effects of boundary layer and synoptic meteorology on new particle formation based on WRF simulations and measurements in Southern Indiana}, type = {inproceedings}, year = {2011}, id = {85612e01-78e3-38c8-ad2b-d97615fc99ba}, created = {2018-03-05T18:20:25.049Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:45.089Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Crippa2011}, source_type = {CONF}, private_publication = {false}, bibtype = {inproceedings}, author = {Crippa, P and El Afandi, G and Plale, B and Pryor, S C}, booktitle = {AGU Fall Meeting Abstracts} }
@article{ title = {Provenance Capture of Unmanaged Workflows with Karma}, type = {article}, year = {2011}, id = {f55108fb-cb3e-342f-9dfd-61e14c5f88cf}, created = {2018-03-05T18:20:25.482Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:51.917Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2011c}, source_type = {JOUR}, private_publication = {false}, bibtype = {article}, author = {Plale, B and Cao, B and Aktas, M}, journal = {Bloomington, IN, Indiana University} }
@techreport{ title = {Atmospheric sciences and informatics EarthCube driver whitepaper: Technical infrastructure}, type = {techreport}, year = {2011}, id = {98570426-5121-3d15-9240-7897115a3a67}, created = {2018-03-05T18:20:25.733Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:47.175Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2011b}, source_type = {JOUR}, private_publication = {false}, bibtype = {techreport}, author = {Plale, Beth and Clark, Rich and Mattocks, Craig and Brewster, Keith and Barthelmie, Rebecca and Droegemeier, Kelvin and Gannon, Dennis and Graves, Sara and Jensen, Scott and Mahoney, William} }
@inproceedings{ title = {A hierarchical framework for cross-domain MapReduce execution}, type = {inproceedings}, year = {2011}, id = {11cb20ac-5adf-3bce-ad3f-4e45319cfaa1}, created = {2018-03-05T18:20:26.396Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:48.689Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Luo2011}, private_publication = {false}, abstract = {The MapReduce programming model provides an easy way to execute pleasantly parallel applications. Many data-intensive life science applications fit this programming model and benefit from the scalability that can be delivered using this model. One such application is AutoDock, which consists of a suite of automated tools for predicting the bound conformations of flexible ligands to macromolecular targets. However, researchers also need sufficient computation and storage resources to fully enjoy the benefit of MapReduce. For example, a typical AutoDock based virtual screening experiment usually consists of a very large number of docking processes from multiple ligands and is often time consuming to run on a single MapReduce cluster. Although commercial clouds can provide virtually unlimited computation and storage resources on-demand, due to financial, security and possibly other concerns, many researchers still run experiments on a number of small clusters with limited number of nodes that cannot unleash the full power of MapReduce. In this paper, we present a hierarchical MapReduce framework that gathers computation resources from different clusters and run MapReduce jobs across them. The global controller in our framework splits the data set and dispatches them to multiple "local" MapReduce clusters, and balances the workload by assigning tasks in accordance to the capabilities of each cluster and of each node. 
The local results are then returned back to the global controller for global reduction. Our experimental evaluation using AutoDock over MapReduce shows that our load-balancing algorithm makes promising workload distribution across multiple clusters, and thus minimizes overall execution time span of the entire MapReduce execution. © Copyright 2011 ACM.}, bibtype = {inproceedings}, author = {Luo, Y. and Guo, Z. and Sun, Y. and Plale, B. and Qiu, J. and Li, W.W.}, doi = {10.1145/1996023.1996026}, booktitle = {ECMLS'11 - Proceedings of the 2nd International Workshop on Emerging Computational Methods for the Life Sciences} }
@article{ title = {Using provenance for personalized quality ranking of scientific datasets}, type = {article}, year = {2011}, volume = {18}, id = {e7d3c7ff-a0a9-3c09-905c-7010f8d1b94d}, created = {2018-03-05T18:20:26.728Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:47.816Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Simmhan2011}, private_publication = {false}, abstract = {The rapid growth of eScience has led to an explosion in the creation and availability of scientific datasets that includes raw instrument data and derived datasets from model simulations. A large number of these datasets are surfacing online in public and private catalogs, often annotated with XML metadata, as part of community efforts to foster open research. With this rapid expansion comes the challenge of filtering and selecting datasets that best match the needs of scientists. We address a key aspect of the scientific data discovery process by ranking search results according to a personalized data quality score based on a declarative quality profile to help scientists select the most suitable data for their applications. Our quality model is resilient to missing metadata using a novel strategy that uses provenance in its absence. Intuitively, our premise is that the quality score for a dataset depends on its provenance - the scientific task and its inputs that created the dataset - and it is possible to define a quality function based on provenance metadata that predicts the same quality score as one evaluated using the user's quality profile over the complete metadata. 
Here we present a model and architecture for data quality scoring, apply machine learning techniques to construct a quality function that uses provenance as proxy for missing metadata, and empirically test the prediction power of our quality function. Our results show that for some scientific tasks, quality scores based on provenance closely track the quality scores based on complete metadata properties with error margins between 1-29 percent.}, bibtype = {article}, author = {Simmhan, Y. and Plale, B.}, journal = {International Journal of Computers and their Applications}, number = {3} }
@inproceedings{ title = {Programming abstraction for resource aware stream processing for scientific workflows}, type = {inproceedings}, year = {2011}, id = {13040fd3-c2eb-3285-86e6-1bfc14b88d43}, created = {2018-03-05T18:20:27.115Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:49.848Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Herath2011a}, private_publication = {false}, abstract = {As the volume of real time data available for use in scientific discovery explodes, the limiting factor is increasingly the amount of time and attention a scientist can give to a problem. Processing event streams from heterogeneous sources in real time adds a dimension to e-Science workflow systems that is less well understood. Considering the types of computations that scientific workflows focus on and the latencies associated with them, it is not immediately evident that scientific workflows can directly apply to high throughput real time event processing. In this paper we propose a model for extending an established scientific workflow system to incorporate event processing without losing the richness of the programming abstraction. © 2011 IEEE.}, bibtype = {inproceedings}, author = {Herath, C. and Plale, B.}, doi = {10.1109/eScienceW.2011.20}, booktitle = {Proceedings - 7th IEEE International Conference on e-Science Workshops, eScienceW 2011} }
@techreport{ title = {Escience workflows 9 years out: Converging on a vision}, type = {techreport}, year = {2011}, websites = {https://pdfs.semanticscholar.org/baf6/50d252f1014bc4b86f6f2283f308bbc1b2b2.pdf}, publisher = {Pervasive Technology Institute, Indiana University, Bloomington}, id = {db8cbc1d-f17d-33b9-a3b9-0643d0d90d6a}, created = {2018-03-05T18:20:27.774Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:47.380Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2011a}, source_type = {RPRT}, private_publication = {false}, bibtype = {techreport}, author = {Plale, Beth and Fox, Geoffrey and Kowalczyk, Stacy and Chandrasekar, Kavitha} }
@inproceedings{ title = {Hybrid programming abstraction for e-science workflows and event processing}, type = {inproceedings}, year = {2011}, id = {836d3c8d-9ae4-37ea-b03f-e8be81fe7f29}, created = {2018-03-05T18:20:27.802Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:46.185Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Herath2011}, private_publication = {false}, abstract = {The scientific event processing for research and development adds an extra dimension to the complexities associated with scientific computing. In achieving this, experience gained building scientific computing infrastructures that could span and scale into super computing resources can be reused in significant ways. But there are many unresolved issues relating to managing event streams in such an environment that require attention. Over the years the volumes of events generated in scientific disciplines have steadily grown. The limiting factor of many of these systems has become the time and attention of scientists with expertise to derive insight out of the high volumes of events generated by the sensors. In this tutorial we propose to share the motivating use cases, research issues and outcomes and tools and frameworks used for event processing in science gateways in conjunction with complex event processing. We would provide hands on experience to the programming model, framework and tools that had evolved as a result of research over the years and relate how the scientific workflow based programming paradigm can provide a cleaner abstraction for query based Complex event processing systems. © 2011 Authors.}, bibtype = {inproceedings}, author = {Herath, C. 
and Plale, B.}, doi = {10.1145/2002259.2002311}, booktitle = {DEBS'11 - Proceedings of the 5th ACM International Conference on Distributed Event-Based Systems} }
@techreport{ title = {Evaluation of Two XML Storage Approaches for Scientific Metadata}, type = {techreport}, year = {2011}, source = {Indiana University Dept of Computer Science Tech Report}, volume = {698}, id = {d332ce4a-0b49-38bb-90c2-606394010362}, created = {2018-03-05T18:20:27.841Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:47.412Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Jensen2011a}, source_type = {JOUR}, private_publication = {false}, bibtype = {techreport}, author = {Jensen, Scott and Ghoshal, Devarshi and Plale, Beth} }
@article{ title = {Workflow Evolution: Tracing Workflows Through Time}, type = {article}, year = {2011}, id = {2a6ef768-180c-31f9-b203-a7f3755a9460}, created = {2018-03-05T18:20:28.380Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:46.944Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Chinthaka2011}, source_type = {JOUR}, private_publication = {false}, bibtype = {article}, author = {Chinthaka, Eran and Barga, Roger and Plale, Beth and Araujo, Nelson}, journal = {Microsoft Research} }
@article{ title = {Sigiri: Uniform abstraction for large-scale compute resource interactions}, type = {article}, year = {2011}, id = {9e2d6c2a-87b9-3f8b-bfd3-b9cf4ad6b5b4}, created = {2018-03-05T18:20:28.568Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:44.712Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Withana2011}, source_type = {JOUR}, private_publication = {false}, bibtype = {article}, author = {Withana, Eran Chinthaka and Plale, Beth}, journal = {School of Informatics and Computing, Indiana University, Bloomington, Indiana, Tech. Rep. TR693} }
@inproceedings{ title = {A composite project effort estimation approach in an enterprise software development project}, type = {inproceedings}, year = {2011}, pages = {331-334}, websites = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-84855554321&partnerID=40&md5=1c81dd239776315921fc2d45a520b27b}, id = {348c14a4-0299-311b-b699-53c051fbf5b4}, created = {2018-03-05T18:20:29.167Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:47.620Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Catal2011331}, source_type = {conference}, notes = {cited By 2; Conference of SEKE 2011 - Proceedings of the 23rd International Conference on Software Engineering and Knowledge Engineering ; Conference Date: 7 July 2011 Through 9 July 2011; Conference Code:88007}, private_publication = {false}, abstract = {Software effort estimation research has been ongoing for almost 40 years. Over the years, several classes of effort estimation techniques have been introduced. Some of these techniques include model-based, expertise-based, learning-oriented, regression-based, and dynamics-based effort estimations. However, none of these techniques is best for all situations. In this study, we propose a composite technique to estimate the development efforts in a recent enterprise software development project. This paper shows how we specif}, bibtype = {inproceedings}, author = {Catal, C and Aktas, M S}, booktitle = {SEKE 2011 - Proceedings of the 23rd International Conference on Software Engineering and Knowledge Engineering} }
@techreport{ title = {Strengths and weaknesses of sub-workflow interoperability}, type = {techreport}, year = {2011}, websites = {https://www.cs.indiana.edu/cgi-bin/techreports/TRNNN.cgi?trnum=TR699}, publisher = {Tech. Rep. TR700}, id = {c3f664fb-e24b-39c5-91d7-5a8093c2d7bf}, created = {2018-03-05T18:20:29.400Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:49.632Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2011}, source_type = {RPRT}, private_publication = {false}, bibtype = {techreport}, author = {Plale, Beth and Withana, Eran Chinthaka and Herath, Chathura and Chandrasekar, Kavitha and Luo, Yuan and Terkhorn, Felix} }
@inproceedings{ title = {BlogMiner: Web blog mining application for classification of movie reviews}, type = {inproceedings}, year = {2010}, keywords = {Blog mining; Discussion forum; Opinion mining; Pot,Blogs; Search engines; World Wide Web,Internet}, pages = {77-84}, websites = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-77954507682&doi=10.1109%2FICIW.2010.19&partnerID=40&md5=cd87a983f74ad1f425025aed64dec9fb}, city = {Barcelona}, id = {bc4b06c5-f926-3f72-88ef-dcf1fffdbc95}, created = {2018-03-05T18:20:21.148Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:48.276Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Baloglu201077}, source_type = {conference}, notes = {cited By 4; Conference of 5th International Conference on Internet and Web Applications and Services, ICIW 2010 ; Conference Date: 9 May 2010 Through 15 May 2010; Conference Code:81061}, private_publication = {false}, abstract = {With the increasing use of Web 2.0 platforms such as Web Blogs, discussion forums, Wikis, and various other types of social media, people began to share their experiences and opinions about products or services on the World Wide Web. Web Blogs have thus become an important source of information. In turn, great interest in blog mining has arisen, specifically due to its potential applications, such as in opinion or review search engine applications the ability to collect and analyze data. In this study, we introduce an architecture, implementation, and evaluation of a Web blog mining application, called the BlogMiner, which extracts and classifies people's opinions and emotions (or sentiment) from the contents of weblogs about movie reviews. 
© 2010 IEEE.}, bibtype = {inproceedings}, author = {Baloglu, A and Aktas, M S}, doi = {10.1109/ICIW.2010.19}, booktitle = {5th International Conference on Internet and Web Applications and Services, ICIW 2010} }
@article{ title = {High-performance hybrid information service architecture}, type = {article}, year = {2010}, pages = {2095-2123}, volume = {22}, publisher = {John Wiley & Sons, Ltd.}, id = {145f45ed-939d-3cee-8ad0-7da0af86db59}, created = {2018-03-05T18:20:21.499Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:48.063Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {aktas2010high}, source_type = {article}, private_publication = {false}, bibtype = {article}, author = {Aktas, Mehmet S and Pierce, Marlon}, journal = {Concurrency and Computation: Practice and Experience}, number = {15} }
@inproceedings{ title = {Application of Management Frameworks: A Case Study on Managing Workflow related Systems}, type = {inproceedings}, year = {2010}, pages = {519-526}, publisher = {IEEE}, id = {9acbbcfd-14fe-3da1-853a-f9cf32e701eb}, created = {2018-03-05T18:20:21.530Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:45.414Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Perera2010}, source_type = {JOUR}, private_publication = {false}, abstract = {Management architectures are well discussed in the literature, but their application in real life settings has not been as well covered. Automatic management of a system involves many more complexities than closing the controlloop by reacting to sensor data and executing corrective actions. In this paper, we discuss those complexities and propose solutions to those problems on top of Hasthi management framework, where Hasthi is a robust, scalable, and distributed management framework that enables users to manage a system by enforcing management logic authored by users themselves. Furthermore, we present in detail a real life case study, which uses Hasthi to manage a large, SOA based, E-Science Cyberinfrastructure. © 2009 IEEE.}, bibtype = {inproceedings}, author = {Perera, Srinath and Marru, Suresh and Gunarathne, Thilina and Gannon, Dennis and Plale, Beth}, doi = {10.1109/ICWS.2009.52}, booktitle = {Web Services, 2009. ICWS 2009. IEEE International Conference on} }
@article{ title = {Mobile web service architecture using context-store}, type = {article}, year = {2010}, pages = {836-858}, volume = {4}, websites = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-78449246434&doi=10.3837%2Ftiis.2010.10.008&partnerID=40&md5=3ba739b118670b16578b0515a1929a02}, id = {53ddbb67-7b5d-38b0-a155-1f15b3a45f82}, created = {2018-03-05T18:20:21.691Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:48.025Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Oh2010836}, source_type = {article}, notes = {cited By 1}, private_publication = {false}, abstract = {Web Services allow a user to integrate applications from different platforms and languages. Since mobile applications often run on heterogeneous platforms and conditions, Web Service becomes a popular solution for integrating with server applications. However, because of its verbosity, XML based SOAP messaging gives the possible overhead to the less powerful mobile devices. Based on the mobile client's behavior that it usually exchanges messages with Web Service continuously in a session, we design the Handheld Flexible Representation architecture. Our proposed architecture consists of three main components: optimizing message representation by using a data format language (Simple_DFDL), streaming communication channel to reduce latency and the Context-store to store context information of a session as well as redundant parts of the messages. In this paper, we focus on the Context-store and describe the architecture with the Context-store for improving the performance of mobile Web Service messaging. We verify our approach by conducting various evaluations and investigate the performance and scalability of the proposed architecture. 
The empirical results show that we save 40% of transit time between a client and a service by reducing the message size. In contrast to solutions for a single problem such as the compression or binarization, our architecture addresses the problem at a system level. Thus, by using the Context-store, we expect reliable recovery from the fault condition and enhancing interoperability as well as improving the messaging performance. Copyright © 2010 KSII.}, bibtype = {article}, author = {Oh, S and Aktas, M and Fox, G C}, doi = {10.3837/tiis.2010.10.008}, journal = {KSII Transactions on Internet and Information Systems}, number = {5} }
@inproceedings{ title = {Usage patterns to provision for scientific experimentation in clouds}, type = {inproceedings}, year = {2010}, id = {85edcd12-33e0-329d-baed-2b556365ad42}, created = {2018-03-05T18:20:22.246Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:46.625Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Withana2010}, private_publication = {false}, abstract = {Driven by the need to provision resources on demand, scientists are turning to commercial and research test-bed Cloud computing resources to run their scientific experiments. Job scheduling on cloud computing resources, unlike earlier platforms, is a balance between throughput and cost of executions. Within this context, we posit that usage patterns can improve the job execution, because these patterns allow a system to plan, stage and optimize scheduling decisions. This paper introduces a novel approach to utilization of user patterns drawn from knowledge-based techniques, to improve execution across a series of active workflows and jobs in cloud computing environments. Using empirical analysis we establish the accuracy of our prediction approach for two different workloads and demonstrate how this knowledge can be used to improve job executions. © 2010 IEEE.}, bibtype = {inproceedings}, author = {Withana, E.C. and Plale, B.}, doi = {10.1109/CloudCom.2010.8}, booktitle = {Proceedings - 2nd IEEE International Conference on Cloud Computing Technology and Science, CloudCom 2010} }
@inbook{ type = {inbook}, year = {2010}, keywords = {Architectural design,Grid information services,Hybrid architectures,I,Inf,Information management,Information retrieval}, pages = {66-99}, volume = {7}, issue = {1}, websites = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-76449097521&doi=10.4018%2Fjwsr.2010010104&partnerID=40&md5=9784650898d23327533a5074a211165e}, publisher = {IGI Global}, id = {ea26bee4-f908-3503-b941-e8bae3e9a4a5}, created = {2018-03-05T18:20:22.936Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:45.613Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Aktas201066}, source_type = {incollection}, notes = {<b>From Duplicate 1 (<i>A federated approach to information management in grids</i> - Aktas, M S; Fox, G C; Pierce, M)<br/></b><br/>cited By 0}, private_publication = {false}, abstract = {We propose a novel approach to managing information in grids. The proposed approach is an add-on information system that provides unification and federation of grid information services. The system interacts with local information services and assembles their metadata instances under one hybrid architecture to provide a common query/publish interface to different kinds of metadata. The system also supports interoperability of major grid information services by providing federated information management. We present the semantics and architectural design for this system. We introduce a prototype implementation and present its evaluation. As the results indicate, the proposed system achieves unification and federation of custom implementations of grid information services with negligible processing overheads. 
Copyright © 2010, IGI Global.}, bibtype = {inbook}, author = {Aktas, Mehmet S and Fox, Geoffrey C and Pierce, Marlon}, doi = {10.4018/jwsr.2010010104}, chapter = {A Federated Approach to Information Management in Grids}, title = {Web Service Composition and New Frameworks in Designing Semantics: Innovations} }
@inproceedings{ title = {What is cyberinfrastructure}, type = {inproceedings}, year = {2010}, pages = {37}, websites = {http://dx.doi.org/10.1145/1878335.1878347,http://portal.acm.org/citation.cfm?doid=1878335.1878347}, publisher = {ACM Press}, city = {New York, New York, USA}, id = {1be0c2e9-212b-30cd-a2a9-8d323985bf44}, created = {2018-03-05T18:20:22.947Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2020-09-09T18:06:45.617Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Stewart2010}, source_type = {inproceedings}, private_publication = {false}, abstract = {Cyberinfrastructure is a word commonly used but lacking a single, precise definition. One recognizes intuitively the analogy with infrastructure, and the use of cyber to refer to thinking or computing - but what exactly is cyberinfrastructure as opposed to information technology infrastructure? Indiana University has developed one of the more widely cited definitions of cyberinfrastructure: Cyberinfrastructure consists of computing systems, data storage systems, advanced instruments and data repositories, visualization environments, and people, all linked together by software and high performance networks to improve research productivity and enable breakthroughs not otherwise possible. A second definition, more inclusive of scholarship generally and educational activities, has also been published and is useful in describing cyberinfrastructure: Cyberinfrastructure consists of computational systems, data and information management, advanced instruments, visualization environments, and people, all linked together by software and advanced networks to improve scholarly productivity and enable knowledge breakthroughs and discoveries not otherwise possible. 
In this paper, we describe the origin of the term cyberinfrastructure based on the history of the root word infrastructure, discuss several terms related to cyberinfrastructure, and provide several examples of cyberinfrastructure. © 2010 ACM.}, bibtype = {inproceedings}, author = {Stewart, Craig A and Simms, Stephen and Plale, Beth and Link, Matthew and Hancock, David Y. and Fox, Geoffrey C.}, doi = {10.1145/1878335.1878347}, booktitle = {Proceedings of the 38th annual fall conference on SIGUCCS - SIGUCCS '10} }
@inproceedings{ title = {WORKEM: Representing and emulating distributed scientific workflow execution state}, type = {inproceedings}, year = {2010}, id = {82128d7a-330c-3c1e-a386-549ce7fdefa3}, created = {2018-03-05T18:20:23.379Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:48.680Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Ramakrishnan2010}, private_publication = {false}, abstract = {Scientific workflows have become an integral part of cyberinfrastructure as their computational complexity and data sizes have grown. However, the complexity of the distributed infrastructure makes design of new workflows, determining the right management policies, debugging, testing or reproduction of errors challenging. Today, workflow engines manage the dependencies between tasks of workflows and there are tools available to wrap scientific codes. There is a need for a customizable, isolated and manageable testing container for design, evaluation and deployment of distributed workflows. To build such an environment, we need to be able to model and represent, capture and possibly reuse the execution flows within each task of a workflow that accurately captures the execution behavior. In this paper, we present the design and implementation of WORKEM, an extensible framework that can be used to represent and emulate workflow execution state. We also detail the use of the framework in two specific case studies (a) design and testing of an orchestration system (b) generation of a provenance database. Our evaluation shows that the framework has minimal overheads and can be scaled to run hundreds of workflows in short durations of time and with a high amount of parallelism. © 2010 IEEE.}, bibtype = {inproceedings}, author = {Ramakrishnan, L. and Gannon, D. 
and Plale, B.}, doi = {10.1109/CCGRID.2010.89}, booktitle = {CCGrid 2010 - 10th IEEE/ACM International Conference on Cluster, Cloud, and Grid Computing} }
@article{ title = {Science on the TeraGrid}, type = {article}, year = {2010}, pages = {81-97}, id = {f88ade3d-4f15-38d3-9688-d29d87344dc3}, created = {2018-03-05T18:20:24.323Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:44.488Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Katz2010}, source_type = {JOUR}, private_publication = {false}, bibtype = {article}, author = {Katz, Daniel S and Callaghan, Scott and Harkness, Robert and Jha, Shantenu and Kurowski, Krzysztof and Manos, Steven and Pamidighantam, Sudhakar and Pierce, Marlon and Plale, Beth and Song, Carol}, journal = {Computational Methods in Science and Technology, Special}, number = {2010} }
@inproceedings{ title = {The LEAD gateway II: a hardened, persistent community resource for meteorological research and education}, type = {inproceedings}, year = {2010}, id = {1fc82a38-3b13-3ab3-ab6a-d8576beb2a66}, created = {2018-03-05T18:20:25.218Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:47.858Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2010}, source_type = {CONF}, private_publication = {false}, bibtype = {inproceedings}, author = {Plale, Beth and Droegemeier, K K and Mattocks, C}, booktitle = {26th conference on interactive information and processing systems} }
@article{ title = {Karma2: Provenance management for data-driven workflows}, type = {article}, year = {2010}, volume = {317}, publisher = {IGI Global}, id = {56e34419-4bd3-312d-bd90-0bc1b143e62d}, created = {2018-03-05T18:20:26.248Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:50.249Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Simmhan2010}, source_type = {JOUR}, private_publication = {false}, bibtype = {article}, author = {Simmhan, Yogesh L and Plale, Beth and Gannon, Dennis}, journal = {Web Services Research for Emerging Applications: Discoveries and Trends: Discoveries and Trends} }
@inproceedings{ title = {Trading consistency for scalability in scientific metadata}, type = {inproceedings}, year = {2010}, id = {215c3921-75ee-3221-83b7-3a03c14ee8e7}, created = {2018-03-05T18:20:26.983Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:49.381Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Jensen2010}, private_publication = {false}, abstract = {Long-term repositories that are able to represent the detailed descriptive metadata of scientific data have been recognized as key to both data reuse and preservation of the initial investment in generating the data. Detailed metadata captured during scientific investigation not only enables the efficient discovery of relevant data sets but also is a source for exploring ongoing activity. In XMC Cat metadata catalog, an XML catalog that uses a novel hybrid model to store XML to a relational database, we exploit differences in the temporal utility between browse and search metadata to selectively relax the consistency model used. By ensuring only eventual consistency on parts of the solution, we determine through experimental analysis that the performance and scalability of the catalog can be substantially improved. © 2010 IEEE.}, bibtype = {inproceedings}, author = {Jensen, S. and Plale, B.}, doi = {10.1109/eScience.2010.28}, booktitle = {Proceedings - 2010 6th IEEE International Conference on e-Science, eScience 2010} }
@inproceedings{ title = {A multi-dimensional classification model for scientific workflow characteristics}, type = {inproceedings}, year = {2010}, id = {bba58d11-bbd6-32e5-9926-a8b7f9482ccb}, created = {2018-03-05T18:20:27.296Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:50.319Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Ramakrishnan2010a}, private_publication = {false}, abstract = {Workflows have been used to model repeatable tasks or operations in manufacturing, business process, and software. In recent years, workflows are increasingly used for orchestration of science discovery tasks that use distributed resources and web services environments through resource models such as grid and cloud computing. Workflows have disparate requirements and constraints that affect how they might be managed in distributed environments. In this paper, we present a multi-dimensional classification model illustrated by workflow examples obtained through a survey of scientists from different domains including bioinformatics and biomedical, weather and ocean modeling, astronomy detailing their data and computational requirements. The survey results and classification model contribute to the high level understanding of scientific workflows. © 2010 ACM.}, bibtype = {inproceedings}, author = {Ramakrishnan, L. and Plale, B.}, doi = {10.1145/1833398.1833402}, booktitle = {Proceedings of the ACM SIGMOD International Conference on Management of Data} }
@inproceedings{ title = {Towards proxy workflow execution in environmental research: Application to vortex2}, type = {inproceedings}, year = {2010}, id = {050e78c3-d7a0-3ee5-bc11-c2b067028863}, created = {2018-03-05T18:20:28.112Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:45.944Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2010a}, source_type = {CONF}, private_publication = {false}, bibtype = {inproceedings}, author = {Plale, Beth and Herath, Chathura and Withana, Eran Chinthaka}, booktitle = {Environmental Research Workshop} }
@inproceedings{ title = {Versioning for workflow evolution}, type = {inproceedings}, year = {2010}, id = {d5b0aa50-1a4a-3fc6-b8c7-81496598d096}, created = {2018-03-05T18:20:28.385Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:50.088Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Withana2010a}, private_publication = {false}, abstract = {Scientists working in eScience environments often use workflows to carry out their computations. Since the workflows evolve as the research itself evolves, these workflows can be a tool for tracking the evolution of the research. Scientists can trace their research and associated results through time or even go back in time to a previous stage and fork to a new branch of research. In this paper we introduce the workflow evolution framework (EVF), which is demonstrated through implementation in the Trident workflow workbench. The primary contribution of the EVF is efficient management of knowledge associated with workflow evolution. Since we believe evolution can be used for workflow attribution, our framework will motivate researchers to share their workflows and get the credit for their contributions. Copyright 2010 ACM.}, bibtype = {inproceedings}, author = {Withana, E.C. and Plale, B. and Barga, R. and Araujo, N.}, doi = {10.1145/1851476.1851586}, booktitle = {HPDC 2010 - Proceedings of the 19th ACM International Symposium on High Performance Distributed Computing} }
@inproceedings{ title = {Streamflow - Programming model for data streaming in scientific workflows}, type = {inproceedings}, year = {2010}, id = {fd137309-7281-36eb-9cd2-93be0b03d51b}, created = {2018-03-05T18:20:28.964Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:49.402Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Herath2010}, private_publication = {false}, abstract = {Geo-sciences involve large-scale parallel models, high resolution real time data from highly asynchronous and heterogeneous sensor networks and instruments, and complex analysis and visualization tools. Scientific workflows are an accepted approach to executing sequences of tasks on scientists' behalf during scientific investigation. Many geo-science workflows have the need to interact with sensors that produce large continuous streams of data, but programming models provided by scientific workflows are not equipped to handle continuous data streams. This paper proposes a framework that utilizes scientific workflow infrastructure and the benefits of complex event processing to compensate for the impedance mismatch between scientific workflows and continuous data streams. Further we propose and formalize new workflow semantics that would allow the users to not only incorporate stream in scientific workflow, but also make use of the functionalities provided by the complex event processing systems effective within the scientific workflows. © 2010 IEEE.}, bibtype = {inproceedings}, author = {Herath, C. and Plale, B.}, doi = {10.1109/CCGRID.2010.116}, booktitle = {CCGrid 2010 - 10th IEEE/ACM International Conference on Cluster, Cloud, and Grid Computing} }
@book{ title = {Principles and experiences: Designing and building enterprise information systems}, type = {book}, year = {2009}, source = {Always-On Enterprise Information Systems for Business Continuance: Technologies for Reliable and Scalable Operations}, pages = {58-77}, websites = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-84900261468&doi=10.4018%2F978-1-60566-723-2.ch004&partnerID=40&md5=11b1f906c72ee836d09047794a75919c}, publisher = {IGI Global}, id = {d1a8a7c7-c67b-32c0-ac81-52b2bd329002}, created = {2018-03-05T18:20:21.464Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:51.433Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Aktas200958}, source_type = {book}, notes = {cited By 0}, private_publication = {false}, abstract = {The data requirements of e-business applications have been increased over the years. These applications present an environment for acquiring, processing, and sharing data among interested parties. To manage information in such data-intensive application domain, independent enterprise e-business applications have developed their own solutions to information services. However, these solutions are not interoperable with each other, target vastly different systems, and address diverse sets of requirements. They require greater interoperability to enable communication between different systems, so that they can share and utilize each other's resources. To address these challenges, we discuss principles and experiences for designing and building of a novel enterprise information system. We introduce a novel architecture for a hybrid information service, which provides unification, federation, and interoperability of major Web-based information services. 
The hybrid information service is designed as an add-on information system, which interacts with the local information services and assembles their metadata instances under one hybrid architecture. It integrates different information services using unification and federation concepts. In this chapter, we summarize the principles and experiences gained in designing and building the semantics, architecture, and implementation for the hybrid information service. © 2010, IGI Global.}, bibtype = {book}, author = {Aktas, M S}, doi = {10.4018/978-1-60566-723-2.ch004} }
@inproceedings{ title = {Application of management frameworks to manage workflow-based systems: A case study on a large scale e-science project}, type = {inproceedings}, year = {2009}, pages = {519-526}, publisher = {IEEE}, id = {9a5a3360-4fe1-3283-8a24-447dc310cdfa}, created = {2018-03-05T18:20:22.087Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:51.258Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Perera2009}, source_type = {CONF}, private_publication = {false}, abstract = {Management architectures are well discussed in the literature, but their application in real life settings has not been as well covered. Automatic management of a system involves many more complexities than closing the control loop by reacting to sensor data and executing corrective actions. In this paper, we discuss those complexities and propose solutions to those problems on top of Hasthi management framework, where Hasthi is a robust, scalable, and distributed management framework that enables users to manage a system by enforcing management logic authored by users themselves. Furthermore, we present in detail a real life case study, which uses Hasthi to manage a large, SOA based, E-Science Cyberinfrastructure.}, bibtype = {inproceedings}, author = {Perera, Srinath and Marru, Suresh and Gunarathne, Thilina and Gannon, Dennis and Plale, Beth}, doi = {10.1109/ICWS.2009.52}, booktitle = {Web Services, 2009. ICWS 2009. IEEE International Conference on} }
@book{ title = {Karma2: Provenance management for data-driven workflows}, type = {book}, year = {2009}, source = {Quantitative Quality of Service for Grid Computing: Applications for Heterogeneity, Large-Scale Distribution, and Dynamic Environments}, id = {1b6810d8-4cb1-323c-91c5-9ed9af37e1af}, created = {2018-03-05T18:20:22.254Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:51.057Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Simmhan2009}, private_publication = {false}, abstract = {The increasing ability for the sciences to sense the world around us is resulting in a growing need for data-driven e-Science applications that are under the control of workflows composed of services on the Grid. The focus of our work is on provenance collection for these workflows that are necessary to validate the workflow and to determine quality of generated data products. The challenge we address is to record uniform and usable provenance metadata that meets the domain needs while minimizing the modification burden on the service authors and the performance overhead on the workflow engine and the services. The framework is based on generating discrete provenance activities during the lifecycle of a workflow execution that can be aggregated to form complex data and process provenance graphs that can span across workflows. The implementation uses a loosely coupled publish-subscribe architecture for propagating these activities, and the capabilities of the system satisfy the needs of detailed provenance collection. A performance evaluation of a prototype finds a minimal performance overhead (in the range of 1% for an eight-service workflow using 271 data products). © 2009, IGI Global.}, bibtype = {book}, author = {Simmhan, Y.L. and Plale, B. and Gannon, D.}, doi = {10.4018/978-1-60566-370-8.ch020} }
@inproceedings{ title = {Provenance information model of Karma version 3}, type = {inproceedings}, year = {2009}, issue = {PART 1}, id = {c4bc766e-8f1a-3e7a-bf3a-539f11f36339}, created = {2018-03-05T18:20:23.779Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:50.752Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Cao2009}, private_publication = {false}, abstract = {Provenance that captures e-Science activity has long term value only if the right amount and kind of information is collected. In this paper, we propose a two-layer model for representing provenance information capable of representing both execution information and higher level process details. The information model forms the basis for efficient relational database storage and query, and sets the stage for investigation of the necessary and sufficient information for long-term preservation. © 2009 IEEE.}, bibtype = {inproceedings}, author = {Cao, B. and Plale, B. and Subramanian, G. and Robertson, E. and Simmhan, Y.}, doi = {10.1109/SERVICES-I.2009.54}, booktitle = {SERVICES 2009 - 5th 2009 World Congress on Services} }
@inproceedings{ title = {Towards cyberinfrastructure for multi-scale crop disease early warning systems}, type = {inproceedings}, year = {2009}, id = {b77ef723-0b58-3658-8e8e-0b6a2a825981}, created = {2018-03-05T18:20:23.986Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:52.343Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Baker2009}, private_publication = {false}, abstract = {Access to cyberinfrastructure is critical for regional forecasting of specific economically important crop diseases. We present our team's initial steps to create, implement, and validate a multi-scale, multi-regional crop disease forecasting system funded by the USDA. Rapid synoptic and mesoscale crop disease forecasting, especially for emergency decision-making, requires that the model workflow relies on integration of real-time data services from multiple sources, and is executed over a pool of high performance computing resources. Spatially explicit weather forecast models runs are initiated through the Linked Environments for Atmospheric Discovery (LEAD) portal. Hourly forecast variables are extracted from LEAD workflow outputs and are used as inputs to crop disease forecast models implemented as ArcGIS workflows. Workflows include standard models for potato late blight and leaf spot of peanut as well as newly developed models for Fusarium head blight of barley. Resulting crop specific forecasts will inform farm management strategies with the goals of increasing product quality, limiting expenditures, and reducing the amount of chemical released to the environment. Initial results from the 2008 growing season are highly accurate and support continued development of such systems.}, bibtype = {inproceedings}, author = {Baker, K.M. and Plale, B. and Zaslavsky, I. 
and Marru, S.}, booktitle = {ASABE - 7th World Congress on Computers in Agriculture and Natural Resources 2009, WCCA 2009} }
@inproceedings{ title = {Semantically annotated provenance in the Life Science Grid}, type = {inproceedings}, year = {2009}, volume = {526}, websites = {http://ceur-ws.org/Vol-526/paper_5.pdf}, id = {cc085a60-bb69-3873-bd7b-612c79832788}, created = {2018-03-05T18:20:24.190Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:51.720Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Cao2009a}, private_publication = {false}, abstract = {Selected semantic annotation on raw provenance data can help bridge the gap between low level provenance events (e.g., service invocations, data creation, message passing) and the high-level view that the user has of his/her investigation (e.g., data retrieval and analysis). In this initial investigation we added semantically annotated provenance to the Life Science Grid, a cyber-infrastructure framework supporting interactive data exploration and automated data analysis tools, through (i) automated data provenance collection and (ii) automated semantic enrichment of the collected provenance metadata. We use a paradigmatic life sciences use case of interactive data exploration to show that semantically annotated provenance can help users recognize the occurrence of specific patterns of investigation from an otherwise low-level sequence of elementary interaction events.}, bibtype = {inproceedings}, author = {Cao, B. and Plale, B. and Subramanian, G. and Missier, P. and Goble, C. and Simmhan, Y.}, booktitle = {SWPM'09 Proceedings of the First International Conference on Semantic Web in Provenance Management} }
@article{ title = {Algorithms and the Grid}, type = {article}, year = {2009}, pages = {115-124}, volume = {12}, publisher = {Springer Berlin/Heidelberg}, id = {3d9bddc5-5980-3763-8cae-9408bd7b3e40}, created = {2018-03-05T18:20:27.807Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:49.120Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {fox2009algorithms}, source_type = {article}, private_publication = {false}, bibtype = {article}, author = {Fox, Geoffrey C and Aktas, Mehmet S and Aydin, Galip and Gadgil, Harshawardhan and Pallickara, Shrideep and Pierce, Marlon E and Sayar, Ahmet}, journal = {Computing and visualization in science}, number = {3} }
@inproceedings{ title = {CBR based workflow composition assistant}, type = {inproceedings}, year = {2009}, issue = {PART 1}, id = {1657cd22-eb11-3039-92f0-7d5afc07e5be}, created = {2018-03-05T18:20:28.116Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:49.836Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Chinthaka2009}, private_publication = {false}, abstract = {Composing a scientific workflow from scratch may be time-consuming, even if the scientist is fully aware of the semantics, the inputs, and the outputs of the expected workflow. Reusing existing services and parts from already composed workflows can aid in reducing the total workflow composition time. However, matching the semantics and the inputs and outputs of these reusable components manually is not an easy task, especially when there are hundreds of such components available. Even if components are annotated with information on the semantics of their inputs and outputs, the complex nature of the semantic languages may make manual component selection even harder. In this paper, we propose a Case-Based Reasoning (CBR) approach to assist composition of workflows based on the characteristics of the inputs and the outputs of the reusable workflow components, facilitating user exploitation of existing services and workflows during workflow composition. The architecture can also be extended to utilize the semantics of the various components improving the precision of the identified reusable components. © 2009 IEEE.}, bibtype = {inproceedings}, author = {Chinthaka, E. and Ekanayake, J. and Leake, D. and Plale, B.}, doi = {10.1109/SERVICES-I.2009.51}, booktitle = {5th 2009 World Congress on Services, SERVICES 2009} }
@article{ title = {Exploiting maximal redundancy to optimize SQL queries}, type = {article}, year = {2009}, pages = {187-220}, volume = {20}, websites = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-68349131921&doi=10.1007%2Fs10115-008-0156-0&partnerID=40&md5=5f1f5b970c1c561f3ebf5eb0f56e4aac}, id = {f4bd2035-c069-34e6-925e-a7854cf13092}, created = {2018-03-05T18:20:28.576Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:46.360Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Cao2009187}, source_type = {article}, notes = {cited By 4}, private_publication = {false}, abstract = {Detecting and dealing with redundancy is a ubiquitous problem in query optimization, which manifests itself in many areas of research such as materialized views, multi-query optimization, and query-containment algorithms. In this paper, we focus on the issue of intra-query redundancy, redundancy present within a query. We present a method to detect the maximal redundancy present between a main (outer) query block and a subquery block. We then use the method for query optimization, introducing query plans and a new operator that take full advantage of the redundancy discovered. Our approach can deal with redundancy in a wider spectrum of queries than existing techniques. We show experimental evidence that our approach works under certain conditions, and compares favorably to existing optimization techniques when applicable. © Springer-Verlag London Limited 2008.}, bibtype = {article}, author = {Cao, B and Badia, A}, doi = {10.1007/s10115-008-0156-0}, journal = {Knowledge and Information Systems}, number = {2} }
@article{ title = {XML metadata services}, type = {article}, year = {2008}, pages = {801-823}, volume = {20}, publisher = {John Wiley & Sons, Ltd.}, id = {c42f7b1f-860f-3de8-bb75-bfc676c95ff3}, created = {2018-03-05T18:20:21.913Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:50.798Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {aktas2008xml}, source_type = {article}, private_publication = {false}, bibtype = {article}, author = {Aktas, Mehmet S and Fox, Geoffrey C and Pierce, Marlon and Oh, Sangyoon}, journal = {Concurrency and Computation: Practice and Experience}, number = {7} }
@inproceedings{ title = {Riding the geoscience cyberinfrastructure wave of data: Real time data use in education workshop}, type = {inproceedings}, year = {2008}, keywords = {Cyberinfrastructure,Data,Geology,Geoscience,High school}, pages = {456}, websites = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-62749083728&doi=10.1109%2FeScience.2008.68&partnerID=40&md5=baab21e6b29233360530ebe5d984ec00}, city = {Indianapolis, IN}, id = {56f64f0c-6200-3b60-9aa5-2671fb5ff48d}, created = {2018-03-05T18:20:24.736Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:51.905Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Plale2008456}, source_type = {conference}, notes = {cited By 0; Conference of 4th IEEE International Conference on eScience, eScience 2008 ; Conference Date: 7 December 2008 Through 12 December 2008; Conference Code:75550}, private_publication = {false}, abstract = {This workshop brings together scientists, technologists, and educators in a discussion of how data rich geoscience cyberinfrastructure frameworks can be more effectively deployed in high school and early undergraduate settings. © 2008 IEEE.}, bibtype = {inproceedings}, author = {Plale, B and Cao, B}, doi = {10.1109/eScience.2008.68}, booktitle = {4th IEEE International Conference on eScience, eScience 2008} }
@inbook{ type = {inbook}, year = {2008}, pages = {635-651}, publisher = {Birkhäuser Basel}, id = {3d719cf4-44b9-364c-85bd-1ac417be64a2}, created = {2018-03-05T18:20:24.913Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:51.238Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {pierce2008quakesim}, source_type = {incollection}, private_publication = {false}, bibtype = {inbook}, author = {Pierce, Marlon E and Fox, Geoffrey C and Aktas, Mehmet S and Aydin, Galip and Gadgil, Harshawardhan and Qi, Zhigang and Sayar, Ahmet}, chapter = {The QuakeSim project: Web services for managing geophysical data and applications}, title = {Earthquakes: Simulations, Sources and Tsunamis} }
@article{ title = {An implementation of a query language with generalized quantifiers}, type = {article}, year = {2008}, keywords = {Complex queries,Electric resistance,Generalized quantifiers,Optimize,Query languages,Students,Teaching}, pages = {547-548}, volume = {5231 LNCS}, city = {Barcelona}, id = {98b57923-8e2a-3dee-bdc2-17b0bb28b70d}, created = {2018-03-05T18:20:25.561Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:48.951Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Badia2008547}, source_type = {article}, notes = {cited By 1; Conference of 27th International Conference on Conceptual Modeling, ER 2008 ; Conference Date: 20 October 2008 Through 24 October 2008; Conference Code:74354}, private_publication = {false}, abstract = {It is well known that SQL's syntax sometimes forces users to write queries in an awkward way. Together with the danger of formulating an incorrect query, complex queries pose a challenge to the optimizer. A well studied example is that of universal quantification [1,2]. As an example, assume two relations: student(sid) and teaches(pid,sid), which denotes that professor pid is a teacher of student sid. Consider the question "find the professors teaching all students." Since SQL does not directly support the quantifier all, most textbooks express this question using two subqueries, NOT EXISTS and NOT IN. © 2008 Springer Berlin Heidelberg.}, bibtype = {article}, author = {Badia, A and Debes, B and Cao, B}, doi = {10.1007/978-3-540-87877-3-54}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@inproceedings{ title = {Provenance collection in an industry biochemical discovery cyberinfrastructure}, type = {inproceedings}, year = {2008}, keywords = {Data provenance,Data visualization,Information theory,Karma,Life Science Grid,S-OGSA,Semiconductor quantum dots}, pages = {424-425}, websites = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-62749124190&doi=10.1109%2FeScience.2008.104&partnerID=40&md5=21152098768e5dd377a8316f09223a9f}, city = {Indianapolis, IN}, id = {d046e0b6-dad9-34b9-80fa-17cefda9a01d}, created = {2018-03-05T18:20:27.146Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:49.651Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Cao2008424}, source_type = {conference}, notes = {cited By 1; Conference of 4th IEEE International Conference on eScience, eScience 2008 ; Conference Date: 7 December 2008 Through 12 December 2008; Conference Code:75550}, private_publication = {false}, abstract = {Workflows are an accepted approach for constructing computational scientific experiments. Provenance capture during workflow execution captures the creation history of datasets. This record is essential to the long-term preservation and reuse of the data, and to making determinations of its quality. We are applying provenance collection to the open source Life Science Grid (LSG) using the Karma tool, and extending the information with semantic information using S-OGSA. The project raises interesting challenges in instrumentation, annotation, and visualization of provenance data.}, bibtype = {inproceedings}, author = {Cao, B and Subramanian, G and Doddapaneni, S and Plale, B}, doi = {10.1109/eScience.2008.104}, booktitle = {Fourth IEEE International Conference on eScience, eScience 2008} }
@article{ title = {Building and applying geographical information system Grids}, type = {article}, year = {2008}, pages = {1653-1695}, volume = {20}, publisher = {Wiley Online Library}, id = {1d22b1fc-2b52-3d1a-b23b-b8d4b4cb490f}, created = {2018-03-05T18:20:28.736Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:44.994Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {aydin2008building}, source_type = {article}, private_publication = {false}, bibtype = {article}, author = {Aktas, Mehmet S and Aydin, Galip and Bulut, Hasan and Fox, Geoffrey C and Gadgil, Harshawardhan and Ko, Sunghoon and Pierce, Marlon E and Sayar, Ahmet and Gadgil, Harshawardhan and Aktas, Mehmet S and Fox, Geoffrey C and Ko, Sunghoon and Bulut, Hasan and Pierce, Marlon E and Aydin, Galip and Bulut, Hasan and Fox, Geoffrey C and Gadgil, Harshawardhan and Ko, Sunghoon and Pierce, Marlon E and Sayar, Ahmet}, journal = {Concurrency and Computation: Practice and Experience}, number = {14} }
@article{ title = {VLab: collaborative grid services and portals to support computational material science}, type = {article}, year = {2007}, pages = {1717-1728}, volume = {19}, websites = {http://dsc.soic.indiana.edu/publications/VLAB-GCE2005-Final.pdf}, publisher = {John Wiley & Sons, Ltd.}, id = {0857d39d-fadb-3e17-a939-9bc3cd8e8aba}, created = {2018-03-05T18:20:23.788Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:47.645Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {nacar2007vlab}, source_type = {article}, private_publication = {false}, bibtype = {article}, author = {Nacar, Mehmet A and Aktas, Mehmet S and Pierce, Marlon and Lu, Zhenyu and Erlebacher, Gordon and Kigelman, Dan and Bollig, Evan F and da Silva, Cesar R S and Sowell, Benny and Yuen, David A}, doi = {10.1002/cpe.1199}, journal = {Concurrency and Computation: Practice and Experience}, number = {12} }
@article{ title = {Fault tolerant high performance Information Services for dynamic collections of Grid and Web services}, type = {article}, year = {2007}, pages = {317-337}, volume = {23}, websites = {https://www.sciencedirect.com/science/article/pii/S0167739X0600121X}, publisher = {Elsevier}, id = {00f29612-5e05-3d85-a412-e14f73bcd678}, created = {2018-03-05T18:20:29.109Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:48.237Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {aktas2007fault}, source_type = {article}, private_publication = {false}, bibtype = {article}, author = {Aktas, Mehmet S and Fox, Geoffrey C and Pierce, Marlon}, doi = {10.1016/j.future.2006.05.009}, journal = {Future Generation Computer Systems}, number = {3} }
@inproceedings{ title = {XML Metadata Services}, type = {inproceedings}, year = {2006}, keywords = {Applied (CO),Architectural design,Capability,Code execution,Computer networks,Informat,Metadata,WS-Context,XML metadata,and integration (UDDI),and integrations,discovery,international confer}, websites = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-44949168873&doi=10.1109%2FSKG.2006.113&partnerID=40&md5=edaa0fc3280daf19e64904cd812fa679}, city = {Guilin Guangxi}, id = {e2721318-128f-3af2-a61c-672fee77a030}, created = {2018-03-05T18:20:22.677Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:46.595Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Aktas2006}, source_type = {conference}, notes = {cited By 2; Conference of 2006 2nd International Conference on Semantics Knowledge and Grid, SKG ; Conference Date: 1 November 2006 Through 3 November 2006; Conference Code:72224}, private_publication = {false}, abstract = {As the Service Oriented Architecture (SOA) principles have gained importance, an emerging need has appeared for methodologies to locate desired services that provide access to their capability descriptions. These services must typically be assembled into short-term service collections that, together with code execution services, are combined into a meta-application to perform a particular task. To address metadata requirements of these problems, we introduce XML Metadata Services to manage both stateless and stateful (transient) metadata. We leverage the two widely used web service standards: Universal Description, Discovery, and Integration (UDDI) and Web Services Context (WS-Context) in our design. We describe our approach and experiences when designing "semantics" for XML Metadata Services. 
We report results from a prototype of the system that is applied to mobile environment for optimizing Web Service communications.}, bibtype = {inproceedings}, author = {Aktas, M S and Oh, S and Fox, G C and Pierce, M E}, doi = {10.1109/SKG.2006.113}, booktitle = {Second International Conference on Semantics, Knowledge and Grid, SKG 2006} }
@inbook{ type = {inbook}, year = {2006}, pages = {2281-2296}, websites = {https://link.springer.com/article/10.1007/s00024-006-0137-8}, publisher = {Birkhäuser Verlag, Basel}, id = {1305468a-0889-3de4-b623-adcccaded20b}, created = {2018-03-05T18:20:24.705Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:48.481Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {aktas2006iservo}, source_type = {incollection}, private_publication = {false}, abstract = {We describe the goals and initial implementation of the International Solid Earth Virtual Observatory (iSERVO). This system is built using a Web Services approach to Grid computing infrastructure and is accessed via a component-based Web portal user interface. We describe our implementations of services used by this system, including Geographical Information System (GIS)-based data grid services for accessing remote data repositories and job management services for controlling multiple execution steps. iSERVO is an example of a larger trend to build globally scalable scientific computing infrastructures using the Service Oriented Architecture approach. Adoption of this approach raises a number of research challenges in millisecond-latency message systems suitable for internet-enabled scientific applications. 
We review our research in these areas.}, bibtype = {inbook}, author = {Aktas, Mehmet and Aydin, Galip and Donnellan, Andrea and Fox, Geoffrey and Granat, Robert and Grant, Lisa and Lyzenga, Greg and McLeod, Dennis and Pallickara, Shrideep and Parker, Jay and Pierce, Marlon and Rundle, John and Sayar, Ahmet and Tullis, Terry}, chapter = {iSERVO: Implementing the international solid earth research virtual observatory by integrating computational grid and geographical information web services}, title = {Computational Earthquake Physics: Simulations, Analysis and Infrastructure, Part II} }
@inproceedings{ title = {Information services for dynamically assembled semantic grids}, type = {inproceedings}, year = {2005}, keywords = {Codes (standards),Data reduction,Grid computing,Information se,Semantic grids,Semantic information,Static services}, websites = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-42749107932&doi=10.1109%2FSKG.2005.83&partnerID=40&md5=8cf61b579af398da95919fa9736016a0,https://pdfs.semanticscholar.org/144b/df1cd302ecb2e865d022aef6bf0d76ed5f10.pdf}, city = {Beijing, China}, id = {6a4ab089-8991-3e31-9a6d-3523d74165de}, created = {2018-03-05T18:20:25.846Z}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2018-04-23T17:10:45.758Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Aktas2005}, source_type = {conference}, notes = {cited By 3; Conference of 1st International Conference on Semantics, Knowledge and Grid, SKG 2005 ; Conference Date: 27 November 2005 Through 29 November 2005; Conference Code:69776}, private_publication = {false}, abstract = {Many large semantic systems can be described as Semantic Grids of Semantic Grids with large amounts of relatively static services and associated semantic information combined with multiple dynamic regions (sessions or subgrids) where the semantic information is changing rapidly. We design a hybrid Information Service supporting both the scalability of large amounts of relatively slowly varying data and a high performance rapidly updated Information Service for dynamic regions. We use the two web service standards UDDI and WS-Context in our system. © 2006 IEEE.}, bibtype = {inproceedings}, author = {Aktas, Mehmet S and Fox, Geoffrey C. and Pierce, Marlon E}, doi = {10.1109/SKG.2005.83}, booktitle = {First International Conference on Semantics, Knowledge and Grid, SKG 2005} }
@techreport{ title = {Datasets Published by the IU Pervasive Technology Institute}, type = {techreport}, year = {1999}, keywords = {Technical Report}, websites = {http://creativecommons.org/licenses/by/4.0/.}, month = {8}, day = {26}, id = {ea53ae47-080a-381d-baba-cd85f4c40b09}, created = {2020-09-15T22:43:58.791Z}, accessed = {2020-09-11}, file_attached = {false}, profile_id = {42d295c0-0737-38d6-8b43-508cab6ea85d}, group_id = {9d761a94-2f2d-31ce-a8c3-50aa6d668643}, last_modified = {2020-09-15T22:43:58.791Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {This report considers only data sets and binary digital products stored in IU Scholarworks (scholarworks.iu.edu). Software stored in other repositories (such as sourceforge.net or github.com) are not included in this listing. There are a total of 177 data sets listed in this report (see Section 2). There are eight additional binary images published by the IU Pervasive Technology Institute via Scholarworks.iu.edu between 1999 and 2019 (see Section 3). All of these latter eight are binaries of Virtual Machine images used on the Jetstream cloud system (Jetstream-cloud.org)}, bibtype = {techreport}, author = {Stewart, Craig A and Plale, Beth and Fischer, Jeremy} }