@inproceedings{missier2024provenance, author = {Missier, Paolo and Torlone, Riccardo}, title = {From why-provenance to why+provenance: Towards addressing deep data explanations in {Data-Centric} {AI}}, booktitle = {Proceedings of the 32nd {Symposium} on {Advanced} {Database} {Systems}}, year = 2024, editor = {Atzori, Maurizio and Ciaccia, Paolo and Ceci, Michelangelo and Mandreoli, Federica}, volume = 3741, pages = {508-517}, month = jun, address = {Villasimius, Sardinia, Italy}, publisher = {CEUR Workshop Proceedings}, url = {https://ceur-ws.org/Vol-3741/paper11.pdf}, }
@inproceedings{gregori_design_2024, address = {Utrecht, NL}, title = {Design and {Development} of a {Provenance} {Capture} {Platform} for {Data} {Science}}, abstract = {As machine learning and AI systems become more prevalent, understanding how their decisions are made is key to maintaining their trust. To solve this problem, it is widely accepted that fundamental support can be provided by the knowledge of how data are altered in the pre-processing phase, using data provenance to track such changes. This paper focuses on the design and development of a system for collecting and managing data provenance of data preparation pipelines in data science. An investigation of publicly available machine learning pipelines is conducted to identify the most important features required for the tool to achieve impact on a broad selection of pre-processing data manipulation. This reveals that the operations that are used in practice can be implemented by combining a rather limited set of basic operators. We then illustrate and test implementation choices aimed at supporting the provenance capture for those operations efficiently and with minimal effort for data scientists.}, booktitle = {Procs. 3rd {DATAPLAT} workshop, co-located with {ICDE} 2024}, publisher = {IEEE}, author = {Gregori, Luca and Missier, Paolo and Stidolph, Matthew and Torlone, Riccardo and Wood, Alessandro}, month = may, year = {2024}, url={https://www.dropbox.com/scl/fi/plz8egd5wdvb5bp5vra09/840300a285.pdf?rlkey=gitqo6jzveh915g9fhbsqpqyn&st=8pk9vluh&dl=0} }
@article{schintke_validity_2024, title = {Validity constraints for data analysis workflows}, volume = {157}, issn = {0167-739X}, url = {https://www.sciencedirect.com/science/article/pii/S0167739X24001079}, doi = {https://doi.org/10.1016/j.future.2024.03.037}, abstract = {Porting a scientific data analysis workflow (DAW) to a cluster infrastructure, a new software stack, or even only a new dataset with some notably different properties is often challenging. Despite the structured definition of the steps (tasks) and their interdependencies during a complex data analysis in the DAW specification, relevant assumptions may remain unspecified and implicit. Such hidden assumptions often lead to crashing tasks without a reasonable error message, poor performance in general, non-terminating executions, or silent wrong results of the DAW, to name only a few possible consequences. Searching for the causes of such errors and drawbacks in a distributed compute cluster managed by a complex infrastructure stack, where DAWs for large datasets typically are executed, can be tedious and time-consuming. We propose validity constraints (VCs) as a new concept for DAW languages to alleviate this situation. A VC is a constraint specifying logical conditions that must be fulfilled at certain times for DAW executions to be valid. When defined together with a DAW, VCs help to improve the portability, adaptability, and reusability of DAWs by making implicit assumptions explicit. Once specified, VCs can be controlled automatically by the DAW infrastructure, and violations can lead to meaningful error messages and graceful behavior (e.g., termination or invocation of repair mechanisms). We provide a broad list of possible VCs, classify them along multiple dimensions, and compare them to similar concepts one can find in related fields. We also provide a proof-of-concept implementation for the workflow system Nextflow.}, journal = {Future Generation Computer Systems}, author = {Schintke, Florian and Belhajjame, Khalid and Mecquenem, Ninon De and Frantz, David and Guarino, Vanessa Emanuela and Hilbrich, Marcus and Lehmann, Fabian and Missier, Paolo and Sattler, Rebecca and Sparka, Jan Arne and Speckhard, Daniel T. and Stolte, Hermann and Vu, Anh Duc and Leser, Ulf}, year = {2024}, keywords = {Dependability, Integrity and conformance checking, Scientific workflow systems, Validity constraints, Workflow specification languages}, pages = {82--97}, }
@article {Lewise080678, author = {Jadene Lewis and Felicity Evison and Rominique Doal and Joanne Field and Suzy Gallier and Steve Harris and Peta le Roux and Mohammed Osman and Chris Plummer and Elizabeth Sapey and Mervyn Singer and Avan A Sayer and Miles D Witham}, editor = {Sayer, Avan A and Bartle, Victoria and Cooper, Rachel and Cordell, Heather J and Holding, Ray and Marshall, Tom and Matthews, Fiona E and Missier, Paolo and Pearson, Ewan and Plummer, Chris and Robinson, Sian and Sapey, Elizabeth and Singer, Mervyn and Scharf, Thomas and Wason, James and Witham, Miles D}, title = {How far back do we need to look to capture diagnoses in electronic health records? A retrospective observational study of hospital electronic health record data}, volume = {14}, number = {2}, elocation-id = {e080678}, year = {2024}, doi = {10.1136/bmjopen-2023-080678}, publisher = {British Medical Journal Publishing Group}, abstract = {Objectives Analysis of routinely collected electronic health data is a key tool for long-term condition research and practice for hospitalised patients. This requires accurate and complete ascertainment of a broad range of diagnoses, something not always recorded on an admission document at a single point in time. This study aimed to ascertain how far back in time electronic hospital records need to be interrogated to capture long-term condition diagnoses. Design Retrospective observational study of routinely collected hospital electronic health record data. Setting Queen Elizabeth Hospital Birmingham (UK)-linked data held by the PIONEER acute care data hub. Participants Patients whose first recorded admission for chronic obstructive pulmonary disease (COPD) exacerbation (n=560) or acute stroke (n=2142) was between January and December 2018 and who had a minimum of 10 years of data prior to the index date. Outcome measures We identified the most common International Classification of Diseases version 10-coded diagnoses received by patients with COPD and acute stroke separately. For each diagnosis, we derived the number of patients with the diagnosis recorded at least once over the full 10-year lookback period, and then compared this with shorter lookback periods from 1 year to 9 years prior to the index admission. Results Seven of the top 10 most common diagnoses in the COPD dataset reached \>90\% completeness by 6 years of lookback. Atrial fibrillation and diabetes were \>90\% coded with 2{\textendash}3 years of lookback, but hypertension and asthma completeness continued to rise all the way out to 10 years of lookback. For stroke, 4 of the top 10 reached 90\% completeness by 5 years of lookback; angina pectoris was \>90\% coded at 7 years and previous transient ischaemic attack completeness continued to rise out to 10 years of lookback. Conclusion A 7-year lookback captures most, but not all, common diagnoses. Lookback duration should be tailored to the conditions being studied. Data may be obtained from a third party and are not publicly available. The data that support the findings of this study are not openly available due to reasons of sensitivity. Data may be accessed on request to the HDR-UK PIONEER acute data hub on provision of permission from the PIONEER Data Trust Committee and provision of a data access agreement. Data are located in controlled access data storage at the PIONEER acute data hub.}, issn = {2044-6055}, URL = {https://bmjopen.bmj.com/content/14/2/e080678}, eprint = {https://bmjopen.bmj.com/content/14/2/e080678.full.pdf}, journal = {BMJ Open} }
@article{mcteer_machine_2024, title = {Machine learning approaches to enhance diagnosis and staging of patients with {MASLD} using routinely available clinical information}, volume = {19}, url = {https://doi.org/10.1371/journal.pone.0299487}, doi = {10.1371/journal.pone.0299487}, abstract = {Aims Metabolic dysfunction Associated Steatotic Liver Disease (MASLD) outcomes such as MASH (metabolic dysfunction associated steatohepatitis), fibrosis and cirrhosis are ordinarily determined by resource-intensive and invasive biopsies. We aim to show that routine clinical tests offer sufficient information to predict these endpoints. Methods Using the LITMUS Metacohort derived from the European NAFLD Registry, the largest MASLD dataset in Europe, we create three combinations of features which vary in degree of procurement including a 19-variable feature set that are attained through a routine clinical appointment or blood test. This data was used to train predictive models using supervised machine learning (ML) algorithm XGBoost, alongside missing imputation technique MICE and class balancing algorithm SMOTE. Shapley Additive exPlanations (SHAP) were added to determine relative importance for each clinical variable. Results Analysing nine biopsy-derived MASLD outcomes of cohort size ranging between 5385 and 6673 subjects, we were able to predict individuals at training set AUCs ranging from 0.719-0.994, including classifying individuals who are At-Risk MASH at an AUC = 0.899. Using two further feature combinations of 26-variables and 35-variables, which included composite scores known to be good indicators for MASLD endpoints and advanced specialist tests, we found predictive performance did not sufficiently improve. We are also able to present local and global explanations for each ML model, offering clinicians interpretability without the expense of worsening predictive performance. Conclusions This study developed a series of ML models of accuracy ranging from 71.9—99.4\% using only easily extractable and readily available information in predicting MASLD outcomes which are usually determined through highly invasive means.}, number = {2}, journal = {PLOS ONE}, author = {McTeer, Matthew and Applegate, Douglas and Mesenbrink, Peter and Ratziu, Vlad and Schattenberg, Jörn M. and Bugianesi, Elisabetta and Geier, Andreas and Romero Gomez, Manuel and Dufour, Jean-Francois and Ekstedt, Mattias and Francque, Sven and Yki-Jarvinen, Hannele and Allison, Michael and Valenti, Luca and Miele, Luca and Pavlides, Michael and Cobbold, Jeremy and Papatheodoridis, Georgios and Holleboom, Adriaan G. and Tiniakos, Dina and Brass, Clifford and Anstee, Quentin M. and Missier, Paolo and investigators, on behalf of the LITMUS Consortium}, month = feb, year = {2024}, note = {Publisher: Public Library of Science}, pages = {1--17}, }
@Article{math12050777, AUTHOR = {McTeer, Matthew and Henderson, Robin and Anstee, Quentin M. and Missier, Paolo}, TITLE = {Handling Overlapping Asymmetric Data Sets—A Twice Penalized P-Spline Approach}, JOURNAL = {Mathematics}, VOLUME = {12}, YEAR = {2024}, NUMBER = {5}, ARTICLE-NUMBER = {777}, URL = {https://www.mdpi.com/2227-7390/12/5/777}, ISSN = {2227-7390}, ABSTRACT = {Aims: Overlapping asymmetric data sets are where a large cohort of observations have a small amount of information recorded, and within this group there exists a smaller cohort which have extensive further information available. Missing imputation is unwise if cohort size differs substantially; therefore, we aim to develop a way of modelling the smaller cohort whilst considering the larger. Methods: Through considering traditionally once penalized P-Spline approximations, we create a second penalty term through observing discrepancies in the marginal value of covariates that exist in both cohorts. Our now twice penalized P-Spline is designed to firstly prevent over/under-fitting of the smaller cohort and secondly to consider the larger cohort. Results: Through a series of data simulations, penalty parameter tunings, and model adaptations, our twice penalized model offers up to a 58% and 46% improvement in model fit upon a continuous and binary response, respectively, against existing B-Spline and once penalized P-Spline methods. Applying our model to an individual’s risk of developing steatohepatitis, we report an over 65% improvement over existing methods. Conclusions: We propose a twice penalized P-Spline method which can vastly improve the model fit of overlapping asymmetric data sets upon a common predictive endpoint, without the need for missing data imputation.}, DOI = {10.3390/math12050777} }
@article{10.1145/3644385, author = {Chapman, Adriane and Lauro, Luca and Missier, Paolo and Torlone, Riccardo}, title = {Supporting Better Insights of Data Science Pipelines with Fine-grained Provenance}, year = {2024}, issue_date = {June 2024}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, volume = {49}, number = {2}, issn = {0362-5915}, url = {https://doi.org/10.1145/3644385}, doi = {10.1145/3644385}, abstract = {Successful data-driven science requires complex data engineering pipelines to clean, transform, and alter data in preparation for machine learning, and robust results can only be achieved when each step in the pipeline can be justified, and its effect on the data explained. In this framework, we aim at providing data scientists with facilities to gain an in-depth understanding of how each step in the pipeline affects the data, from the raw input to training sets ready to be used for learning. Starting from an extensible set of data preparation operators commonly used within a data science setting, in this work we present a provenance management infrastructure for generating, storing, and querying very granular accounts of data transformations, at the level of individual elements within datasets whenever possible. Then, from the formal definition of a core set of data science preprocessing operators, we derive a provenance semantics embodied by a collection of templates expressed in PROV, a standard model for data provenance. Using those templates as a reference, our provenance generation algorithm generalises to any operator with observable input/output pairs. We provide a prototype implementation of an application-level provenance capture library to produce, in a semi-automatic way, complete provenance documents that account for the entire pipeline. We report on the ability of that reference implementation to capture provenance in real ML benchmark pipelines and over TCP-DI synthetic data. We finally show how the collected provenance can be used to answer a suite of provenance benchmark queries that underpin some common pipeline inspection questions, as expressed on the Data Science Stack Exchange.}, journal = {ACM Trans. Database Syst.}, month = {apr}, articleno = {6}, numpages = {42}, keywords = {Provenance, data science, data preparation, preprocessing} }
@ARTICLE{Shao2023, author = {Shao, Shuai and Guan, Yu and Zhai, Bing and Missier, Paolo and Plötz, Thomas}, title = {ConvBoost: Boosting ConvNets for Sensor-based Activity Recognition}, year = {2023}, journal = {Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies}, volume = {7}, number = {2}, doi = {10.1145/3596234}, url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85162778689&doi=10.1145%2f3596234&partnerID=40&md5=8538b225ee8f88cc17b38cb86f86fdca}, type = {Article}, publication_stage = {Final}, source = {Scopus}, note = {Cited by: 1; All Open Access, Bronze Open Access, Green Open Access} }
@ARTICLE{Motta2023474, author = {Motta, Federico and Milic, Jovana and Gozzi, Licia and Belli, Michela and Sighinolfi, Laura and Cuomo, Gianluca and Carli, Federica and Dolci, Giovanni and Iadisernia, Vittorio and Burastero, Giulia and Mussini, Cristina and Missier, Paolo and Mandreoli, Federica and Guaraldi, Giovanni}, title = {A Machine Learning Approach to Predict Weight Change in ART-Experienced People Living with HIV}, year = {2023}, journal = {Journal of Acquired Immune Deficiency Syndromes}, volume = {94}, number = {5}, pages = {474 – 481}, doi = {10.1097/QAI.0000000000003302}, url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85176428031&doi=10.1097%2fQAI.0000000000003302&partnerID=40&md5=08c7bfd07ef1c7cf2321d7e75e16c155}, type = {Article}, publication_stage = {Final}, source = {Scopus}, note = {Cited by: 0} }
@inproceedings{calero-diaz_interpretable_2023, address = {Sorrento, Italy}, title = {Interpretable and robust hospital readmission predictions from {Electronic} {Health} {Records}}, abstract = {Rates of Hospital Readmission (HR), defined as unplanned readmission within 30 days of discharge, have been increasing over the years, and impose an economic burden on healthcare services worldwide. Despite recent research into predicting HR, few models provide sufficient discriminative ability. Three main drawbacks can be identified in the published literature: (i) imbalance in the target classes (readmitted or not), (ii) not including demographic and lifestyle predictors, and (iii) lack of interpretability of the models. In this work, we address these three points by evaluating class balancing techniques, performing a feature selection process including demographic and lifestyle features, and adding interpretability through a combination of SHapley Additive exPlanations (SHAP) and Accumulated Local Effects (ALE) post hoc methods. Our best classifier for this binary outcome achieves an AUC of 0.849 using a selection of 1296 features, extracted from patients’ Electronic Health Records (EHRs) and from their sociodemographic profiles. Using SHAP and ALE, we have established the importance of age, the number of long-term conditions, and the duration of the first admission as top predictors. In addition, we show through an ablation study that demographic and lifestyle features provide even better predictive capabilities than other features, suggesting their relevance toward HR.}, booktitle = {Procs. {IEEE} {BigData}}, publisher = {IEEE}, author = {Calero-Diaz, Hugo and Hamad, Rebeen and Atallah, Christian and Casement, John and Canoy, Dexter and Reynolds, Nick and Barnes, Michael and Missier, Paolo}, month = dec, year = {2023}, url={https://ieeexplore.ieee.org/document/10386820} }
@article{eto_ethnic_2023, title = {Ethnic differences in early onset multimorbidity and associations with health service use, long-term prescribing, years of life lost, and mortality: {A} cross-sectional study using clustering in the {UK} {Clinical} {Practice} {Research} {Datalink}}, volume = {20}, issn = {1549-1676}, shorttitle = {Ethnic differences in early onset multimorbidity and associations with health service use, long-term prescribing, years of life lost, and mortality}, url = {https://journals.plos.org/plosmedicine/article?id=10.1371/journal.pmed.1004300}, doi = {10.1371/journal.pmed.1004300}, abstract = {Background The population prevalence of multimorbidity (the existence of at least 2 or more long-term conditions [LTCs] in an individual) is increasing among young adults, particularly in minority ethnic groups and individuals living in socioeconomically deprived areas. In this study, we applied a data-driven approach to identify clusters of individuals who had an early onset multimorbidity in an ethnically and socioeconomically diverse population. We identified associations between clusters and a range of health outcomes. Methods and findings Using linked primary and secondary care data from the Clinical Practice Research Datalink GOLD (CPRD GOLD), we conducted a cross-sectional study of 837,869 individuals with early onset multimorbidity (aged between 16 and 39 years old when the second LTC was recorded) registered with an English general practice between 2010 and 2020. The study population included 777,906 people of White ethnicity (93\%), 33,915 people of South Asian ethnicity (4\%), and 26,048 people of Black African/Caribbean ethnicity (3\%). A total of 204 LTCs were considered. Latent class analysis stratified by ethnicity identified 4 clusters of multimorbidity in White groups and 3 clusters in South Asian and Black groups. We found that early onset multimorbidity was more common among South Asian (59\%, 33,915) and Black (56\% 26,048) groups compared to the White population (42\%, 777,906). Latent class analysis revealed physical and mental health conditions that were common across all ethnic groups (i.e., hypertension, depression, and painful conditions). However, each ethnic group also presented exclusive LTCs and different sociodemographic profiles: In White groups, the cluster with the highest rates/odds of the outcomes was predominantly male (54\%, 44,150) and more socioeconomically deprived than the cluster with the lowest rates/odds of the outcomes. On the other hand, South Asian and Black groups were more socioeconomically deprived than White groups, with a consistent deprivation gradient across all multimorbidity clusters. At the end of the study, 4\% (34,922) of the White early onset multimorbidity population had died compared to 2\% of the South Asian and Black early onset multimorbidity populations (535 and 570, respectively); however, the latter groups died younger and lost more years of life. The 3 ethnic groups each displayed a cluster of individuals with increased rates of primary care consultations, hospitalisations, long-term prescribing, and odds of mortality. Study limitations include the exclusion of individuals with missing ethnicity information, the age of diagnosis not reflecting the actual age of onset, and the exclusion of people from Mixed, Chinese, and other ethnic groups due to insufficient power to investigate associations between multimorbidity and health-related outcomes in these groups. 
Conclusions These findings emphasise the need to identify, prevent, and manage multimorbidity early in the life course. Our work provides additional insights into the excess burden of early onset multimorbidity in those from socioeconomically deprived and diverse groups who are disproportionately and more severely affected by multimorbidity and highlights the need to ensure healthcare improvements are equitable.}, language = {en}, number = {10}, urldate = {2023-10-30}, journal = {PLOS Medicine}, author = {Eto, Fabiola and Samuel, Miriam and Henkin, Rafael and Mahesh, Meera and Ahmad, Tahania and Angdembe, Alisha and McAllister-Williams, R. Hamish and Missier, Paolo and Reynolds, Nick J. and Barnes, Michael R. and Hull, Sally and Finer, Sarah and Mathur, Rohini}, month = oct, year = {2023}, note = {Publisher: Public Library of Science}, keywords = {African people, Death rates, Electronic medical records, Ethnic epidemiology, Ethnicities, Long-term care, Primary care, Socioeconomic aspects of health}, pages = {e1004300}, }
@article{10.1145/3617377, author = {Gonz\'{a}lez-Zelaya, Vladimiro and Salas, Juli\'{a}n and Meg\'{\i}as, David and Missier, Paolo}, title = {Fair and Private Data Preprocessing through Microaggregation}, year = {2023}, issue_date = {April 2024}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, volume = {18}, number = {3}, issn = {1556-4681}, url = {https://doi.org/10.1145/3617377}, doi = {10.1145/3617377}, abstract = {Privacy protection for personal data and fairness in automated decisions are fundamental requirements for responsible Machine Learning. Both may be enforced through data preprocessing and share a common target: data should remain useful for a task, while becoming uninformative of the sensitive information. The intrinsic connection between privacy and fairness implies that modifications performed to guarantee one of these goals, may have an effect on the other, e.g., hiding a sensitive attribute from a classification algorithm might prevent a biased decision rule having such attribute as a criterion. This work resides at the intersection of algorithmic fairness and privacy. We show how the two goals are compatible, and may be simultaneously achieved, with a small loss in predictive performance. Our results are competitive with both state-of-the-art fairness correcting algorithms and hybrid privacy-fairness methods. Experiments were performed on three widely used benchmark datasets: Adult Income, COMPAS, and German Credit.}, journal = {ACM Trans. Knowl. Discov. Data}, month = {dec}, articleno = {49}, numpages = {24}, keywords = {ethical AI, privacy preserving data mining, algorithmic fairness, Responsible machine learning, fair classification} }
@article{evison_mapping_2023, title = {Mapping inpatient care pathways for patients with {COPD}: an observational study using routinely collected electronic hospital record data.}, volume = {9}, copyright = {Copyright ©The authors 2023.}, issn = {2312-0541}, doi = {10.1183/23120541.00110-2023}, abstract = {INTRODUCTION: Respiratory specialist ward care is associated with better outcomes for patients with COPD exacerbations. We assessed patient pathways and associated factors for people admitted to hospital with COPD exacerbations. METHODS: We analysed routinely collected electronic health data for patients admitted with COPD exacerbation in 2018 to Queen Elizabeth Hospital, Birmingham, UK. We extracted data on demographics, deprivation index, Elixhauser comorbidities, ward moves, length of stay, and in-hospital and 1-year mortality. We compared care pathways with recommended care pathways (transition from initial assessment area to respiratory wards or discharge). We used Markov state transition models to derive probabilities of following recommended pathways for patient subgroups. RESULTS: Of 42 555 patients with unplanned admissions during 2018, 571 patients were admitted at least once with an exacerbation of COPD. The mean±sd age was 51±11 years; 313 (55\%) were women, 337 (59\%) lived in the most deprived neighbourhoods and 45 (9\%) were from non-white ethnic backgrounds. 428 (75.0\%) had ≥4 comorbidities. Age {\textgreater}70 years was associated with higher in-hospital and 1-year mortality, more places of care (wards) and longer length of stay; having ≥4 comorbidities was associated with higher mortality and longer length of stay. Older age was associated with a significantly lower probability of following a recommended pathway ({\textgreater}70 years: 0.514, 95\% CI 0.458-0.571; ≤70 years: 0.636, 95\% CI 0.572-0.696; p=0.004). CONCLUSIONS: Only older age was associated with a lower chance of following recommended hospital pathways of care. Such analyses could help refine appropriate care pathways for patients with COPD exacerbations.}, language = {eng}, number = {5}, journal = {ERJ open research}, author = {Evison, Felicity and Cooper, Rachel and Gallier, Suzy and Missier, Paolo and Sayer, Avan A. and Sapey, Elizabeth and Witham, Miles D.}, month = sep, year = {2023}, pmid = {37850214}, pmcid = {PMC10577591}, note = {Place: England}, pages = {00110--2023}, }
@inproceedings{gonzalez2023preprocessing, title={Preprocessing Matters: Automated Pipeline Selection for Fair Classification}, author={Gonz{\'a}lez-Zelaya, Vladimiro and Salas, Juli{\'a}n and Prangle, Dennis and Missier, Paolo}, booktitle={Modeling Decisions for Artificial Intelligence: 20th International Conference, MDAI 2023, Ume{\aa}, Sweden, June 19--22, 2023, Proceedings}, pages={202--213}, year={2023}, organization={Springer} }
@article{witham_researching_2023, title = {Researching multimorbidity in hospital: can we deliver on the promise of health informatics?}, issn = {1878-7657}, url = {https://doi.org/10.1007/s41999-023-00753-6}, doi = {10.1007/s41999-023-00753-6}, journal = {European Geriatric Medicine}, author = {Witham, Miles D. and Cooper, Rachel and Missier, Paolo and Robinson, Sian M. and Sapey, Elizabeth and Sayer, Avan A.}, month = may, year = {2023}, }
@inproceedings{shao_training_2023, address = {Atlanta, USA}, title = {On {Training} {Strategies} for {LSTMs} in {Sensor}-{Based} {Human} {Activity} {Recognition}}, booktitle = {Procs {PerCom} 2023}, author = {Shao, Shuai and Guan, Yu and Xin, Guan and Missier, Paolo and Ploetz, Thomas}, year = {2023}, }
@inproceedings{kremer_tracking_2022, title = {Tracking trajectories of multiple long-term conditions using dynamic patient-cluster associations}, doi = {10.1109/BigData55660.2022.10021034}, abstract = {Momentum has been growing into research to better understand the dynamics of multiple long-term conditions – multimorbidity (MLTC-M), defined as the co-occurrence of two or more long-term or chronic conditions within an individual. Several research efforts make use of Electronic Health Records (EHR), which represent patients’ medical histories. These range from discovering patterns of multimorbidity, namely by clustering diseases based on their co-occurrence in EHRs, to using EHRs to predict the next disease or other specific outcomes. One problem with the former approach is that it discards important temporal information on the co-occurrence, while the latter requires "big" data volumes that are not always available from routinely collected EHRs, limiting the robustness of the resulting models. In this paper we take an intermediate approach, where initially we use about 143,000 EHRs from UK Biobank to perform time-independent clustering using topic modelling, and Latent Dirichlet Allocation specifically. We then propose a metric to measure how strongly a patient is "attracted" into any given cluster at any point through their medical history. By tracking how such gravitational pull changes over time, we may then be able to narrow the scope for potential interventions and preventative measures to specific clusters, without having to resort to full-fledged predictive modelling. In this preliminary work we show exemplars of these dynamic associations, which suggest that further exploration may lead to actionable insights into patients’ medical trajectories.}, booktitle = {2022 {IEEE} {International} {Conference} on {Big} {Data} ({Big} {Data})}, author = {Kremer, Ron and Raza, Syed Mohib and Eto, Fabiola and Casement, John and Atallah, Christian and Finer, Sarah and Lendrem, Dennis and Barnes, Michael and Reynolds, Nick J and Missier, Paolo}, month = dec, year = {2022}, keywords = {Big Data, electronic health records, Biological system modeling, Time measurement, Predictive models, Data models, Trajectory, MLTC-M, multi-morbidity, Robustness, topic modelling}, pages = {4390--4399}, }
@ARTICLE{10.3389/fdata.2022.1021621, AUTHOR={Mandreoli, Federica and Ferrari, Davide and Guidetti, Veronica and Motta, Federico and Missier, Paolo}, TITLE={Real-world data mining meets clinical practice: Research challenges and perspective}, JOURNAL={Frontiers in Big Data}, VOLUME={5}, YEAR={2022}, URL={https://www.frontiersin.org/articles/10.3389/fdata.2022.1021621}, DOI={10.3389/fdata.2022.1021621}, ISSN={2624-909X}, ABSTRACT={As Big Data Analysis meets healthcare applications, domain-specific challenges and opportunities materialize in all aspects of data science. Advanced statistical methods and Artificial Intelligence (AI) on Electronic Health Records (EHRs) are used both for knowledge discovery purposes and clinical decision support. Such techniques enable the emerging Predictive, Preventative, Personalized, and Participatory Medicine (P4M) paradigm. Working with the Infectious Disease Clinic of the University Hospital of Modena, Italy, we have developed a range of Data–Driven (DD) approaches to solve critical clinical applications using statistics, Machine Learning (ML) and Big Data Analytics on real-world EHR. Here, we describe our perspective on the challenges we encountered. Some are connected to medical data and their sparse, scarce, and unbalanced nature. Others are bound to the application environment, as medical AI tools can affect people's health and life. For each of these problems, we report some available techniques to tackle them, present examples drawn from our experience, and propose which approaches, in our opinion, could lead to successful real-world, end-to-end implementations.} }
@article{chapman_dpds_2022, title = {{DPDS}: {Assisting} {Data} {Science} with {Data} {Provenance}}, volume = {15}, url = {https://vldb.org/pvldb/vol15/p3614-torlone.pdf}, doi = {10.14778/3554821.3554857}, abstract = {Successful data-driven science requires a complex combination of data engineering pipelines and data modelling techniques. Robust and defensible results can only be achieved when each step in the pipeline that is designed to clean, transform and alter data in preparation for data modelling can be justified, and its effect on the data explained. The DPDS toolkit presented in this paper is designed to make such justification and explanation process an integral part of data science practice, adding value while remaining as un-intrusive as possible to the analyst. Catering to the broad community of python/pandas data engineers, DPDS implements an observer pattern that is able to capture the fine-grained provenance associated with each individual element of a dataframe, across multiple transformation steps. The resulting provenance graph is stored in Neo4j and queried through a UI, with the goal of helping engineers and analysts to justify and explain their choice of data operations, from raw data to model training, by highlighting the details of the changes through each transformation.}, language = {en}, number = {12}, journal = {PVLDB}, author = {Chapman, Adriane and Missier, Paolo and Lauro, Luca and Torlone, Riccardo}, year = {2022}, pages = {3614 -- 3617}, }
@inproceedings{DBLP:conf/sebd/FerrariMMM22, author = {Davide Ferrari and Federica Mandreoli and Federico Motta and Paolo Missier}, editor = {Giuseppe Amato and Valentina Bartalesi and Devis Bianchini and Claudio Gennaro and Riccardo Torlone}, title = {Data-Driven, AI-Based Clinical Practice: Experiences, Challenges, and Research Directions}, booktitle = {Proceedings of the 30th Italian Symposium on Advanced Database Systems, {SEBD} 2022, Tirrenia (PI), Italy, June 19-22, 2022}, series = {{CEUR} Workshop Proceedings}, volume = {3194}, pages = {392--403}, publisher = {CEUR-WS.org}, year = {2022}, url = {http://ceur-ws.org/Vol-3194/paper47.pdf}, timestamp = {Wed, 24 Aug 2022 09:26:05 +0200}, biburl = {https://dblp.org/rec/conf/sebd/FerrariMMM22.bib}, bibsource = {dblp computer science bibliography, https://dblp.org} }
@article{DBLP:journals/jamia/DarkeCCTMB22, author = {Philip Darke and Sophie Cassidy and Michael Catt and Roy Taylor and Paolo Missier and Jaume Bacardit}, title = {Curating a longitudinal research resource using linked primary care {EHR} data - a {UK} Biobank case study}, journal = {J. Am. Medical Informatics Assoc.}, volume = {29}, number = {3}, pages = {546--552}, year = {2022}, url = {https://doi.org/10.1093/jamia/ocab260}, doi = {10.1093/jamia/ocab260}, timestamp = {Wed, 23 Feb 2022 11:16:49 +0100}, biburl = {https://dblp.org/rec/journals/jamia/DarkeCCTMB22.bib}, bibsource = {dblp computer science bibliography, https://dblp.org} }
@article{guaraldi_interplay_2022, title = {The interplay of post-acute {COVID}-19 syndrome and aging: a biological, clinical and public health approach}, issn = {1568-1637}, url = {https://www.sciencedirect.com/science/article/pii/S1568163722001283}, doi = {https://doi.org/10.1016/j.arr.2022.101686}, abstract = {ABSTRACT The post-acute COVID-19 syndrome (PACS) is characterized by the persistence of fluctuating symptoms over three months from the onset of the possible or confirmed COVID-19 acute phase. Current data suggests that at least 10\% of people with previously documented infection may develop PACS, and up to 50–80\% of prevalence is reported among survivors after hospital discharge. This viewpoint will discuss various aspects of PACS, particularly in older adults, with a specific hypothesis to describe PACS as the expression of a modified aging trajectory induced by SARS CoV-2. This hypothesis will be argued from biological, clinical and public health view, addressing three main questions: (i) does SARS-CoV-2-induced alterations in aging trajectories play a role in PACS?; (ii) do people with PACS face immuno-metabolic derangements that lead to increased susceptibility to age-related diseases?; (iii) is it possible to restore the healthy aging trajectory followed by the individual before pre-COVID?. A particular focus will be given to the well-being of people with PACS that could be assessed by the intrinsic capacity model and support the definition of the healthy aging trajectory.}, journal = {Ageing Research Reviews}, author = {Guaraldi, Giovanni and Milic, Jovana and Cesari, Matteo and Leibovici, Leonard and Mandreoli, Federica and Missier, Paolo and Rozzini, Renzo and Cattelan, Anna Maria and Motta, Federico and Mussini, Cristina and Cossarizza, Andrea}, year = {2022}, pages = {101686}, }
@article{DBLP:journals/jdiq/GeislerVCLGJLMO22, author = {Sandra Geisler and Maria{-}Esther Vidal and Cinzia Cappiello and Bernadette Farias L{\'{o}}scio and Avigdor Gal and Matthias Jarke and Maurizio Lenzerini and Paolo Missier and Boris Otto and Elda Paja and Barbara Pernici and Jakob Rehof}, title = {Knowledge-Driven Data Ecosystems Toward Data Transparency}, journal = {{ACM} J. Data Inf. Qual.}, volume = {14}, number = {1}, pages = {3:1--3:12}, year = {2022}, url = {https://doi.org/10.1145/3467022}, doi = {10.1145/3467022}, timestamp = {Sat, 09 Apr 2022 12:27:16 +0200}, biburl = {https://dblp.org/rec/journals/jdiq/GeislerVCLGJLMO22.bib}, bibsource = {dblp computer science bibliography, https://dblp.org} }
@INPROCEEDINGS{9680211, author={Mandreoli, Federica and Motta, Federico and Missier, Paolo}, booktitle={2021 20th IEEE International Conference on Machine Learning and Applications (ICMLA)}, title={An HMM–ensemble approach to predict severity progression of ICU treatment for hospitalized COVID–19 patients}, year={2021}, volume={}, number={}, pages={1299-1306}, doi={10.1109/ICMLA52953.2021.00211}}
@incollection{chapman_right_2021, title = {The {Right} ({Provenance}) {Hammer} for the {Job}: {A} {Comparison} of {Data} {Provenance} {Instrumentation}}, url = {https://doi.org/10.1007/978-3-030-67681-0_3}, booktitle = {Provenance in {Data} {Science}: {From} {Data} {Models} to {Context}-{Aware} {Knowledge} {Graphs}}, author = {Chapman, Adriane and Sasikant, Abhirami and Simonelli, Giulia and Missier, Paolo and Torlone, Riccardo}, year = {2021}, }
@inproceedings{gonzalez-zelaya_optimising_2021, title = {Optimising {Fairness} {Through} {Parametrised} {Data} {Sampling}}, url = {https://doi.org/10.5441/002/edbt.2021.49}, doi = {10.5441/002/edbt.2021.49}, booktitle = {Proceedings of the 24th {International} {Conference} on {Extending} {Database} {Technology}, {EDBT} 2021, {Nicosia}, {Cyprus}, {March} 23 - 26, 2021}, publisher = {OpenProceedings.org}, author = {González-Zelaya, Vladimiro and Salas, Julián and Prangle, Dennis and Missier, Paolo}, editor = {Velegrakis, Yannis and Zeinalipour-Yazti, Demetris and Chrysanthis, Panos K. and Guerra, Francesco}, year = {2021}, pages = {445--450}, }
@article{lam_using_2021, title = {Using wearable activity trackers to predict {Type}-2 {Diabetes}: {A} machine learning-based cross-sectional study of the {UK} {Biobank} accelerometer cohort}, url = {https://preprints.jmir.org/preprint/23364}, doi = {10.2196/23364}, journal = {JMIR Diabetes}, author = {Lam, B and Catt, M and Cassidy, S and Bacardit, J and Darke, P and Butterfield, S and Alshabrawy, O and Trenell, M and Missier, P}, month = jan, year = {2021}, }
@article{chapman_capturing_2021, title = {Capturing and {Querying} {Fine}-grained {Provenance} of {Preprocessing} {Pipelines} in {Data} {Science}}, volume = {14}, url = {http://www.vldb.org/pvldb/vol14/p507-chapman.pdf}, doi = {10.14778/3436905.3436911}, number = {4}, journal = {PVLDB}, author = {Chapman, Adriane and Missier, Paolo and Simonelli, Giulia and Torlone, Riccardo}, month = jan, year = {2021}, pages = {507--520}, }
@article{primo_customisable_2021, title = {A customisable pipeline for the semi-automated discovery of online activists and social campaigns on {Twitter}}, volume = {in press}, abstract = {Substantial research is available on detecting \textit{influencers} on social media platforms. In contrast, comparatively few studies exists on the role of \textit{online activists}, defined informally as users who actively participate in socially-minded online campaigns. Automatically discovering activists who can potentially be approached by organisations that promote social campaigns is important, but not easy, as they are typically active only locally, and, unlike influencers, they are not central to large social media networks. We make the hypothesis that such interesting users can be found on Twitter within temporally and spatially localised \textit{contexts}. We define these as small but topical fragments of the network, containing interactions about social events or campaigns with a significant online footprint. To explore this hypothesis, we have designed an iterative discovery pipeline consisting of two alternating phases of user discovery and context discovery. Multiple iterations of the pipeline result in a growing dataset of user profiles for activists, as well as growing set of online social contexts. This mode of exploration differs significantly from prior techniques that focus on influencers, and presents unique challenges because of the weak online signal available to detect activists. The paper describes the design and implementation of the pipeline as a customisable software framework, where user-defined operational definitions of online activism can be explored. We present an empirical evaluation on two extensive case studies, one concerning healthcare-related campaigns in the UK during 2018, the other related to online activism in Italy during the COVID-19 pandemic.}, journal = {WWW Journal}, author = {Primo, Flavio and Romanovsky, Alexander and de Mello, Rafael and Garcia, Alessandro and Missier, Paolo}, year = {2021}, }
@inproceedings{bajoudah_latency_2021, title = {Latency of {Trading} {Transactions} in {Brokered} {IoT} {Data} {Marketplace} in {Ethereum}}, doi = {10.1109/SWC50871.2021.00043}, booktitle = {2021 {IEEE} {SmartWorld}, {Ubiquitous} {Intelligence} {Computing}, {Advanced} {Trusted} {Computing}, {Scalable} {Computing} {Communications}, {Internet} of {People} and {Smart} {City} {Innovation} ({SmartWorld}/{SCALCOM}/{UIC}/{ATC}/{IOP}/{SCI})}, author = {Bajoudah, Shaimaa and Missier, Paolo}, year = {2021}, pages = {254--263}, }
@article{ferrari_machine_2020, title = {Machine learning in predicting respiratory failure in patients with {COVID}-19 pneumonia---{Challenges}, strengths, and opportunities in a global health emergency}, volume = {15}, url = {https://doi.org/10.1371/journal.pone.0239172}, doi = {10.1371/journal.pone.0239172}, abstract = {Aims The aim of this study was to estimate a 48 hour prediction of moderate to severe respiratory failure, requiring mechanical ventilation, in hospitalized patients with COVID-19 pneumonia. Methods This was an observational prospective study that comprised consecutive patients with COVID-19 pneumonia admitted to hospital from 21 February to 6 April 2020. The patients' medical history, demographic, epidemiologic and clinical data were collected in an electronic patient chart. The dataset was used to train predictive models using an established machine learning framework leveraging a hybrid approach where clinical expertise is applied alongside a data-driven analysis. The study outcome was the onset of moderate to severe respiratory failure defined as PaO2/FiO2 ratio {\textless}150 mmHg in at least one of two consecutive arterial blood gas analyses in the following 48 hours. Shapley Additive exPlanations values were used to quantify the positive or negative impact of each variable included in each model on the predicted outcome. Results A total of 198 patients contributed to generate 1068 usable observations which allowed to build 3 predictive models based respectively on 31-variables signs and symptoms, 39-variables laboratory biomarkers and 91-variables as a composition of the two. A fourth "boosted mixed model" included 20 variables was selected from the model 3, achieved the best predictive performance (AUC = 0.84) without worsening the FN rate. Its clinical performance was applied in a narrative case report as an example. Conclusion This study developed a machine model with 84\% prediction accuracy, which is able to assist clinicians in decision making process and contribute to develop new analytics to improve care at high technology readiness levels.}, number = {11}, journal = {PLOS ONE}, author = {Ferrari, Davide and Milic, Jovana and Tonelli, Roberto and Ghinelli, Francesco and Meschiari, Marianna and Volpi, Sara and Faltoni, Matteo and Franceschi, Giacomo and Iadisernia, Vittorio and Yaacoub, Dina and Ciusa, Giacomo and Bacca, Erica and Rogati, Carlotta and Tutone, Marco and Burastero, Giulia and Raimondi, Alessandro and Menozzi, Marianna and Franceschini, Erica and Cuomo, Gianluca and Corradi, Luca and Orlando, Gabriella and Santoro, Antonella and Digaetano, Margherita and Puzzolante, Cinzia and Carli, Federica and Borghi, Vanni and Bedini, Andrea and Fantini, Riccardo and Tabbì, Luca and Castaniere, Ivana and Busani, Stefano and Clini, Enrico and Girardis, Massimo and Sarti, Mario and Cossarizza, Andrea and Mussini, Cristina and Mandreoli, Federica and Missier, Paolo and Guaraldi, Giovanni}, year = {2020}, pages = {1--14}, }
@inproceedings{ferrari_predicting_2020, address = {Online!}, title = {Predicting respiratory failure in patients with {COVID}-19 pneumonia: a case study from {Northern} {Italy}}, abstract = {The Covid-19 crisis caught health care services around the world by surprise, putting unprecedented pressure on Intensive Care Units (ICU). To help clinical staff to manage the limited ICU capacity, we have developed a Machine Learning model to estimate the probability that a patient admitted to hospital with COVID-19 symptoms would develop severe respiratory failure and require Intensive Care within 48 hours of admission. The model was trained on an initial cohort of 198 patients admitted to the Infectious Disease ward of Modena University Hospital, in Italy, at the peak of the epidemic, and subsequently refined as more patients were admitted. Using the LightGBM Decision Tree ensemble approach, we were able to achieve good accuracy (AUC = 0.84) despite a high rate of missing values. Furthermore, we have been able to provide clinicians with explanations in the form of personalised ranked lists of features for each prediction, using only 20 out of more than 90 variables, using Shapley values to describe the importance of each feature.}, booktitle = {The {HELPLINE} workshop, co-located with the 24th {European} {Conference} on {AI} ({ECAI2020})}, publisher = {CEUR-WS}, author = {Ferrari, Davide and Mandreoli, Federica and Guaraldi, Giovanni and Missier, Paolo}, year = {2020}, keywords = {\#covid, \#machine learning}, }
@inproceedings{ferrari_data-driven_2020, address = {Copenhagen, Denmark}, title = {Data-driven vs knowledge-driven inference of health outcomes in the ageing population: a case study}, abstract = {Preventive, Predictive, Personalised and Participative (P4) medicine has the potential to not only vastly improve people's quality of life, but also to significantly reduce healthcare costs and improve its efficiency. Our research focuses on age-related diseases and explores the opportunities offered by a data-driven approach to predict wellness states of ageing individuals, in contrast to the commonly adopted knowledge-driven approach that relies on easy-to-interpret metrics manually introduced by clinical experts. This is done by means of machine learning models applied on the My Smart Age with HIV (MySAwH) dataset, which is collected through a relatively new approach especially for older HIV patient cohorts. This includes Patient Related Outcomes values from mobile smartphone apps and activity traces from commercial-grade activity loggers. Our results show better predictive performance for the data-driven approach. We also show that a \textit{post hoc} interpretation method applied to the predictive models can provide intelligible explanations that enable new forms of personalised and preventive medicine.}, booktitle = {{DARLI} workshop - {Proceedings} of the {Workshops} of the {EDBT}/{ICDT} 2020 {Joint} {Conference}}, publisher = {CEUR-WS}, author = {Ferrari, D and Guaraldi, G and Mandreoli, F and Martoglia, R and Milic, J and Missier, P.}, year = {2020}, keywords = {\#machine learning, \#ageing, \#explainable models}, }
@article{missier_abstracting_2020, title = {Abstracting {PROV} provenance graphs: {A} validity-preserving approach}, volume = {111}, issn = {0167-739X}, doi = {https://doi.org/10.1016/j.future.2020.05.015}, abstract = {Data provenance is a structured form of metadata designed to record the activities and datasets involved in data production, as well as their dependency relationships. The PROV data model, released by the W3C in 2013, defines a schema and constraints that together provide a structural and semantic foundation for provenance. This enables the interoperable exchange of provenance between data producers and consumers. When the provenance content is sensitive and subject to disclosure restrictions, however, a way of hiding parts of the provenance in a principled way before communicating it to certain parties is required. In this paper we present a provenance abstraction operator that achieves this goal. It maps a graphical representation of a PROV document PG1 to a new abstract version PG2, ensuring that (i) PG2 is a valid PROV graph, and (ii) the dependencies that appear in PG2 are justified by those that appear in PG1. These two properties ensure that further abstraction of abstract PROV graphs is possible. A guiding principle of the work is that of minimum damage: the resultant graph is altered as little as possible, while ensuring that the two properties are maintained. The operator developed is implemented as part of a user tool, described in a separate paper, that lets owners of sensitive provenance information control the abstraction by specifying an abstraction policy.}, journal = {Future Generation Computer Systems}, author = {Missier, P. and Bryans, J. and Gamble, C. and Curcin, V.}, year = {2020}, keywords = {Provenance, Provenance abstraction, Provenance metadata}, pages = {352 -- 367}, }
@article{thompson_increasing_2019, title = {Increasing phenotypic annotation improves the diagnostic rate of exome sequencing in a rare neuromuscular disorder}, url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/humu.23792}, doi = {10.1002/humu.23792}, abstract = {Abstract Phenotype-based filtering and prioritization contribute to the interpretation of genetic variants detected in exome sequencing. However, it is currently unclear how extensive this phenotypic annotation should be. In this study, we compare methods for incorporating phenotype into the interpretation process and assess the extent to which phenotypic annotation aids prioritization of the correct variant. Using a cohort of 29 patients with congenital myasthenic syndromes with causative variants in known or newly discovered disease genes, exome data and the Human Phenotype Ontology (HPO)-coded phenotypic profiles, we show that gene-list filters created from phenotypic annotations perform similarly to curated disease-gene virtual panels. We use Exomiser, a prioritization tool incorporating phenotypic comparisons, to rank candidate variants while varying phenotypic annotation. Analyzing 3,712 combinations, we show that increasing phenotypic annotation improved prioritization of the causative variant, from 62\% ranked first on variant alone to 90\% with seven HPO annotations. We conclude that any HPO-based phenotypic annotation aids variant discovery and that annotation with over five terms is recommended in our context. Although focused on a constrained cohort, this provides real-world validation of the utility of phenotypic annotation for variant prioritization. Further research is needed to extend this concept to other diseases and more diverse cohorts.}, journal = {Human Mutation}, author = {Thompson, Rachel and Papakonstantinou Ntalis, Anastasios and Beltran, Sergi and Töpf, Ana and de Paula Estephan, Eduardo and Polavarapu, Kiran and ’t Hoen, Peter A. C. and Missier, Paolo and Lochmüller, Hanns}, year = {2019}, keywords = {congenital myasthenic syndromes, deep phenotyping, diagnosis, exome sequencing, Exomiser, human phenotype ontology, variant prioritization}, }
@inproceedings{gonzlez_zelaya_parametrised_2019, title = {Parametrised {Data} {Sampling} for {Fairness} {Optimisation}}, booktitle = {Proceedings of {Explainable} {AI} for {Fairness}, {Accountability} \& {Transparency} {Workshop} ({KDD} {XAI})}, publisher = {ACM}, author = {González Zelaya, Carlos Vladimiro and Missier, Paolo and Prangle, Dennis}, year = {2019}, }
@inproceedings{bajoudah_toward_2019, address = {Atlanta, USA}, title = {Toward a {Decentralized}, {Trust}-less {Marketplace} for {Brokered} {IoT} {Data} {Trading} using {Blockchain}}, booktitle = {Procs. 2nd {IEEE} {International} {Conference} on {Blockchain} ({Blockchain} 2019)}, publisher = {IEEE}, author = {Bajoudah, Shaimaa and Dong, Changyu and Missier, Paolo}, year = {2019}, }
@inproceedings{missier_efficient_2019, address = {Milano, Italy}, title = {Efficient {Re}-computation of {Big} {Data} {Analytics} {Processes} in the {Presence} of {Changes}: {Computational} {Framework}, {Reference} {Architecture}, and {Applications}}, booktitle = {Procs. {IEEE} {Big} {Data} {Congress}}, publisher = {IEEE}, author = {Missier, Paolo and Cala, Jacek}, year = {2019}, keywords = {\#provenance, \#re-computation, \#workflow}, }
@inproceedings{primo_customisable_2019, address = {Daejeon, Korea}, title = {A customisable pipeline for continuously harvesting socially-minded {Twitter} users}, url = {https://arxiv.org/abs/1903.07061}, abstract = {On social media platforms and Twitter in particular, specific classes of users such as influencers have been given satisfactory operational definitions in terms of network and content metrics. Others, for instance online activists, are not less important but their characterisation still requires experimenting. We make the hypothesis that such interesting users can be found within temporally and spatially localised contexts, i.e., small but topical fragments of the network containing interactions about social events or campaigns with a significant footprint on Twitter. To explore this hypothesis, we have designed a continuous user profile discovery pipeline that produces an ever-growing dataset of user profiles by harvesting and analysing contexts from the Twitter stream. The profiles dataset includes key network and content-based users metrics, enabling experimentation with user-defined score functions that characterise specific classes of online users. The paper describes the design and implementation of the pipeline and its empirical evaluation on a case study consisting of healthcare-related campaigns in the UK, showing how it supports the operational definitions of online activism, by comparing three experimental ranking functions. The code is publicly available.}, booktitle = {Procs. {ICWE}'19}, author = {Primo, Flavio and Missier, Paolo and Romanovsky, Alexander and Figueredo, Mickael and Cacho, Nelio}, year = {2019}, keywords = {online activism, twitter analytics}, }
@article{thavasimani_why-diff_2019, title = {Why-{Diff}: {Exploiting} {Provenance} to {Understand} {Outcome} {Differences} from non-identical {Reproduced} {Workflows}}, issn = {2169-3536}, url = {https://ieeexplore.ieee.org/document/8662612/}, doi = {10.1109/ACCESS.2019.2903727}, abstract = {Data analytics processes such as scientific workflows tend to be executed repeatedly, with varying dependencies and input datasets. The case has been made in the past for tracking the provenance of the final information products through the workflow steps, to enable their reproducibility. In this work, we explore the hypothesis that provenance traces recorded during execution are also instrumental to answering questions about the observed differences between sets of results obtained from similar but not identical workflow configurations. Such differences in configurations may be introduced deliberately, i.e., to explore process variations, or accidentally, typically as the result of porting efforts or of changes in the computing environment. Using a commonly used workflow programming model as a reference, we consider both structural variations in the workflows as well as variations within their individual components. Our Why-Diff algorithm compares the graph representations of two provenance traces derived from two workflow variations. It produces a delta graph that can be used to produce human-readable explanations of the impact of workflow differences on observed output differences. We report on our implementation based on the Neo4j graph database, and on explanations of differences between workflow results using a suite of synthetic workflows as well as real-world workflows.}, journal = {IEEE Access}, author = {Thavasimani, Priyaa and Cala, Jacek and Missier, Paolo}, year = {2019}, keywords = {eScience Central, Provenance, Big Data, Why-Diff, Workflow, Reproducibility, Software, Alzheimer's disease, Databases, Genetics, Libraries, Sentiment analysis}, pages = {1--1}, }
@article{thompson_targeted_2019, title = {Targeted therapies for congenital myasthenic syndromes: systematic review and steps towards a treatabolome}, url = {http://www.emergtoplifesci.org/content/early/2019/01/25/ETLS20180100.abstract}, doi = {10.1042/ETLS20180100}, abstract = {Despite recent scientific advances, most rare genetic diseases – including most neuromuscular diseases – do not currently have curative gene-based therapies available. However, in some cases, such as vitamin, cofactor or enzyme deficiencies, channelopathies and disorders of the neuromuscular junction, a confirmed genetic diagnosis provides guidance on treatment, with drugs available that may significantly alter the disease course, improve functional ability and extend life expectancy. Nevertheless, many treatable patients remain undiagnosed or do not receive treatment even after genetic diagnosis. The growth of computer-aided genetic analysis systems that enable clinicians to diagnose their undiagnosed patients has not yet been matched by genetics-based decision-support systems for treatment guidance. Generating a 'treatabolome' of treatable variants and the evidence for the treatment has the potential to increase treatment rates for treatable conditions. Here, we use the congenital myasthenic syndromes (CMS), a group of clinically and genetically heterogeneous but frequently treatable neuromuscular conditions, to illustrate the steps in the creation of a treatabolome for rare inherited diseases. We perform a systematic review of the evidence for pharmacological treatment of each CMS type, gathering evidence from 207 studies of over 1000 patients and stratifying by genetic defect, as treatment varies depending on the underlying cause. We assess the strength and quality of the evidence and create a dataset that provides the foundation for a computer-aided system to enable clinicians to gain easier access to information about treatable variants and the evidence they need to consider. 3,4-DAP, 3,4-diaminopyridine; AChE, acetylcholinesterase; AChR, acetylcholine receptor; CEBM, Centre for evidence-based medicine; CMS, congenital myasthenic syndrome; NGS, next-generation sequencing; NMJ, neuromuscular junction}, journal = {Emerging Topics in Life Sciences}, author = {Thompson, Rachel and Bonne, Gisèle and Missier, Paolo and Lochmüller, Hanns}, month = jan, year = {2019}, pages = {ETLS20180100}, }
@article{cala_selective_2018, title = {Selective and {Recurring} {Re}-computation of {Big} {Data} {Analytics} {Tasks}: {Insights} from a {Genomics} {Case} {Study}}, volume = {13}, issn = {2214-5796}, url = {http://www.sciencedirect.com/science/article/pii/S2214579617303520}, doi = {https://doi.org/10.1016/j.bdr.2018.06.001}, abstract = {The value of knowledge assets generated by analytics processes using Data Science techniques tends to decay over time, as a consequence of changes in the elements the process depends on: external data sources, libraries, and system dependencies. For large-scale problems, refreshing those outcomes through greedy re-computation is both expensive and inefficient, as some changes have limited impact. In this paper we address the problem of refreshing past process outcomes selectively, that is, by trying to identify the subset of outcomes that will have been affected by a change, and by only re-executing fragments of the original process. We propose a technical approach to address the selective re-computation problem by combining multiple techniques, and present an extensive experimental study in Genomics, namely variant calling and their clinical interpretation, to show its effectiveness. In this case study, we are able to decrease the number of required re-computations on a cohort of individuals from 495 (blind) down to 71, and that we can reduce runtime by at least 60\% relative to the naïve blind approach, and in some cases by 90\%. Starting from this experience, we then propose a blueprint for a generic re-computation meta-process that makes use of process history metadata to make informed decisions about selective re-computations in reaction to a variety of changes in the data.}, journal = {Big Data Research}, author = {Cala, Jacek and Missier, Paolo}, year = {2018}, keywords = {Big data analysis, Genomics, Knowledge decay, Re-computation}, pages = {76 -- 94}, }
@inproceedings{pimentel_versioned-prov_2018, address = {London}, title = {Versioned-{PROV}: {A} {PROV} extension to support mutable data entities}, booktitle = {Procs. {IPAW} 2018}, publisher = {Springer}, author = {Pimentel, Joao Felipe and Missier, Paolo and Murta, Leonardo and Braganholo, Vanessa}, year = {2018}, keywords = {\#provenance, \#recomputation, process re-computation, provenance annotations}, }
@article{geerts_editorial_2018, title = {Editorial: {Special} {Issue} on {Improving} the {Veracity} and {Value} of {Big} {Data}}, volume = {9}, url = {http://doi.acm.org/10.1145/3174791}, doi = {10.1145/3174791}, number = {3}, journal = {J. Data and Information Quality}, author = {Geerts, Floris and Missier, Paolo and Paton, Norman W.}, year = {2018}, pages = {13:1--13:2}, }
@inproceedings{tucci_design_2018, address = {Bari, Italy}, title = {Design and evaluation of a genomics variant analysis pipeline using {GATK} {Spark} tools}, abstract = {Scalable and efficient processing of genome sequence data, i.e. for variant discovery, is key to the mainstream adoption of High Throughput technology for disease prevention and for clinical use. Achieving scalability, however, requires a significant effort to enable the parallel execution of the analysis tools that make up the pipelines. This is facilitated by the new Spark versions of the well-known GATK toolkit, which offer a black-box approach by transparently exploiting the underlying Map Reduce architecture. In this paper we report on our experience implementing a standard variant discovery pipeline using GATK 4.0 with Docker-based deployment over a cluster. We provide a preliminary performance analysis, comparing the processing times and cost to those of the new Microsoft Genomics Services.}, booktitle = {Procs. {SEBD} '18 – {26TH} {Italian} {Symposium} on {Advanced} {Database} {Systems}}, author = {Tucci, Nicholas and Cala, Jacek and Steyn, Jannetta and Missier, Paolo}, year = {2018}, keywords = {\#genomics, \#spark}, }
@inproceedings{cala_provenance_2018, address = {London}, title = {Provenance {Annotation} and {Analysis} to {Support} {Process} {Re}-{Computation}}, abstract = {Many resource-intensive analytics processes evolve over time following new versions of the reference datasets and software dependencies they use. We focus on scenarios in which any version change has the potential to affect many outcomes, as is the case for instance in high throughput genomics where the same process is used to analyse large cohorts of patient genomes, or cases. As any version change is unlikely to affect the entire population, an efficient strategy for restoring the currency of the outcomes requires first to identify the scope of a change, i.e., the subset of affected data products. In this paper we describe a generic and reusable provenance-based approach to address this scope discovery problem. It applies to a scenario where the process consists of complex hierarchical components, where different input cases are processed using different version configurations of each component, and where separate provenance traces are collected for the executions of each of the components. We show how a new data structure, called a restart tree, is computed and exploited to manage the change scope discovery problem.}, booktitle = {Procs. {IPAW} 2018}, publisher = {Springer}, author = {Cala, Jacek and Missier, Paolo}, year = {2018}, keywords = {\#provenance, \#recomputation, process re-computation, provenance annotations}, }
@inproceedings{barros_analyzing_2018, title = {Analyzing {Social} {Network} {Images} with {Deep} {Learning} {Models} to {Fight} {Zika} {Virus}}, booktitle = {Procs. 15th {International} {Conference} on {Image} {Analysis} and {Recognition} ({ICIAR}'18)}, author = {Barros, H. Pedro and Lima, Bruno G. C. and Crispim, Felipe C. and Vieira, Tiago and Missier, Paolo and Fonseca, Baldoino}, year = {2018}, keywords = {\#zika}, }
@article{sousa_vazadengue_2018, title = {{VazaDengue}: {An} information system for preventing and combating mosquito-borne diseases with social networks}, volume = {75}, issn = {0306-4379}, url = {http://www.sciencedirect.com/science/article/pii/S030643791730618X}, doi = {10.1016/j.is.2018.02.003}, journal = {Information Systems}, author = {Sousa, Leonardo and Mello, Rafael de and Cedrim, Diego and Garcia, Alessandro and Missier, Paolo and Uchôa, Anderson and Oliveira, Anderson and Romanovsky, Alexander}, year = {2018}, keywords = {Dengue, Mosquito, Social media, Surveillance, Tweets}, pages = {26 -- 42}, }
@inproceedings{firth_loom_2018, address = {Vienna, Austria}, title = {Loom: {Query}-aware {Partitioning} of {Online} {Graphs}}, url = {http://edbticdt2018.at/}, abstract = {As with general graph processing systems, partitioning data over a cluster of machines improves the scalability of graph database management systems. However, these systems will incur additional network cost during the execution of a query workload, due to inter-partition traversals. Workload-agnostic partitioning algorithms typically minimise the likelihood of any edge crossing partition boundaries. However, these partitioners are sub-optimal with respect to many workloads, especially queries, which may require more frequent traversal of specific subsets of inter-partition edges. Furthermore, they are largely unsuited to operating incrementally on dynamic, growing graphs. We present a new graph partitioning algorithm, Loom, that operates on a stream of graph updates and continuously allocates the new vertices and edges to partitions, taking into account a query workload of graph pattern expressions along with their relative frequencies. First we capture the most common patterns of edge traversals which occur when executing queries. We then compare sub-graphs, which present themselves incrementally in the graph update stream, against these common patterns. Finally we attempt to allocate each match to single partitions, reducing the number of inter-partition edges within frequently traversed sub-graphs and improving average query performance. Loom is extensively evaluated over several large test graphs with realistic query workloads and various orderings of the graph updates. We demonstrate that, given a workload, our prototype produces partitionings of significantly better quality than existing streaming graph partitioning algorithms Fennel \& LDG.}, booktitle = {Procs. 21st {International} {Conference} on {Extending} {Database} {Technology} ({EDBT})}, publisher = {EDBT}, author = {Firth, H and Missier, P}, year = {2018}, keywords = {distributed graphs, graph partitioning}, }
@inproceedings{thavasimani_why-diff_2017, title = {Why-{Diff}: {Explaining} differences amongst similar workflow runs by exploiting scientific metadata}, url = {https://doi.org/10.1109/BigData.2017.8258275}, doi = {10.1109/BigData.2017.8258275}, booktitle = {2017 {IEEE} {International} {Conference} on {Big} {Data}, {BigData} 2017, {Boston}, {MA}, {USA}, {December} 11-14, 2017}, author = {Thavasimani, Priyaa and Cala, Jacek and Missier, Paolo}, year = {2017}, pages = {3031--3041}, }
@inproceedings{gu_adaptive_2017, address = {Orleans, France}, title = {Adaptive {Incremental} {Learning} for {Statistical} {Relational} {Models} {Using} {Gradient}-{Based} {Boosting}}, url = {https://ilp2017.sciencesconf.org/data/pages/ILP_2017_paper_27.pdf}, abstract = {We consider the problem of incrementally learning models from relational data. Most existing learning methods for statistical relational models use batch learning, which becomes computationally expensive and eventually infeasible for large datasets. The majority of the previous work in relational incremental learning assumes the model's structure is given and only the model's parameters needed to be learned. In this paper, we propose algorithms that can incrementally learn the model's parameters and structure simultaneously. These algorithms are based on the successful formalisation of the relational functional gradient boosting system (RFGB), and extend the classical propositional ensemble methods to relational learning for handling evolving data streams.}, booktitle = {Procs. {ILP} '17, 27th {International} {Conference} on {Inductive} {Logic} {Programming} (late-breaking paper)}, publisher = {CEUR-WS}, author = {Gu, Yulong and Missier, Paolo}, year = {2017}, }
@inproceedings{missier_mind_2017, address = {Linz, Austria}, title = {Mind {My} {Value}: a {Decentralized} {Infrastructure} for {Fair} and {Trusted} {IoT} {Data} {Trading}}, url = {http://iot-conference.org/iot2017/}, abstract = {Internet of Things (IoT) data are increasingly viewed as a new form of massively distributed and large scale digital assets, which are continuously generated by millions of connected devices. The real value of such assets can only be realized by allowing IoT data trading to occur on a marketplace that rewards every single producer and consumer, at a very granular level. Crucially, we believe that such a marketplace should not be owned by anybody, and should instead fairly and transparently self-enforce a well defined set of governance rules. In this paper we address some of the technical challenges involved in realizing such a marketplace. We leverage emerging blockchain technologies to build a decentralized, trusted, transparent and open architecture for IoT traffic metering and contract compliance, on top of the largely adopted IoT brokered data infrastructure. We discuss an Ethereum-based prototype implementation and experimentally evaluate the overhead cost associated with Smart Contract transactions, concluding that a viable business model can indeed be associated with our technical approach.}, booktitle = {Procs. 7th {International} {Conference} on the {Internet} of {Things}}, author = {Missier, Paolo and Bajoudah, Shaimaa and Capossele, Angelo and Gaglione, Andrea and Nati, Michele}, year = {2017}, keywords = {\#IoT, \#marketplace}, }
@inproceedings{missier_preserving_2017, title = {Preserving the value of large scale data analytics over time through selective re-computation}, booktitle = {Procs. 31st {British} {International} {Conference} on {Databases} - {BICOD}}, author = {Missier, Paolo and Cala, Jacek and Rathi, Manisha}, year = {2017}, }
@inproceedings{missier_recruiting_2017, address = {Roma, Italy}, title = {Recruiting from the {Network}: {Discovering} {Twitter} {Users} {Who} {Can} {Help} {Combat} {Zika} {Epidemics}}, isbn = {978-3-319-60131-1}, url = {http://dx.doi.org/10.1007/978-3-319-60131-1_30}, doi = {10.1007/978-3-319-60131-1_30}, booktitle = {Web {Engineering}: 17th {International} {Conference}, {ICWE} 2017, {Rome}, {Italy}, {June} 5-8, 2017, {Proceedings}}, publisher = {Springer International Publishing}, author = {Missier, Paolo and McClean, Callum and Carlton, Jonathan and Cedrim, Diego and Silva, Leonardo and Garcia, Alessandro and Plastino, Alexandre and Romanovsky, Alexander}, editor = {Cabot, Jordi and De Virgilio, Roberto and Torlone, Riccardo}, year = {2017}, pages = {437--445}, }
@incollection{missier_provenance_2017, address = {New York, NY}, title = {Provenance {Standards}}, isbn = {978-1-4899-7993-3}, url = {https://doi.org/10.1007/978-1-4899-7993-3_80749-1}, booktitle = {Encyclopedia of {Database} {Systems}}, publisher = {Springer New York}, author = {Missier, Paolo}, editor = {Liu, Ling and Özsu, M Tamer}, year = {2017}, doi = {10.1007/978-1-4899-7993-3_80749-1}, pages = {1--8}, }
@article{firth_taper_2017, title = {{TAPER}: query-aware, partition-enhancement for large, heterogenous graphs}, issn = {1573-7578}, url = {http://dx.doi.org/10.1007/s10619-017-7196-y}, doi = {10.1007/s10619-017-7196-y}, abstract = {Graph partitioning has long been seen as a viable approach to addressing Graph DBMS scalability. A partitioning, however, may introduce extra query processing latency unless it is sensitive to a specific query workload, and optimised to minimise inter-partition traversals for that workload. Additionally, it should also be possible to incrementally adjust the partitioning in reaction to changes in the graph topology, the query workload, or both. Because of their complexity, current partitioning algorithms fall short of one or both of these requirements, as they are designed for offline use and as one-off operations. The TAPER system aims to address both requirements, whilst leveraging existing partitioning algorithms. TAPER takes any given initial partitioning as a starting point, and iteratively adjusts it by swapping chosen vertices across partitions, heuristically reducing the probability of inter-partition traversals for a given path queries workload. Iterations are inexpensive thanks to time and space optimisations in the underlying support data structures. We evaluate TAPER on two different large test graphs and over realistic query workloads. Our results indicate that, given a hash-based partitioning, TAPER reduces the number of inter-partition traversals by ∼80\%; given an unweighted Metis partitioning, by ∼30\%. These reductions are achieved within eight iterations and with the additional advantage of being workload-aware and usable online.}, journal = {Distributed and Parallel Databases}, author = {Firth, Hugo and Missier, Paolo}, year = {2017}, pages = {1--31}, }
@inproceedings{zhang_revealing_2017, address = {Edinburgh, Scotland, UK}, title = {Revealing the {Detailed} {Lineage} of {Script} {Outputs} using {Hybrid} {Provenance}}, abstract = {We illustrate how combining retrospective and prospective provenance can yield scientifically meaningful hybrid provenance representations of the computational histories of data produced during a script run. We use scripts from multiple disciplines (astrophysics, climate science, biodiversity data curation, and social network analysis), implemented in Python, R, and MATLAB, to highlight the usefulness of diverse forms of retrospective provenance when coupled with prospective provenance. Users provide prospective provenance (i.e., the conceptual workflows latent in scripts) via simple YesWorkflow annotations, embedded as script comments. Runtime observables, hidden in filenames or folder structures, recorded in log-files, or automatically captured using tools such as noWorkflow or the DataONE RunManagers can be linked to prospective provenance via relational views and queries. The YesWorkflow toolkit, example scripts, and demonstration code are available via an open source repository.}, booktitle = {Procs. 11th {Intl}. {Digital} {Curation} {Conference} ({IDCC})}, publisher = {Digital Curation Center}, author = {Zhang, Qian and Cao, Yang and Wang, Qiwen and Vu, Duc and Thavasimani, Priyaa and McPhillips, Tim and Missier, Paolo and Slaughter, Peter and Jones, Christopher and Jones, Matthew B and Ludascher, Bertram}, year = {2017}, keywords = {\#provenance}, }
@inproceedings{thavasimani_facilitating_2016, title = {Facilitating reproducible research by investigating computational metadata}, url = {https://doi.org/10.1109/BigData.2016.7840958}, doi = {10.1109/BigData.2016.7840958}, booktitle = {2016 {IEEE} {International} {Conference} on {Big} {Data}, {BigData} 2016, {Washington} {DC}, {USA}, {December} 5-8, 2016}, author = {Thavasimani, Priyaa and Missier, Paolo}, year = {2016}, pages = {3045--3051}, }
@inproceedings{karsai_clustering_2016, address = {New York, NY, USA}, series = {{HILDA} '16}, title = {Clustering {Provenance} {Facilitating} {Provenance} {Exploration} {Through} {Data} {Abstraction}}, isbn = {978-1-4503-4207-0}, url = {http://doi.acm.org/10.1145/2939502.2939508}, doi = {10.1145/2939502.2939508}, booktitle = {Proceedings of the {Workshop} on {Human}-{In}-the-{Loop} {Data} {Analytics}}, publisher = {ACM}, author = {Karsai, Linus and Fekete, Alan and Kay, Judy and Missier, Paolo}, year = {2016}, keywords = {provenance, large-scale graphs, visualisation}, pages = {6:1--6:5}, }
@article{burgess_alan_2016, title = {Alan {Turing} {Institute} {Symposium} on {Reproducibility} for {Data}-{Intensive} {Research} – {Final} {Report}}, url = {https://dx.doi.org/10.6084/m9.figshare.3487382}, author = {Burgess, Lucie C and Crotty, David and de Roure, David and Gibbons, Jeremy and Goble, Carole and Missier, Paolo and Mortier, Richard and Nichols, Thomas E and O'Beirne, Richard}, year = {2016}, }
@incollection{missier_lifecycle_2016, title = {The lifecycle of provenance metadata and its associated challenges and opportunities}, volume = {Springer}, url = {http://arxiv.org/abs/1605.01229}, abstract = {This chapter outlines some of the challenges and opportunities associated with adopting provenance principles [CFLV12] and stan- dards [MGC+15] in a variety of disciplines, including data publication and reuse, and information sciences.}, booktitle = {Building {Trust} in {Financial} {Information} - {Perspectives} on the {Frontiers} of {Provenance}.}, publisher = {Springer}, author = {Missier, Paolo}, editor = {Lemieux, Victoria}, year = {2016}, keywords = {\#provenance}, }
@inproceedings{missier_data_2016, address = {Washington D.C., USA}, title = {The data, they are a-changin'}, url = {https://arxiv.org/abs/1604.06412}, abstract = {The cost of deriving actionable knowledge from large datasets has been decreasing thanks to a convergence of positive factors: low cost data generation, inexpensively scalable storage and processing infrastructure (cloud), software frameworks and tools for massively distributed data processing, and parallelisable data analytics algorithms. One observation that is often overlooked, however, is that each of these elements is not immutable, rather they all evolve over time. This suggests that the value of such derivative knowledge may decay over time, unless it is preserved by reacting to those changes. Our broad research goal is to develop models, methods, and tools for selectively reacting to changes by balancing costs and benefits, i.e. through complete or partial re-computation of some of the underlying processes. In this paper we present an initial model for reasoning about change and re-computations, and show how analysis of detailed provenance of derived knowledge informs re-computation decisions. We illustrate the main ideas through a real-world case study in genomics, namely on the interpretation of human variants in support of genetic diagnosis.}, booktitle = {Proc. {TAPP}'16 ({Theory} and {Practice} of {Provenance})}, publisher = {USENIX Association}, author = {Missier, Paolo and Cala, Jacek and Wijaya, Eldarina}, editor = {Cohen-Boulakia, Sarah}, year = {2016}, keywords = {\#provenance, \#re-computation, \#big data processing, \#data change}, }
@inproceedings{oliveira_analyzing_2016, address = {Washington D.C., USA}, title = {Analyzing {Provenance} across {Heterogeneous} {Provenance} {Graphs}}, abstract = {Provenance generated by different workflow systems is generally expressed using different formats. This is not an issue when scientists analyze provenance graphs in isolation, or when they use the same workflow system. However, analyzing heterogeneous provenance graphs from multiple systems poses a challenge. To address this problem we adopt ProvONE as an integration model, and show how different provenance databases can be converted to a global ProvONE schema. Scientists can then query this integrated database, exploring and linking provenance across several different workflows that may represent different implementations of the same experiment. To illustrate the feasibility of our approach, we developed conceptual mappings between the provenance databases of two workflow systems (e-Science Central and SciCumulus). We provide cartridges that implement these mappings and generate an integrated provenance database expressed as Prolog facts. To demonstrate its usage, we have developed Prolog rules that enable scientists to query the integrated database.}, booktitle = {Procs. {IPAW} 2016}, publisher = {Springer}, author = {Oliveira, Wellington and Missier, Paolo and Ocana, Kary and de Oliveira, Daniel and Braganholo, Vanessa}, year = {2016}, keywords = {\#provenance}, }
@inproceedings{missier_tracking_2016, address = {Lugano, Switzerland}, title = {Tracking {Dengue} {Epidemics} using {Twitter} {Content} {Classification} and {Topic} {Modelling}}, url = {http://arxiv.org/abs/1605.00968}, abstract = {Detecting and preventing outbreaks of mosquito-borne diseases such as Dengue and Zika in Brasil and other tropical regions has long been a priority for governments in affected areas. Streaming social media content, such as Twitter, is increasingly being used for health vigilance applications such as flu detection. However, previous work has not addressed the complexity of drastic seasonal changes on Twitter across multiple epidemic outbreaks. In order to address this gap, this paper contrasts two complementary approaches to detecting Twitter content that is relevant for Dengue outbreak detection, namely supervised classification and unsupervised clustering using topic modelling. Each approach has benefits and shortcomings. Our classifier achieves a prediction accuracy of about 80\% based on a small training set of about 1,000 instances, but the need for manual annotation makes it hard to track seasonal changes in the nature of the epidemics, such as the emergence of new types of virus in certain geographical locations. In contrast, LDA-based topic modelling scales well, generating cohesive and well-separated clusters from larger samples. While clusters can be easily re-generated following changes in epidemics, however, this approach makes it hard to clearly segregate relevant tweets into well-defined clusters.}, booktitle = {Procs. {SoWeMine} workshop, co-located with {ICWE} 2016}, author = {Missier, Paolo and Romanovsky, A and Miu, T and Pal, A and Daniilakis, M and Garcia, A and Cedrim, D and Sousa, L}, year = {2016}, keywords = {\#social media analytics, \#twitter analytics}, }
@inproceedings{firth_workload-aware_2016, address = {Bordeaux}, title = {Workload-aware streaming graph partitioning}, booktitle = {Procs. {GraphQ} {Workshop}, co-located with {EDBT}'16}, author = {Firth, Hugo and Missier, Paolo}, year = {2016}, }
@article{missier_data_2016-1, title = {Data trajectories: tracking reuse of published data for transitive credit attribution}, volume = {11}, url = {http://bibbase.org/network/publication/missier-datatrajectoriestrackingreuseofpublisheddatafortransitivecreditattribution-2016}, doi = {doi:10.2218/ijdc.v11i1.425}, abstract = {The ability to measure the use and impact of published data sets is key to the success of the open data / open science paradigm. A direct measure of impact would require tracking data (re)use in the wild, which however is difficult to achieve. This is therefore commonly replaced by simpler metrics based on data download and citation counts. In this paper we describe a scenario where it is possible to track the trajectory of a dataset after its publication, and we show how this enables the design of accurate models for ascribing credit to data originators. A Data Trajectory (DT) is a graph that encodes knowledge of how, by whom, and in which context data has been re-used, possibly after several generations. We provide a theoretical model of DTs that is grounded in the W3C PROV data model for provenance, and we show how DTs can be used to automatically propagate a fraction of the credit associated with transitively derived datasets, back to original data contributors. We also show this model of transitive credit in action by means of a Data Reuse Simulator. Ultimately, our hope is that, in the longer term, credit models based on direct measures of data reuse will provide further incentives to data publication. We conclude by outlining a research agenda to address the hard questions of creating, collecting, and using DTs systematically across a large number of data reuse instances, in the wild.}, number = {1}, journal = {International Journal of Digital Curation}, author = {Missier, Paolo}, year = {2016}, keywords = {provenance, data reuse, data trajectories}, pages = {1--16}, }
@article{cala_scalable_2016, title = {Scalable and {Efficient} {Whole}-exome {Data} {Processing} {Using} {Workflows} on the {Cloud}}, volume = {In press}, abstract = {Dataflow-style workflows offer a simple, high-level programming model for flexible prototyping of scientific applications as an attractive alternative to low-level scripting. At the same time, workflow management systems (WFMS) may support data parallelism over big datasets by providing scalable, distributed deployment and execution of the workflow over a cloud infrastructure. In theory, the combination of these properties makes workflows a natural choice for implementing Big Data processing pipelines, common for instance in bioinformatics. In practice, however, correct workflow design for parallel Big Data problems can be complex and very time-consuming. In this paper we present our experience in porting a genomics data processing pipeline from an existing scripted implementation deployed on a closed HPC cluster, to a workflow-based design deployed on the Microsoft Azure public cloud. We draw two contrasting and general conclusions from this project. On the positive side, we show that our solution based on the e-Science Central WFMS and deployed in the cloud clearly outperforms the original HPC-based implementation achieving up to 2.3x speed-up. However, in order to deliver such performance we describe the importance of optimising the workflow deployment model to best suit the characteristics of the cloud computing infrastructure. The main reason for the performance gains was the availability of fast, node-local SSD disks delivered by D-series Azure VMs combined with the implicit use of local disk resources by e-Science Central workflow engines. These conclusions suggest that, on parallel Big Data problems, it is important to couple understanding of the cloud computing architecture and its software stack with simplicity of design, and that further efforts in automating parallelisation of complex pipelines are required.}, number = {Special Issue: Big Data in the Cloud - Best paper award at the FGCS forum 2016}, journal = {Future Generation Computer Systems}, author = {Cala, Jacek and Marei, Eyad and Yu, Yaobo and Takeda, Kenji and Missier, Paolo}, year = {2016}, keywords = {workflow, Performance analysis, Cloud computing, HPC, Whole-exome sequencing, Workflow-based application, cloud, genomics}, }
@inproceedings{miu_bootstrapping_2015, title = {Bootstrapping {Personalised} {Human} {Activity} {Recognition} {Models} {Using} {Online} {Active} {Learning}}, booktitle = {Proceedings of the 14th {IEEE} {International} {Conference} on {Ubiquitous} {Computing} and {Communications}}, author = {Miu, T. and Missier, P. and Plötz, T.}, year = {2015}, }
@inproceedings{missier_svi_2015, address = {Los Angeles, CA}, title = {{SVI}: a simple single-nucleotide {Human} {Variant} {Interpretation} tool for {Clinical} {Use}}, booktitle = {Procs. 11th {International} conference on {Data} {Integration} in the {Life} {Sciences}}, publisher = {Springer}, author = {Missier, Paolo and Wijaya, Eldarina and Kirby, Ryan and Keogh, Michael}, year = {2015}, keywords = {\#NGS, \#variant interpretation}, }
@article{danger_access_2015, title = {Access control and view generation for provenance graphs}, volume = {49}, issn = {0167739X}, url = {http://www.sciencedirect.com/science/article/pii/S0167739X1500031X}, doi = {10.1016/j.future.2015.01.014}, abstract = {Data provenance refers to the knowledge about data sources and operations carried out to obtain some piece of data. A provenance-enabled system maintains record of the interoperation of processes across different modules, stages and authorities to capture the full lineage of the resulting data, and typically allows data-focused audits using semantic technologies, such as ontologies, that capture domain knowledge. However, regulating access to captured provenance data is a non-trivial problem, since execution records form complex, overlapping graphs with individual nodes possibly being subject to different access policies. Applying traditional access control to provenance queries can either hide from the user the entire graph with nodes that had access to them denied, reveal too much information, or return a semantically invalid graph. An alternative approach is to answer queries with a new graph that abstracts over the missing nodes and fragments. In this paper, we present TACLP, an access control language for provenance data that supports this approach, together with an algorithm that transforms graphs according to sets of access restrictions. The algorithm produces safe and valid provenance graphs that retain the maximum amount of information allowed by the security model. The approach is demonstrated on an example of restricting access to a clinical trial provenance trace.}, journal = {Future Generation Computer Systems}, author = {Danger, Roxana and Curcin, Vasa and Missier, Paolo and Bryans, Jeremy}, month = feb, year = {2015}, keywords = {Provenance, Access Control Language, Semantic Web}, pages = {8--27}, }
@article{hidders_recent_2015, title = {Recent advances in {Scalable} {Workflow} {Enactment} {Engines} and {Technologies}}, volume = {46}, issn = {0167739X}, url = {http://www.sciencedirect.com/science/article/pii/S0167739X15000047}, doi = {10.1016/j.future.2015.01.003}, journal = {Future Generation Computer Systems}, author = {Hidders, Jan and Missier, Paolo and Sroka, Jacek}, month = may, year = {2015}, pages = {1--2}, }
@inproceedings{miu_strategies_2014, address = {New York, NY, USA}, series = {{UbiComp} '14 {Adjunct}}, title = {On {Strategies} for {Budget}-based {Online} {Annotation} in {Human} {Activity} {Recognition}}, isbn = {978-1-4503-3047-3}, url = {http://doi.acm.org/10.1145/2638728.2641300}, doi = {10.1145/2638728.2641300}, booktitle = {Proceedings of the 2014 {ACM} {International} {Joint} {Conference} on {Pervasive} and {Ubiquitous} {Computing}: {Adjunct} {Publication}}, publisher = {ACM}, author = {Miu, Tudor and Plötz, Thomas and Missier, Paolo and Roggen, Daniel}, year = {2014}, keywords = {activity recognition, online learning, budget-based annotation}, pages = {767--776}, }
@incollection{embury_forget_2014, series = {Synthese {Library}}, title = {Forget {Dimensions}: {Define} {Your} {Information} {Quality} {Using} {Quality} {View} {Patterns}}, volume = {358}, isbn = {978-3-319-07120-6}, url = {http://dx.doi.org/10.1007/978-3-319-07121-3_3}, language = {English}, booktitle = {The {Philosophy} of {Information} {Quality} {SE} - 3}, publisher = {Springer International Publishing}, author = {Embury, Suzanne M. and Missier, Paolo}, editor = {Floridi, Luciano and Illari, Phyllis}, year = {2014}, doi = {10.1007/978-3-319-07121-3_3}, keywords = {\#information quality}, pages = {25--41}, }
@inproceedings{garcia-constantino_measuring_2014, title = {Measuring the impact of cognitive distractions on driving performance using time series analysis}, url = {http://arxiv.org/abs/1408.5573}, abstract = {Using current sensing technology, a wealth of data on driving sessions is potentially available through a combination of vehicle sensors and drivers' physiology sensors (heart rate, breathing rate, skin temperature, etc.). Our hypothesis is that it should be possible to exploit the combination of time series produced by such multiple sensors during a driving session, in order to (i) learn models of normal driving behaviour, and (ii) use such models to detect important and potentially dangerous deviations from the norm in real-time, and thus enable the generation of appropriate alerts. Crucially, we believe that such models and interventions should and can be personalised and tailor-made for each individual driver. As an initial step towards this goal, in this paper we present techniques for assessing the impact of cognitive distraction on drivers, based on simple time series analysis. We have tested our method on a rich dataset of driving sessions, carried out in a professional simulator, involving a panel of volunteer drivers. Each session included a different type of cognitive distraction, and resulted in multiple time series from a variety of on-board sensors as well as sensors worn by the driver. Crucially, each driver also recorded an initial session with no distractions. In our model, such initial session provides the baseline time series that make it possible to quantitatively assess driver performance under distraction conditions.}, booktitle = {Procs. {IEEE} conference on {Intelligent} {Transport} {Systems} ({ITSC}'14)}, author = {Garcia-Constantino, Matias and Missier, Paolo and Blythe, Phil and Guo, Amy Weihong}, month = aug, year = {2014}, keywords = {\#ITS}, }
@article{mearns_tweet_2014, title = {Tweet {My} {Street}: {A} {Cross}-{Disciplinary} {Collaboration} for the {Analysis} of {Local} {Twitter} {Data}}, volume = {6}, issn = {1999-5903}, url = {http://www.mdpi.com/1999-5903/6/2/378}, doi = {10.3390/fi6020378}, number = {2}, journal = {Future Internet}, author = {Mearns, Graeme and Simmonds, Rebecca and Richardson, Ranald and Turner, Mark and Watson, Paul and Missier, Paolo}, year = {2014}, pages = {378--396}, }
@inproceedings{firth_provgen_2014, address = {Köln, Germany}, title = {{ProvGen}: generating synthetic {PROV} graphs with predictable structure}, booktitle = {Procs. {IPAW} 2014 ({Provenance} and {Annotations})}, publisher = {Springer}, author = {Firth, Hugo and Missier, Paolo}, year = {2014}, }
@inproceedings{missier_provabs_2014, address = {Köln, Germany}, title = {{ProvAbs}: model, policy, and tooling for abstracting {PROV} graphs}, booktitle = {Procs. {IPAW} 2014 ({Provenance} and {Annotations})}, publisher = {Springer}, author = {Missier, Paolo and Bryans, Jeremy and Gamble, Carl and Curcin, Vasa and Danger, Roxana}, year = {2014}, }
@inproceedings{cala_scripted_2014, address = {Chicago, IL}, title = {From scripted {HPC}-based {NGS} pipelines to workflows on the cloud}, booktitle = {Procs. {C4Bio} workshop, co-located with the 2014 {CCGrid} conference}, publisher = {IEEE}, author = {Cala, Jacek and Xu, Yaobo and Wijaya, Eldarina Azfar and Missier, Paolo}, year = {2014}, keywords = {workflow, scientific workflows, NGS, pipeline}, }
@inproceedings{cuevas-vicenttin_pbase_2014, address = {San Francisco, CA, USA}, title = {The {PBase} {Scientific} {Workflow} {Provenance} {Repository}}, booktitle = {Procs. 9th {International} {Digital} {Curation} {Conference}}, author = {Cuevas-Vicenttín, Víctor and Kianmajd, Parisa and Ludäscher, Bertram and Missier, Paolo and Chirigati, Fernando and Wei, Yaxing and Koop, David and Dey, Saumen}, year = {2014}, keywords = {\#provenance, \#workflow, \#DataONE}, }
@article{cohen-boulakia_distilling_2014, title = {Distilling structure in {Taverna} scientific workflows: a refactoring approach}, volume = {15}, issn = {1471-2105}, url = {http://www.biomedcentral.com/1471-2105/15/S1/S12}, doi = {10.1186/1471-2105-15-S1-S12}, abstract = {BACKGROUND:Scientific workflows management systems are increasingly used to specify and manage bioinformatics experiments. Their programming model appeals to bioinformaticians, who can use them to easily specify complex data processing pipelines. Such a model is underpinned by a graph structure, where nodes represent bioinformatics tasks and links represent the dataflow. The complexity of such graph structures is increasing over time, with possible impacts on scientific workflows reuse. In this work, we propose effective methods for workflow design, with a focus on the Taverna model. We argue that one of the contributing factors for the difficulties in reuse is the presence of "anti-patterns", a term broadly used in program design, to indicate the use of idiomatic forms that lead to over-complicated design. The main contribution of this work is a method for automatically detecting such anti-patterns, and replacing them with different patterns which result in a reduction in the workflow's overall structural complexity. Rewriting workflows in this way will be beneficial both in terms of user experience (easier design and maintenance), and in terms of operational efficiency (easier to manage, and sometimes to exploit the latent parallelism amongst the tasks).RESULTS:We have conducted a thorough study of the workflows structures available in Taverna, with the aim of finding out workflow fragments whose structure could be made simpler without altering the workflow semantics. We provide four contributions. Firstly, we identify a set of anti-patterns that contribute to the structural workflow complexity. Secondly, we design a series of refactoring transformations to replace each anti-pattern by a new semantically-equivalent pattern with less redundancy and simplified structure. Thirdly, we introduce a distilling algorithm that takes in a workflow and produces a distilled semantically-equivalent workflow. Lastly, we provide an implementation of our refactoring approach that we evaluate on both the public Taverna workflows and on a private collection of workflows from the BioVel project.CONCLUSION:We have designed and implemented an approach to improving workflow structure by way of rewriting preserving workflow semantics. Future work includes considering our refactoring approach during the phase of workflow design and proposing guidelines for designing distilled workflows.}, number = {Suppl 1}, journal = {BMC Bioinformatics}, author = {Cohen-Boulakia, Sarah and Chen, Jiuqiang and Missier, Paolo and Goble, Carole and Williams, Alan and Froidevaux, Christine}, year = {2014}, keywords = {\#workflow, \#taverna}, pages = {S12}, }
@inproceedings{missier_extracting_2013, title = {Extracting {PROV} provenance traces from {Wikipedia} history pages}, booktitle = {{EDBT}/{ICDT} {Workshops}}, author = {Missier, Paolo and Chen, Ziyu}, year = {2013}, pages = {327--330}, }
@book{hidders_fundamenta_2013, title = {Fundamenta {Informaticae} – {Special} issue on {Scalable} {Workflow} {Enactment} {Engines} and {Technology}}, volume = {128}, url = {http://iospress.metapress.com/content/n8802x1448hr/?p=c2c17be2c8c64e1195aaa3c93db188c6&pi=1}, number = {3}, publisher = {IOS Press}, editor = {Hidders, Jan and Missier, Paolo and Sroka, Jacek}, year = {2013}, keywords = {\#workflow, \#cloud}, }
@article{missier_provenance_2013, title = {Provenance and data differencing for workflow reproducibility analysis}, issn = {1532-0634}, url = {http://dx.doi.org/10.1002/cpe.3035}, doi = {10.1002/cpe.3035}, abstract = {One of the foundations of science is that researchers must publish the methodology used to achieve their results so that others can attempt to reproduce them. This has the added benefit of allowing methods to be adopted and adapted for other purposes. In the field of e-Science, services – often choreographed through workflows – process data to generate results. The reproduction of results is often not straightforward as the computational objects may not be made available or may have been updated since the results were generated. For example, services are often updated to fix bugs or improve algorithms. This paper addresses these problems in three ways. Firstly, it introduces a new framework to clarify the range of meanings of ‘reproducibility’. Secondly, it describes a new algorithm, PDIFF, that uses a comparison of workflow provenance traces to determine whether an experiment has been reproduced; the main innovation is that if this is not the case then the specific point(s) of divergence are identified through graph analysis, assisting any researcher wishing to understand those differences. One key feature is support for user-defined, semantic data comparison operators. Finally, the paper describes an implementation of PDIFF that leverages the power of the e-Science Central platform that enacts workflows in the cloud. As well as automatically generating a provenance trace for consumption by PDIFF, the platform supports the storage and reuse of old versions of workflows, data and services; the paper shows how this can be powerfully exploited to achieve reproduction and reuse. Copyright © 2013 John Wiley \& Sons, Ltd.}, journal = {Concurrency and Computation: Practice and Experience}, author = {Missier, Paolo and Woodman, Simon and Hiden, Hugo and Watson, Paul}, year = {2013}, keywords = {provenance, reproducibility, e-science, scientific workflow}, }
@inproceedings{missier_w3c_2013, address = {Genova, Italy}, title = {The {W3C} {PROV} family of specifications for modelling provenance metadata}, url = {http://www.edbt.org/Proceedings/2013-Genova/papers/edbt/a80-missier.pdf}, abstract = {Provenance, a form of structured metadata designed to record the origin or source of information, can be instrumental in deciding whether information is to be trusted, how it can be integrated with other diverse information sources, and how to establish attribution of information to authors throughout its history. The PROV set of specifications, produced by the World Wide Web Consortium (W3C), is designed to promote the publication of provenance information on the Web, and offers a basis for interoperability across diverse provenance management systems. The PROV provenance model is deliberately generic and domain-agnostic, but extension mechanisms are available and can be exploited for modelling specific domains. This tutorial provides an account of these specifications. Starting from intuitive and informal examples that present idiomatic provenance patterns, it progressively introduces the relational model of provenance along with the constraints model for validation of provenance documents, and concludes with example applications that show the extension points in use.}, booktitle = {Procs. {EDBT}'13 ({Tutorial})}, publisher = {ACM}, author = {Missier, Pa