BibBase https://bibbase.org/f/MTSG9SdhWPisKNpZX/MyPublications-bibbase.bib

2025 (3)

Building ADMISSION – A research collaborative to transform understanding of multiple long-term conditions for people admitted to hospital. Witham, M. D; Bartle, V.; Bellass, S.; Bunn, J. G; Cartner, D.; Cordell, H. J; Doal, R.; Evison, F.; Gallier, S.; Harris, S.; Hillman, S. J; Holding, R.; Leroux, P.; Marshall, T.; Matthews, F. E; Missier, P.; Nair, A.; Osman, M.; Pearson, E. R; Plummer, C.; Pretorius, S.; Richardson, S. J; Robinson, S. M; Sapey, E.; Scharf, T.; Shah, R.; Shahmandi, M.; Singer, M.; Suklan, J.; Wason, J. M.; Cooper, R.; and Sayer, A. A Journal of Multimorbidity and Comorbidity, 15: 26335565251317940. 2025.

Building ADMISSION – A research collaborative to transform understanding of multiple long-term conditions for people admitted to hospital [link]

Paper doi link bibtex abstract

@article{doi:10.1177/26335565251317940,
author = {Miles D Witham and Victoria Bartle and Sue Bellass and Jonathan G Bunn and Duncan Cartner and Heather J Cordell and Rominique Doal and Felicity Evison and Suzy Gallier and Steve Harris and Susan J Hillman and Ray Holding and Peta Leroux and Tom Marshall and Fiona E Matthews and Paolo Missier and Anand Nair and Mo Osman and Ewan R Pearson and Chris Plummer and Sara Pretorius and Sarah J Richardson and Sian M Robinson and Elizabeth Sapey and Thomas Scharf and Rupal Shah and Marzieh Shahmandi and Mervyn Singer and Jana Suklan and James MS Wason and Rachel Cooper and Avan A Sayer},
title ={Building ADMISSION – A research collaborative to transform understanding of multiple long-term conditions for people admitted to hospital},

journal = {Journal of Multimorbidity and Comorbidity},
volume = {15},
number = {},
pages = {26335565251317940},
year = {2025},
doi = {10.1177/26335565251317940},

URL = { 
    
        https://doi.org/10.1177/26335565251317940
    
    

},
eprint = { 
    
        https://doi.org/10.1177/26335565251317940
    
    

}
,
    abstract = { Background Multiple long-term conditions (MLTCs; commonly referred to as multimorbidity) are highly prevalent among people admitted to hospital and are therefore of critical importance to hospital-based healthcare systems. To date, most research on MLTCs has been conducted in primary care or the general population with comparatively little work undertaken in the hospital setting.Purpose To describe the rationale and content of ADMISSION: a four-year UK Research and Innovation and National Institute of Health and Care Research funded interdisciplinary programme that seeks, in partnership with public contributors, to transform care for people living with MLTCs admitted to hospital.Research design Based across five UK academic centres, ADMISSION combines expertise in clinical medicine, epidemiology, informatics, computing, biostatistics, social science, genetics and care pathway mapping to examine patterns of conditions, mechanisms, consequences and pathways of care for people with MLTCs admitted to hospital.Data collection The programme uses routinely collected electronic health record data from large UK teaching hospitals, population-based cohort data from UK Biobank and routinely collected blood samples from The Scottish Health Research Register and Biobank (SHARE). These approaches are complemented by focused qualitative work exploring the perspectives of healthcare professionals and the lived experience of people with MLTCs admitted to hospital.Conclusion ADMISSION will provide the necessary foundations to develop novel ways to prevent and treat MLTCs and their consequences in people admitted to hospital and to improve care systems and the quality of care for this underserved group. }
}

PROLIT: Supporting the Transparency of Data Preparation Pipelines through Narratives over Data Provenance. Lazzaro, P. L.; Lazzaro, M.; Missier, P.; and Torlone, R. In Procs. EDBT (Demo track), Barcelona, Spain, 2025. OpenProceedings.org

PROLIT: Supporting the Transparency of Data Preparation Pipelines through Narratives over Data Provenance [pdf]

Paper doi link bibtex abstract 1 download

@inproceedings{lazzaro_prolit_2025,
	address = {Barcelona, Spain},
	title = {{PROLIT}: {Supporting} the {Transparency} of {Data} {Preparation} {Pipelines} through {Narratives} over {Data} {Provenance}},
	shorttitle = {{PROLIT}},
	url = {https://openproceedings.org/2025/conf/edbt/paper-336.pdf},
	doi = {10.48786/EDBT.2025.108},
	abstract = {Establishing trust in the models is a long-standing objective in Machine Learning and AI. Information on how data are manipulated before being used for training is instrumental in such understanding, and data provenance can be used to organize and navigate such information. The PROLIT system described in this demo paper is designed to collect, manage, and query the provenance of data as it flows through data preparation pipelines in support of data science analytics and machine learning modeling. PROLIT extends our prior work on transparently collecting data provenance in several directions. Most notably, it employs a LLM to: (i) automatically rewrite user-defined pipelines in a format suitable for this activity, (ii) segment the code to precisely associate provenance to each code snippet, and (iii) provide human-readable descriptions of each snippet that in turn can be used to generate provenance narratives. The demo will showcase these capabilities and offer the opportunity to interact with PROLIT on user-defined as well as pre-defined Python code where dataframes are used as the common data abstraction.},
	language = {en},
	urldate = {2025-04-16},
	booktitle = {Procs. {EDBT} ({Demo} track)},
	publisher = {OpenProceedings.org},
	author = {Lazzaro, Pasquale Leonardo and Lazzaro, Marialaura and Missier, Paolo and Torlone, Riccardo},
	year = {2025},
	keywords = {Database Technology},
	file = {Lazzaro et al. - 2025 - PROLIT Supporting the Transparency of Data Prepar.pdf:/Users/npm65/Zotero/storage/2MABHSSL/Lazzaro et al. - 2025 - PROLIT Supporting the Transparency of Data Prepar.pdf:application/pdf},
}

Stacked Generalization for Overlapping Asymmetric Datasets. McTeer, M.; and Missier, P. In Ordonez, C.; Sperlì, Giancarlo; Masciari, E.; and Bellatreche, L., editor(s), Model and Data Engineering, pages 38–52, Cham, 2025. Springer Nature Switzerland
link bibtex abstract

@InProceedings{10.1007/978-3-031-87719-3_3,
author="McTeer, Matthew
and Missier, Paolo",
editor="Ordonez, Carlos
and Sperl{\`i}, Giancarlo
and Masciari, Elio
and Bellatreche, Ladjel",
title="Stacked Generalization for Overlapping Asymmetric Datasets",
booktitle="Model and Data Engineering",
year="2025",
publisher="Springer Nature Switzerland",
address="Cham",
pages="38--52",
abstract="In the context of training sets for Machine Learning, we use the term Overlapping Asymmetric Datasets (OADs) to refer to a combination of data shapes where a large number of observations (Vertical data, {\$}{\$}{\backslash}mathcal {\{}V{\}}{\$}{\$}V) are described using only few features (x), and a small subset of the observations (Horizontal data, {\$}{\$}{\backslash}mathcal {\{}H{\}}{\$}{\$}H) are described by a larger number of features (x plus some new z). A common example of such a combination is a healthcare dataset where the majority of patients are described using a baseline set of clinical and socio-demographic features, and a handful of those patients have a richer characterisation, having undergone further testing . Given a classification task, a model trained solely on {\$}{\$}{\backslash}mathcal {\{}H{\}}{\$}{\$}Hwill benefit from the many features, but its performance will be limited by a small training set size . In this paper we study the problem of maximising model performance on {\$}{\$}{\backslash}mathcal {\{}H{\}}{\$}{\$}H, by leveraging the additional information available from {\$}{\$}{\backslash}mathcal {\{}V{\}}{\$}{\$}V. Our approach is based on the notions of stacked generalization and meta-learning, where the predictions generated by an ensemble of weak classifiers for {\$}{\$}{\backslash}mathcal {\{}V{\}}{\$}{\$}Vare fed into a second-tier meta-learner, where the z features are also used. We conduct extensive experiments to explore the benefits of this approach over a range of dataset configurations. The results suggest that stacking improves model performance, while using z features only provides modest improvements. This may have practical implications as it suggests that in some settings, the effort involved in acquiring the additional z features is not always justified.",
isbn="978-3-031-87719-3"
}

2024 (10)

Experience: A Comparative Analysis of Multivariate Time-Series Generative Models: A Case Study on Human Activity Data. Alzahrani, N.; Cała, J.; and Missier, P. J. Data and Information Quality, 16(3). October 2024.

Experience: A Comparative Analysis of Multivariate Time-Series Generative Models: A Case Study on Human Activity Data [link]

Paper doi link bibtex abstract

@article{10.1145/3688393,
author = {Alzahrani, Naif and Ca\l{}a, Jacek and Missier, Paolo},
title = {Experience: A Comparative Analysis of Multivariate Time-Series Generative Models: A Case Study on Human Activity Data},
year = {2024},
issue_date = {September 2024},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {16},
number = {3},
issn = {1936-1955},
url = {https://doi.org/10.1145/3688393},
doi = {10.1145/3688393},
abstract = {Human activity recognition (HAR) is an active research field that has seen great success in recent years due to advances in sensory data collection methods and activity recognition systems. Deep artificial intelligence (AI) models have contributed to the success of HAR systems lately, although still suffering from limitations such as data scarcity, the high costs of labelling data instances, and datasets’ imbalance and bias. The temporal nature of human activity data, represented as time series data, impose an additional challenge to using AI models in HAR, because most state-of-the-art models do not account for the time component of the data instances. These limitations have inspired the time-series research community to design generative models for sequential data, but very little work has been done to evaluate the quality of such models. In this work, we conduct a comparative quality analysis of three generative models for time-series data, using a case study in which we aim to generate sensory human activity data from a seed public dataset. Additionally, we adapt and clearly explain four evaluation methods of synthetic time-series data from the literature and apply them to assess the quality of the synthetic activity data we generate. We show experimentally that high-quality human activity data can be generated using deep generative models, and the synthetic data can thus be used in HAR systems to augment real activity data. We also demonstrate that the chosen evaluation methods effectively ensure that the generated data meets the essential quality benchmarks of realism, diversity, coherence, and utility. Our findings suggest that using deep generative models to produce synthetic human activity data can potentially address challenges related to data scarcity, biases, and expensive labeling. This holds promise for enhancing the efficiency and reliability of HAR systems.},
journal = {J. Data and Information Quality},
month = oct,
articleno = {18},
numpages = {18},
keywords = {Human activity recognition, multivariate time series, generative modeling}
}

Probabilistic Modelling of Multiple Long-Term Condition Onset Times. Richards, K.; Fleetwood, K.; Prigge, R.; Missier, P.; Barnes, M.; Reynolds, N. J.; Guthrie, B.; and Seth, S. 2024.

Probabilistic Modelling of Multiple Long-Term Condition Onset Times [link]

Paper link bibtex

@misc{richards2024probabilisticmodellingmultiplelongterm,
      title={Probabilistic Modelling of Multiple Long-Term Condition Onset Times}, 
      author={Kieran Richards and Kelly Fleetwood and Regina Prigge and Paolo Missier and Michael Barnes and Nick J. Reynolds and Bruce Guthrie and Sohan Seth},
      year={2024},
      eprint={2412.07657},
      archivePrefix={arXiv},
      primaryClass={stat.AP},
      url={https://arxiv.org/abs/2412.07657}, 
}

bursty_dynamics: A Python Package for Exploring the Temporal Properties of Longitudinal Data. Angdembe, A.; Iqbal, W. A; Hamad, R. A.; Casement, J.; Consortium, A.; Missier, P.; Reynolds, N.; Henkin, R.; and Barnes, M. R 2024.

bursty_dynamics: A Python Package for Exploring the Temporal Properties of Longitudinal Data [link]

Paper link bibtex

@misc{angdembe2024burstydynamicspythonpackageexploring,
      title={bursty_dynamics: A Python Package for Exploring the Temporal Properties of Longitudinal Data}, 
      author={Alisha Angdembe and Wasim A Iqbal and Rebeen Ali Hamad and John Casement and AI-Multiply Consortium and Paolo Missier and Nick Reynolds and Rafael Henkin and Michael R Barnes},
      year={2024},
      eprint={2411.03210},
      archivePrefix={arXiv},
      primaryClass={q-bio.QM},
      url={https://arxiv.org/abs/2411.03210}, 
}

From why-provenance to why+provenance: Towards addressing deep data explanations in Data-Centric AI. Missier, P.; and Torlone, R. In Atzori, M.; Ciaccia, P.; Ceci, M.; and Mandreoli, F., editor(s), Proceedings of the 32nd Symposium on Advanced Database Systems, volume 3741, pages 508-517, Villasimius, Sardinia, Italy, June 2024. CEUR Workshop Proceedings

From why-provenance to why+provenance: Towards addressing deep data explanations in Data-Centric AI [pdf]

Paper link bibtex 4 downloads

@inproceedings{missier2024provenance,
  author =       {Missier, Paolo and Torlone, Riccardo},
  title =        {From why-provenance to why+provenance: Towards
                  addressing deep data explanations in {Data-Centric}
                  {AI}},
  booktitle =    {Proceedings of the 32nd {Symposium} on {Advanced}
                  {Database} {Systems}},
  year =         2024,
  editor =       {Atzori, Maurizio and Ciaccia, Paolo and Ceci,
                  Michelangelo and Mandreoli, Federica},
  volume =       3741,
  pages =        {508-517},
  month =        jun,
  address =      {Villasimius, Sardinia, Italy},
  publisher =    {CEUR Workshop Proceedings},
  url =          {https://ceur-ws.org/Vol-3741/paper11.pdf},
}

Design and Development of a Provenance Capture Platform for Data Science. Gregori, L.; Missier, P.; Stidolph, M.; Torlone, r.; and Wood, A. In Procs. 3rd DATAPLAT workshop, co-located with ICDE 2024, Utrecht, NL, May 2024. IEEE

Design and Development of a Provenance Capture Platform for Data Science [link]

Paper link bibtex abstract 3 downloads

@inproceedings{gregori_design_2024,
	address = {Utrecht, NL},
	title = {Design and {Development} of a {Provenance} {Capture} {Platform} for {Data} {Science}},
	abstract = {As machine learning and AI systems become more prevalent, understanding how their decisions are made is key to maintaining their trust. To solve this problem, it is widely accepted that fundamental support can be provided by the knowledge of how data are altered in the pre-processing phase, using data provenance to track such changes.
This paper focuses on the design and development of a system for collecting and managing data provenance of data preparation pipelines in data science. An investigation of publicly available machine learning pipelines is conducted to identify the most important features required for the tool to achieve impact on a broad selection of pre-processing data manipulation. This reveals that the operations that are used in practice can be implemented by combining a rather limited set of basic operators. We then illustrate and test implementation choices aimed at supporting the provenance capture for those operations efficiently and with minimal effort for data scientists.},
	booktitle = {Procs. 3rd {DATAPLAT} workshop, co-located with {ICDE} 2024},
	publisher = {IEEE},
	author = {Gregori, Luca and Missier, Paolo and Stidolph, Matthew and Torlone, riccardo and Wood, Alessandro},
	month = may,
	year = {2024},
	url={https://www.dropbox.com/scl/fi/plz8egd5wdvb5bp5vra09/840300a285.pdf?rlkey=gitqo6jzveh915g9fhbsqpqyn&st=8pk9vluh&dl=0}
}

Validity constraints for data analysis workflows. Schintke, F.; Belhajjame, K.; Mecquenem, N. D.; Frantz, D.; Guarino, V. E.; Hilbrich, M.; Lehmann, F.; Missier, P.; Sattler, R.; Sparka, J. A.; Speckhard, D. T.; Stolte, H.; Vu, A. D.; and Leser, U. Future Generation Computer Systems, 157: 82–97. 2024.

Validity constraints for data analysis workflows [link]

Paper doi link bibtex abstract 1 download

@article{schintke_validity_2024,
	title = {Validity constraints for data analysis workflows},
	volume = {157},
	issn = {0167-739X},
	url = {https://www.sciencedirect.com/science/article/pii/S0167739X24001079},
	doi = {https://doi.org/10.1016/j.future.2024.03.037},
	abstract = {Porting a scientific data analysis workflow (DAW) to a cluster infrastructure, a new software stack, or even only a new dataset with some notably different properties is often challenging. Despite the structured definition of the steps (tasks) and their interdependencies during a complex data analysis in the DAW specification, relevant assumptions may remain unspecified and implicit. Such hidden assumptions often lead to crashing tasks without a reasonable error message, poor performance in general, non-terminating executions, or silent wrong results of the DAW, to name only a few possible consequences. Searching for the causes of such errors and drawbacks in a distributed compute cluster managed by a complex infrastructure stack, where DAWs for large datasets typically are executed, can be tedious and time-consuming. We propose validity constraints (VCs) as a new concept for DAW languages to alleviate this situation. A VC is a constraint specifying logical conditions that must be fulfilled at certain times for DAW executions to be valid. When defined together with a DAW, VCs help to improve the portability, adaptability, and reusability of DAWs by making implicit assumptions explicit. Once specified, VCs can be controlled automatically by the DAW infrastructure, and violations can lead to meaningful error messages and graceful behavior (e.g., termination or invocation of repair mechanisms). We provide a broad list of possible VCs, classify them along multiple dimensions, and compare them to similar concepts one can find in related fields. We also provide a proof-of-concept implementation for the workflow system Nextflow.},
	journal = {Future Generation Computer Systems},
	author = {Schintke, Florian and Belhajjame, Khalid and Mecquenem, Ninon De and Frantz, David and Guarino, Vanessa Emanuela and Hilbrich, Marcus and Lehmann, Fabian and Missier, Paolo and Sattler, Rebecca and Sparka, Jan Arne and Speckhard, Daniel T. and Stolte, Hermann and Vu, Anh Duc and Leser, Ulf},
	year = {2024},
	keywords = {Dependability, Integrity and conformance checking, Scientific workflow systems, Validity constraints, Workflow specification languages},
	pages = {82--97},
}

How far back do we need to look to capture diagnoses in electronic health records? A retrospective observational study of hospital electronic health record data. Lewis, J.; Evison, F.; Doal, R.; Field, J.; Gallier, S.; Harris, S.; le Roux, P.; Osman, M.; Plummer, C.; Sapey, E.; Singer, M.; Sayer, A. A; and Witham, M. D BMJ Open, 14(2). 2024.

Paper doi link bibtex abstract

@article {Lewise080678,
	author = {Jadene Lewis and Felicity Evison and Rominique Doal and Joanne Field and Suzy Gallier and Steve Harris and Peta le Roux and Mohammed Osman and Chris Plummer and Elizabeth Sapey and Mervyn Singer and Avan A Sayer and Miles D Witham},
	editor = {, and , and Sayer, Avan A and Bartle, Victoria and Cooper, Rachel and Cordell, Heather J and Holding, Ray and Marshall, Tom and Matthews, Fiona E and Missier, Paolo and Pearson, Ewan and Plummer, Chris and Robinson, Sian and Sapey, Elizabeth and Singer, Mervyn and Scharf, Thomas and Wason, James and Witham, Miles D},
	title = {How far back do we need to look to capture diagnoses in electronic health records? A retrospective observational study of hospital electronic health record data},
	volume = {14},
	number = {2},
	elocation-id = {e080678},
	year = {2024},
	doi = {10.1136/bmjopen-2023-080678},
	publisher = {British Medical Journal Publishing Group},
	abstract = {Objectives Analysis of routinely collected electronic health data is a key tool for long-term condition research and practice for hospitalised patients. This requires accurate and complete ascertainment of a broad range of diagnoses, something not always recorded on an admission document at a single point in time. This study aimed to ascertain how far back in time electronic hospital records need to be interrogated to capture long-term condition diagnoses.Design Retrospective observational study of routinely collected hospital electronic health record data.Setting Queen Elizabeth Hospital Birmingham (UK)-linked data held by the PIONEER acute care data hub.Participants Patients whose first recorded admission for chronic obstructive pulmonary disease (COPD) exacerbation (n=560) or acute stroke (n=2142) was between January and December 2018 and who had a minimum of 10 years of data prior to the index date.Outcome measures We identified the most common International Classification of Diseases version 10-coded diagnoses received by patients with COPD and acute stroke separately. For each diagnosis, we derived the number of patients with the diagnosis recorded at least once over the full 10-year lookback period, and then compared this with shorter lookback periods from 1 year to 9 years prior to the index admission.Results Seven of the top 10 most common diagnoses in the COPD dataset reached \&gt;90\% completeness by 6 years of lookback. Atrial fibrillation and diabetes were \&gt;90\% coded with 2{\textendash}3 years of lookback, but hypertension and asthma completeness continued to rise all the way out to 10 years of lookback. For stroke, 4 of the top 10 reached 90\% completeness by 5 years of lookback; angina pectoris was \&gt;90\% coded at 7 years and previous transient ischaemic attack completeness continued to rise out to 10 years of lookback.Conclusion A 7-year lookback captures most, but not all, common diagnoses. Lookback duration should be tailored to the conditions being studied.Data may be obtained from a third party and are not publicly available. The data that support the findings of this study are not openly available due to reasons of sensitivity. Data may be accessed on request to the HDR-UK PIONEER acute data hub on provision of permission from the PIONEER Data Trust Committee and provision of a data access agreement. Data are located in controlled access data storage at the PIONEER acute data hub.},
	issn = {2044-6055},
	URL = {https://bmjopen.bmj.com/content/14/2/e080678},
	eprint = {https://bmjopen.bmj.com/content/14/2/e080678.full.pdf},
	journal = {BMJ Open}
}

Objectives Analysis of routinely collected electronic health data is a key tool for long-term condition research and practice for hospitalised patients. This requires accurate and complete ascertainment of a broad range of diagnoses, something not always recorded on an admission document at a single point in time. This study aimed to ascertain how far back in time electronic hospital records need to be interrogated to capture long-term condition diagnoses.Design Retrospective observational study of routinely collected hospital electronic health record data.Setting Queen Elizabeth Hospital Birmingham (UK)-linked data held by the PIONEER acute care data hub.Participants Patients whose first recorded admission for chronic obstructive pulmonary disease (COPD) exacerbation (n=560) or acute stroke (n=2142) was between January and December 2018 and who had a minimum of 10 years of data prior to the index date.Outcome measures We identified the most common International Classification of Diseases version 10-coded diagnoses received by patients with COPD and acute stroke separately. For each diagnosis, we derived the number of patients with the diagnosis recorded at least once over the full 10-year lookback period, and then compared this with shorter lookback periods from 1 year to 9 years prior to the index admission.Results Seven of the top 10 most common diagnoses in the COPD dataset reached >90% completeness by 6 years of lookback. Atrial fibrillation and diabetes were >90% coded with 2–3 years of lookback, but hypertension and asthma completeness continued to rise all the way out to 10 years of lookback. For stroke, 4 of the top 10 reached 90% completeness by 5 years of lookback; angina pectoris was >90% coded at 7 years and previous transient ischaemic attack completeness continued to rise out to 10 years of lookback.Conclusion A 7-year lookback captures most, but not all, common diagnoses. Lookback duration should be tailored to the conditions being studied.Data may be obtained from a third party and are not publicly available. The data that support the findings of this study are not openly available due to reasons of sensitivity. Data may be accessed on request to the HDR-UK PIONEER acute data hub on provision of permission from the PIONEER Data Trust Committee and provision of a data access agreement. Data are located in controlled access data storage at the PIONEER acute data hub.

Machine learning approaches to enhance diagnosis and staging of patients with MASLD using routinely available clinical information. McTeer, M.; Applegate, D.; Mesenbrink, P.; Ratziu, V.; Schattenberg, J. M.; Bugianesi, E.; Geier, A.; Romero Gomez, M.; Dufour, J.; Ekstedt, M.; Francque, S.; Yki-Jarvinen, H.; Allison, M.; Valenti, L.; Miele, L.; Pavlides, M.; Cobbold, J.; Papatheodoridis, G.; Holleboom, A. G.; Tiniakos, D.; Brass, C.; Anstee, Q. M.; Missier, P.; and investigators , o. b. o. t. L. C. PLOS ONE, 19(2): 1–17. February 2024. Publisher: Public Library of Science

Machine learning approaches to enhance diagnosis and staging of patients with MASLD using routinely available clinical information [link]

Paper doi link bibtex abstract

@article{mcteer_machine_2024,
	title = {Machine learning approaches to enhance diagnosis and staging of patients with {MASLD} using routinely available clinical information},
	volume = {19},
	url = {https://doi.org/10.1371/journal.pone.0299487},
	doi = {10.1371/journal.pone.0299487},
	abstract = {Aims Metabolic dysfunction Associated Steatotic Liver Disease (MASLD) outcomes such as MASH (metabolic dysfunction associated steatohepatitis), fibrosis and cirrhosis are ordinarily determined by resource-intensive and invasive biopsies. We aim to show that routine clinical tests offer sufficient information to predict these endpoints. Methods Using the LITMUS Metacohort derived from the European NAFLD Registry, the largest MASLD dataset in Europe, we create three combinations of features which vary in degree of procurement including a 19-variable feature set that are attained through a routine clinical appointment or blood test. This data was used to train predictive models using supervised machine learning (ML) algorithm XGBoost, alongside missing imputation technique MICE and class balancing algorithm SMOTE. Shapley Additive exPlanations (SHAP) were added to determine relative importance for each clinical variable. Results Analysing nine biopsy-derived MASLD outcomes of cohort size ranging between 5385 and 6673 subjects, we were able to predict individuals at training set AUCs ranging from 0.719-0.994, including classifying individuals who are At-Risk MASH at an AUC = 0.899. Using two further feature combinations of 26-variables and 35-variables, which included composite scores known to be good indicators for MASLD endpoints and advanced specialist tests, we found predictive performance did not sufficiently improve. We are also able to present local and global explanations for each ML model, offering clinicians interpretability without the expense of worsening predictive performance. Conclusions This study developed a series of ML models of accuracy ranging from 71.9—99.4\% using only easily extractable and readily available information in predicting MASLD outcomes which are usually determined through highly invasive means.},
	number = {2},
	journal = {PLOS ONE},
	author = {McTeer, Matthew and Applegate, Douglas and Mesenbrink, Peter and Ratziu, Vlad and Schattenberg, Jörn M. and Bugianesi, Elisabetta and Geier, Andreas and Romero Gomez, Manuel and Dufour, Jean-Francois and Ekstedt, Mattias and Francque, Sven and Yki-Jarvinen, Hannele and Allison, Michael and Valenti, Luca and Miele, Luca and Pavlides, Michael and Cobbold, Jeremy and Papatheodoridis, Georgios and Holleboom, Adriaan G. and Tiniakos, Dina and Brass, Clifford and Anstee, Quentin M. and Missier, Paolo and investigators, on behalf of the LITMUS Consortium},
	month = feb,
	year = {2024},
	note = {Publisher: Public Library of Science},
	pages = {1--17},
}

Handling Overlapping Asymmetric Data Sets—A Twice Penalized P-Spline Approach. McTeer, M.; Henderson, R.; Anstee, Q. M.; and Missier, P. Mathematics, 12(5). 2024.

Handling Overlapping Asymmetric Data Sets—A Twice Penalized P-Spline Approach [link]

Paper doi link bibtex abstract

@Article{math12050777,
AUTHOR = {McTeer, Matthew and Henderson, Robin and Anstee, Quentin M. and Missier, Paolo},
TITLE = {Handling Overlapping Asymmetric Data Sets—A Twice Penalized P-Spline Approach},
JOURNAL = {Mathematics},
VOLUME = {12},
YEAR = {2024},
NUMBER = {5},
ARTICLE-NUMBER = {777},
URL = {https://www.mdpi.com/2227-7390/12/5/777},
ISSN = {2227-7390},
ABSTRACT = {Aims: Overlapping asymmetric data sets are where a large cohort of observations have a small amount of information recorded, and within this group there exists a smaller cohort which have extensive further information available. Missing imputation is unwise if cohort size differs substantially; therefore, we aim to develop a way of modelling the smaller cohort whilst considering the larger. Methods: Through considering traditionally once penalized P-Spline approximations, we create a second penalty term through observing discrepancies in the marginal value of covariates that exist in both cohorts. Our now twice penalized P-Spline is designed to firstly prevent over/under-fitting of the smaller cohort and secondly to consider the larger cohort. Results: Through a series of data simulations, penalty parameter tunings, and model adaptations, our twice penalized model offers up to a 58% and 46% improvement in model fit upon a continuous and binary response, respectively, against existing B-Spline and once penalized P-Spline methods. Applying our model to an individual’s risk of developing steatohepatitis, we report an over 65% improvement over existing methods. Conclusions: We propose a twice penalized P-Spline method which can vastly improve the model fit of overlapping asymmetric data sets upon a common predictive endpoint, without the need for missing data imputation.},
DOI = {10.3390/math12050777}
}

Supporting Better Insights of Data Science Pipelines with Fine-grained Provenance. Chapman, A.; Lauro, L.; Missier, P.; and Torlone, R. ACM Trans. Database Syst., 49(2). apr 2024.

Supporting Better Insights of Data Science Pipelines with Fine-grained Provenance [link]

Paper doi link bibtex abstract 2 downloads

@article{10.1145/3644385,
author = {Chapman, Adriane and Lauro, Luca and Missier, Paolo and Torlone, Riccardo},
title = {Supporting Better Insights of Data Science Pipelines with Fine-grained Provenance},
year = {2024},
issue_date = {June 2024},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {49},
number = {2},
issn = {0362-5915},
url = {https://doi.org/10.1145/3644385},
doi = {10.1145/3644385},
abstract = {Successful data-driven science requires complex data engineering pipelines to clean, transform, and alter data in preparation for machine learning, and robust results can only be achieved when each step in the pipeline can be justified, and its effect on the data explained. In this framework, we aim at providing data scientists with facilities to gain an in-depth understanding of how each step in the pipeline affects the data, from the raw input to training sets ready to be used for learning. Starting from an extensible set of data preparation operators commonly used within a data science setting, in this work we present a provenance management infrastructure for generating, storing, and querying very granular accounts of data transformations, at the level of individual elements within datasets whenever possible. Then, from the formal definition of a core set of data science preprocessing operators, we derive a provenance semantics embodied by a collection of templates expressed in PROV, a standard model for data provenance. Using those templates as a reference, our provenance generation algorithm generalises to any operator with observable input/output pairs. We provide a prototype implementation of an application-level provenance capture library to produce, in a semi-automatic way, complete provenance documents that account for the entire pipeline. We report on the ability of that reference implementation to capture provenance in real ML benchmark pipelines and over TCP-DI synthetic data. We finally show how the collected provenance can be used to answer a suite of provenance benchmark queries that underpin some common pipeline inspection questions, as expressed on the Data Science Stack Exchange.},
journal = {ACM Trans. Database Syst.},
month = {apr},
articleno = {6},
numpages = {42},
keywords = {Provenance, data science, data preparation, preprocessing}
}

2023 (9)

ConvBoost: Boosting ConvNets for Sensor-based Activity Recognition. Shao, S.; Guan, Y.; Zhai, B.; Missier, P.; and Plötz, T. Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies, 7(2). 2023. Cited by: 1; All Open Access, Bronze Open Access, Green Open Access

ConvBoost: Boosting ConvNets for Sensor-based Activity Recognition [link]

Paper doi link bibtex

@ARTICLE{Shao2023,
	author = {Shao, Shuai and Guan, Yu and Zhai, Bing and Missier, Paolo and Plötz, Thomas},
	title = {ConvBoost: Boosting ConvNets for Sensor-based Activity Recognition},
	year = {2023},
	journal = {Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies},
	volume = {7},
	number = {2},
	doi = {10.1145/3596234},
	url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85162778689&doi=10.1145%2f3596234&partnerID=40&md5=8538b225ee8f88cc17b38cb86f86fdca},
	type = {Article},
	publication_stage = {Final},
	source = {Scopus},
	note = {Cited by: 1; All Open Access, Bronze Open Access, Green Open Access}
}

A Machine Learning Approach to Predict Weight Change in ART-Experienced People Living with HIV. Motta, F.; Milic, J.; Gozzi, L.; Belli, M.; Sighinolfi, L.; Cuomo, G.; Carli, F.; Dolci, G.; Iadisernia, V.; Burastero, G.; Mussini, C.; Missier, P.; Mandreoli, F.; and Guaraldi, G. Journal of Acquired Immune Deficiency Syndromes, 94(5): 474 – 481. 2023. Cited by: 0

A Machine Learning Approach to Predict Weight Change in ART-Experienced People Living with HIV [link]

Paper doi link bibtex

@ARTICLE{Motta2023474,
	author = {Motta, Federico and Milic, Jovana and Gozzi, Licia and Belli, Michela and Sighinolfi, Laura and Cuomo, Gianluca and Carli, Federica and Dolci, Giovanni and Iadisernia, Vittorio and Burastero, Giulia and Mussini, Cristina and Missier, Paolo and Mandreoli, Federica and Guaraldi, Giovanni},
	title = {A Machine Learning Approach to Predict Weight Change in ART-Experienced People Living with HIV},
	year = {2023},
	journal = {Journal of Acquired Immune Deficiency Syndromes},
	volume = {94},
	number = {5},
	pages = {474 – 481},
	doi = {10.1097/QAI.0000000000003302},
	url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85176428031&doi=10.1097%2fQAI.0000000000003302&partnerID=40&md5=08c7bfd07ef1c7cf2321d7e75e16c155},
	type = {Article},
	publication_stage = {Final},
	source = {Scopus},
	note = {Cited by: 0}
}

Interpretable and robust hospital readmission predictions from Electronic Health Records. Calero-Diaz, H.; Hamad, R.; Atallah, C.; Casement, J.; Canoy, D.; Reynolds, N.; Barnes, M.; and Missier, P. In Procs. IEEE BigData, Sorrento, Italy, December 2023. IEEE

Interpretable and robust hospital readmission predictions from Electronic Health Records [link]

Paper link bibtex abstract 3 downloads

@inproceedings{calero-diaz_interpretable_2023,
	address = {Sorrento, Italy},
	title = {Interpretable and robust hospital readmission predictions from {Electronic} {Health} {Records}},
	abstract = {—Rates of Hospital Readmission (HR), defined as
unplanned readmission within 30 days of discharge, have been
increasing over the years, and impose an economic burden on
healthcare services worldwide. Despite recent research into predicting HR, few models provide sufficient discriminative ability.
Three main drawbacks can be identified in the published literature: (i) imbalance in the target classes (readmitted or not), (ii)
not including demographic and lifestyle predictors, and (iii) lack
of interpretability of the models. In this work, we address these
three points by evaluating class balancing techniques, performing
a feature selection process including demographic and lifestyle
features, and adding interpretability through a combination of
SHapley Additive exPlanations (SHAP) and Accumulated Local
Effects (ALE) post hoc methods. Our best classifier for this
binary outcome achieves a UAC of 0.849 using a selection of
1296 features, extracted from patients’ Electronic Health Records
(EHRs) and from their sociodemographics profiles. Using SHAP
and ALE, we have established the importance of age, the
number of long-term conditions, and the duration of the first
admission as top predictors. In addition, we show through an
ablation study that demographic and lifestyle features provide
even better predictive capabilities than other features, suggesting
their relevance toward HR},
	booktitle = {Procs. {IEEE} {BigData}},
	publisher = {IEEE},
	author = {Calero-Diaz, Hugo and Hamad, Rebeen and Atallah, Christian and Casement, John and Canoy, Dexter and Reynolds, Nick and Barnes, Michael and Missier, Paolo},
	month = dec,
	year = {2023},
	url={https://ieeexplore.ieee.org/document/10386820}
}

Ethnic differences in early onset multimorbidity and associations with health service use, long-term prescribing, years of life lost, and mortality: A cross-sectional study using clustering in the UK Clinical Practice Research Datalink. Eto, F.; Samuel, M.; Henkin, R.; Mahesh, M.; Ahmad, T.; Angdembe, A.; McAllister-Williams, R. H.; Missier, P.; Reynolds, N. J.; Barnes, M. R.; Hull, S.; Finer, S.; and Mathur, R. PLOS Medicine, 20(10): e1004300. October 2023. Publisher: Public Library of Science

Paper doi link bibtex abstract 1 download

@article{eto_ethnic_2023,
	title = {Ethnic differences in early onset multimorbidity and associations with health service use, long-term prescribing, years of life lost, and mortality: {A} cross-sectional study using clustering in the {UK} {Clinical} {Practice} {Research} {Datalink}},
	volume = {20},
	issn = {1549-1676},
	shorttitle = {Ethnic differences in early onset multimorbidity and associations with health service use, long-term prescribing, years of life lost, and mortality},
	url = {https://journals.plos.org/plosmedicine/article?id=10.1371/journal.pmed.1004300},
	doi = {10.1371/journal.pmed.1004300},
	abstract = {Background The population prevalence of multimorbidity (the existence of at least 2 or more long-term conditions [LTCs] in an individual) is increasing among young adults, particularly in minority ethnic groups and individuals living in socioeconomically deprived areas. In this study, we applied a data-driven approach to identify clusters of individuals who had an early onset multimorbidity in an ethnically and socioeconomically diverse population. We identified associations between clusters and a range of health outcomes. Methods and findings Using linked primary and secondary care data from the Clinical Practice Research Datalink GOLD (CPRD GOLD), we conducted a cross-sectional study of 837,869 individuals with early onset multimorbidity (aged between 16 and 39 years old when the second LTC was recorded) registered with an English general practice between 2010 and 2020. The study population included 777,906 people of White ethnicity (93\%), 33,915 people of South Asian ethnicity (4\%), and 26,048 people of Black African/Caribbean ethnicity (3\%). A total of 204 LTCs were considered. Latent class analysis stratified by ethnicity identified 4 clusters of multimorbidity in White groups and 3 clusters in South Asian and Black groups. We found that early onset multimorbidity was more common among South Asian (59\%, 33,915) and Black (56\% 26,048) groups compared to the White population (42\%, 777,906). Latent class analysis revealed physical and mental health conditions that were common across all ethnic groups (i.e., hypertension, depression, and painful conditions). However, each ethnic group also presented exclusive LTCs and different sociodemographic profiles: In White groups, the cluster with the highest rates/odds of the outcomes was predominantly male (54\%, 44,150) and more socioeconomically deprived than the cluster with the lowest rates/odds of the outcomes. On the other hand, South Asian and Black groups were more socioeconomically deprived than White groups, with a consistent deprivation gradient across all multimorbidity clusters. At the end of the study, 4\% (34,922) of the White early onset multimorbidity population had died compared to 2\% of the South Asian and Black early onset multimorbidity populations (535 and 570, respectively); however, the latter groups died younger and lost more years of life. The 3 ethnic groups each displayed a cluster of individuals with increased rates of primary care consultations, hospitalisations, long-term prescribing, and odds of mortality. Study limitations include the exclusion of individuals with missing ethnicity information, the age of diagnosis not reflecting the actual age of onset, and the exclusion of people from Mixed, Chinese, and other ethnic groups due to insufficient power to investigate associations between multimorbidity and health-related outcomes in these groups. Conclusions These findings emphasise the need to identify, prevent, and manage multimorbidity early in the life course. Our work provides additional insights into the excess burden of early onset multimorbidity in those from socioeconomically deprived and diverse groups who are disproportionately and more severely affected by multimorbidity and highlights the need to ensure healthcare improvements are equitable.},
	language = {en},
	number = {10},
	urldate = {2023-10-30},
	journal = {PLOS Medicine},
	author = {Eto, Fabiola and Samuel, Miriam and Henkin, Rafael and Mahesh, Meera and Ahmad, Tahania and Angdembe, Alisha and McAllister-Williams, R. Hamish and Missier, Paolo and Reynolds, Nick J. and Barnes, Michael R. and Hull, Sally and Finer, Sarah and Mathur, Rohini},
	month = oct,
	year = {2023},
	note = {Publisher: Public Library of Science},
	keywords = {African people, Death rates, Electronic medical records, Ethnic epidemiology, Ethnicities, Long-term care, Primary care, Socioeconomic aspects of health},
	pages = {e1004300},
	file = {Full Text PDF:/Users/npm65/Zotero/storage/L5NZP6HL/Eto et al. - 2023 - Ethnic differences in early onset multimorbidity a.pdf:application/pdf},
}

Background The population prevalence of multimorbidity (the existence of at least 2 or more long-term conditions [LTCs] in an individual) is increasing among young adults, particularly in minority ethnic groups and individuals living in socioeconomically deprived areas. In this study, we applied a data-driven approach to identify clusters of individuals who had an early onset multimorbidity in an ethnically and socioeconomically diverse population. We identified associations between clusters and a range of health outcomes. Methods and findings Using linked primary and secondary care data from the Clinical Practice Research Datalink GOLD (CPRD GOLD), we conducted a cross-sectional study of 837,869 individuals with early onset multimorbidity (aged between 16 and 39 years old when the second LTC was recorded) registered with an English general practice between 2010 and 2020. The study population included 777,906 people of White ethnicity (93%), 33,915 people of South Asian ethnicity (4%), and 26,048 people of Black African/Caribbean ethnicity (3%). A total of 204 LTCs were considered. Latent class analysis stratified by ethnicity identified 4 clusters of multimorbidity in White groups and 3 clusters in South Asian and Black groups. We found that early onset multimorbidity was more common among South Asian (59%, 33,915) and Black (56% 26,048) groups compared to the White population (42%, 777,906). Latent class analysis revealed physical and mental health conditions that were common across all ethnic groups (i.e., hypertension, depression, and painful conditions). However, each ethnic group also presented exclusive LTCs and different sociodemographic profiles: In White groups, the cluster with the highest rates/odds of the outcomes was predominantly male (54%, 44,150) and more socioeconomically deprived than the cluster with the lowest rates/odds of the outcomes. On the other hand, South Asian and Black groups were more socioeconomically deprived than White groups, with a consistent deprivation gradient across all multimorbidity clusters. At the end of the study, 4% (34,922) of the White early onset multimorbidity population had died compared to 2% of the South Asian and Black early onset multimorbidity populations (535 and 570, respectively); however, the latter groups died younger and lost more years of life. The 3 ethnic groups each displayed a cluster of individuals with increased rates of primary care consultations, hospitalisations, long-term prescribing, and odds of mortality. Study limitations include the exclusion of individuals with missing ethnicity information, the age of diagnosis not reflecting the actual age of onset, and the exclusion of people from Mixed, Chinese, and other ethnic groups due to insufficient power to investigate associations between multimorbidity and health-related outcomes in these groups. Conclusions These findings emphasise the need to identify, prevent, and manage multimorbidity early in the life course. Our work provides additional insights into the excess burden of early onset multimorbidity in those from socioeconomically deprived and diverse groups who are disproportionately and more severely affected by multimorbidity and highlights the need to ensure healthcare improvements are equitable.

Fair and Private Data Preprocessing through Microaggregation. González-Zelaya, V.; Salas, J.; Megías, D.; and Missier, P. ACM Trans. Knowl. Discov. Data, 18(3). dec 2023.

Fair and Private Data Preprocessing through Microaggregation [link]

Paper doi link bibtex abstract 1 download

@article{10.1145/3617377,
author = {Gonz\'{a}lez-Zelaya, Vladimiro and Salas, Juli\'{a}n and Meg\'{\i}as, David and Missier, Paolo},
title = {Fair and Private Data Preprocessing through Microaggregation},
year = {2023},
issue_date = {April 2024},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {18},
number = {3},
issn = {1556-4681},
url = {https://doi.org/10.1145/3617377},
doi = {10.1145/3617377},
abstract = {Privacy protection for personal data and fairness in automated decisions are fundamental requirements for responsible Machine Learning. Both may be enforced through data preprocessing and share a common target: data should remain useful for a task, while becoming uninformative of the sensitive information. The intrinsic connection between privacy and fairness implies that modifications performed to guarantee one of these goals, may have an effect on the other, e.g., hiding a sensitive attribute from a classification algorithm might prevent a biased decision rule having such attribute as a criterion. This work resides at the intersection of algorithmic fairness and privacy. We show how the two goals are compatible, and may be simultaneously achieved, with a small loss in predictive performance. Our results are competitive with both state-of-the-art fairness correcting algorithms and hybrid privacy-fairness methods. Experiments were performed on three widely used benchmark datasets: Adult Income, COMPAS, and German Credit.},
journal = {ACM Trans. Knowl. Discov. Data},
month = {dec},
articleno = {49},
numpages = {24},
keywords = {ethical AI, privacy preserving data mining, algorithmic fairness, Responsible machine learning, fair classification}
}

Mapping inpatient care pathways for patients with COPD: an observational study using routinely collected electronic hospital record data. Evison, F.; Cooper, R.; Gallier, S.; Missier, P.; Sayer, A. A.; Sapey, E.; and Witham, M. D. ERJ open research, 9(5): 00110–2023. September 2023. Place: England
doi link bibtex abstract

@article{evison_mapping_2023,
	title = {Mapping inpatient care pathways for patients with {COPD}: an observational study using routinely collected electronic hospital record data.},
	volume = {9},
	copyright = {Copyright ©The authors 2023.},
	issn = {2312-0541},
	doi = {10.1183/23120541.00110-2023},
	abstract = {INTRODUCTION: Respiratory specialist ward care is associated with better outcomes for patients with COPD exacerbations. We assessed patient pathways and associated  factors for people admitted to hospital with COPD exacerbations. METHODS: We  analysed routinely collected electronic health data for patients admitted with  COPD exacerbation in 2018 to Queen Elizabeth Hospital, Birmingham, UK. We  extracted data on demographics, deprivation index, Elixhauser comorbidities, ward  moves, length of stay, and in-hospital and 1-year mortality. We compared care  pathways with recommended care pathways (transition from initial assessment area  to respiratory wards or discharge). We used Markov state transition models to  derive probabilities of following recommended pathways for patient subgroups.  RESULTS: Of 42 555 patients with unplanned admissions during 2018, 571 patients  were admitted at least once with an exacerbation of COPD. The mean±sd age was  51±11 years; 313 (55\%) were women, 337 (59\%) lived in the most deprived  neighbourhoods and 45 (9\%) were from non-white ethnic backgrounds. 428 (75.0\%)  had ≥4 comorbidities. Age {\textgreater}70 years was associated with higher in-hospital and  1-year mortality, more places of care (wards) and longer length of stay; having  ≥4 comorbidities was associated with higher mortality and longer length of stay.  Older age was associated with a significantly lower probability of following a  recommended pathway ({\textgreater}70 years: 0.514, 95\% CI 0.458-0.571; ≤70 years: 0.636, 95\%  CI 0.572-0.696; p=0.004). CONCLUSIONS: Only older age was associated with a lower  chance of following recommended hospital pathways of care. Such analyses could  help refine appropriate care pathways for patients with COPD exacerbations.},
	language = {eng},
	number = {5},
	journal = {ERJ open research},
	author = {Evison, Felicity and Cooper, Rachel and Gallier, Suzy and Missier, Paolo and Sayer, Avan A. and Sapey, Elizabeth and Witham, Miles D.},
	month = sep,
	year = {2023},
	pmid = {37850214},
	pmcid = {PMC10577591},
	note = {Place: England},
	pages = {00110--2023},
}

Preprocessing Matters: Automated Pipeline Selection for Fair Classification. González-Zelaya, V.; Salas, J.; Prangle, D.; and Missier, P. In Modeling Decisions for Artificial Intelligence: 20th International Conference, MDAI 2023, Umeå, Sweden, June 19–22, 2023, Proceedings, pages 202–213, 2023. Springer
link bibtex

@inproceedings{gonzalez2023preprocessing,
  title={Preprocessing Matters: Automated Pipeline Selection for Fair Classification},
  author={Gonz{\'a}lez-Zelaya, Vladimiro and Salas, Juli{\'a}n and Prangle, Dennis and Missier, Paolo},
  booktitle={Modeling Decisions for Artificial Intelligence: 20th International Conference, MDAI 2023, Ume{\aa}, Sweden, June 19--22, 2023, Proceedings},
  pages={202--213},
  year={2023},
  organization={Springer}
}

Researching multimorbidity in hospital: can we deliver on the promise of health informatics?. Witham, M. D.; Cooper, R.; Missier, P.; Robinson, S. M.; Sapey, E.; and Sayer, A. A. European Geriatric Medicine. May 2023.

Researching multimorbidity in hospital: can we deliver on the promise of health informatics? [link]

Paper doi link bibtex

@article{witham_researching_2023,
	title = {Researching multimorbidity in hospital: can we deliver on the promise of health informatics?},
	issn = {1878-7657},
	url = {https://doi.org/10.1007/s41999-023-00753-6},
	doi = {10.1007/s41999-023-00753-6},
	journal = {European Geriatric Medicine},
	author = {Witham, Miles D. and Cooper, Rachel and Missier, Paolo and Robinson, Sian M. and Sapey, Elizabeth and Sayer, Avan A.},
	month = may,
	year = {2023},
}

On Training Strategies for LSTMs in Sensor-Based Human Activity Recognition. Shao, S.; Guan, Y.; Xin, G.; Missier, P.; and Ploetz, T. In Procs PerCom 2023, Atlanta, USA, 2023.
link bibtex

@inproceedings{shao_training_2023,
	address = {Atlanta, USA},
	title = {On {Training} {Strategies} for {LSTMs} in {Sensor}-{Based} {Human} {Activity} {Recognition}},
	booktitle = {Procs {PerCom} 2023},
	author = {Shao, Shuai and Guan, Yu and Xin, Guan and Missier, Paolo and Ploetz, Thomas},
	year = {2023},
}

2022 (7)

Tracking trajectories of multiple long-term conditions using dynamic patient-cluster associations. Kremer, R.; Raza, S. M.; Eto, F.; Casement, J.; Atallah, C.; Finer, S.; Lendrem, D.; Barnes, M.; Reynolds, N. J; and Missier, P. In 2022 IEEE International Conference on Big Data (Big Data), pages 4390–4399, December 2022.
doi link bibtex abstract 1 download

@inproceedings{kremer_tracking_2022,
	title = {Tracking trajectories of multiple long-term conditions using dynamic patient-cluster associations},
	doi = {10.1109/BigData55660.2022.10021034},
	abstract = {Momentum has been growing into research to better understand the dynamics of multiple long-term conditions – multimorbidity (MLTC-M), defined as the co-occurrence of two or more long-term or chronic conditions within an individual. Several research efforts make use of Electronic Health Records (EHR), which represent patients’ medical histories. These range from discovering patterns of multimorbidity, namely by clustering diseases based on their co-occurrence in EHRs, to using EHRs to predict the next disease or other specific outcomes. One problem with the former approach is that it discards important temporal information on the co-occurrence, while the latter requires "big" data volumes that are not always available from routinely collected EHRs, limiting the robustness of the resulting models.In this paper we take an intermediate approach, where initially we use about 143,000 EHRs from UK Biobank to perform time-independent clustering using topic modelling, and Latent Dirichlet Allocation specifically. We then propose a metric to measure how strongly a patient is "attracted" into any given cluster at any point through their medical history. By tracking how such gravitational pull changes over time, we may then be able to narrow the scope for potential interventions and preventative measures to specific clusters, without having to resort to full-fledged predictive modelling.In this preliminary work we show exemplars of these dynamic associations, which suggest that further exploration may lead to actionable insights into patients’ medical trajectories.},
	booktitle = {2022 {IEEE} {International} {Conference} on {Big} {Data} ({Big} {Data})},
	author = {Kremer, Ron and Raza, Syed Mohib and Eto, Fabiola and Casement, John and Atallah, Christian and Finer, Sarah and Lendrem, Dennis and Barnes, Michael and Reynolds, Nick J and Missier, Paolo},
	month = dec,
	year = {2022},
	keywords = {Big Data, electronic health records, Biological system modeling, Time measurement, Predictive models, Data models, Trajectory, MLTC-M, multi-morbidity, Robustness, topic modelling},
	pages = {4390--4399},
	file = {IEEE Xplore Abstract Record:/Users/npm65/Zotero/storage/XWS3D52X/10021034.html:text/html},
}

Real-world data mining meets clinical practice: Research challenges and perspective. Mandreoli, F.; Ferrari, D.; Guidetti, V.; Motta, F.; and Missier, P. Frontiers in Big Data, 5. 2022.

Real-world data mining meets clinical practice: Research challenges and perspective [link]

Paper doi link bibtex abstract

@ARTICLE{10.3389/fdata.2022.1021621,
AUTHOR={Mandreoli, Federica and Ferrari, Davide and Guidetti, Veronica and Motta, Federico and Missier, Paolo},   
TITLE={Real-world data mining meets clinical practice: Research challenges and perspective},      
JOURNAL={Frontiers in Big Data},      
VOLUME={5},           
YEAR={2022},      
URL={https://www.frontiersin.org/articles/10.3389/fdata.2022.1021621},   
DOI={10.3389/fdata.2022.1021621},    	
ISSN={2624-909X},     
ABSTRACT={As Big Data Analysis meets healthcare applications, domain-specific challenges and opportunities materialize in all aspects of data science. Advanced statistical methods and Artificial Intelligence (AI) on Electronic Health Records (EHRs) are used both for knowledge discovery purposes and clinical decision support. Such techniques enable the emerging Predictive, Preventative, Personalized, and Participatory Medicine (P4M) paradigm. Working with the Infectious Disease Clinic of the University Hospital of Modena, Italy, we have developed a range of Data–Driven (DD) approaches to solve critical clinical applications using statistics, Machine Learning (ML) and Big Data Analytics on real-world EHR. Here, we describe our perspective on the challenges we encountered. Some are connected to medical data and their sparse, scarce, and unbalanced nature. Others are bound to the application environment, as medical AI tools can affect people's health and life. For each of these problems, we report some available techniques to tackle them, present examples drawn from our experience, and propose which approaches, in our opinion, could lead to successful real-world, end-to-end implementations.}
}

DPDS: Assisting Data Science with Data Provenance. Chapman, A.; Missier, P.; Lauro, L.; and Torlone, R. PVLDB, 15(12): 3614 – 3617. 2022.

DPDS: Assisting Data Science with Data Provenance [pdf]

Paper doi link bibtex abstract 7 downloads

@article{chapman_dpds_2022,
	title = {{DPDS}: {Assisting} {Data} {Science} with {Data} {Provenance}},
	volume = {15},
	url = {https://vldb.org/pvldb/vol15/p3614-torlone.pdf},
	doi = {10.14778/3554821.3554857},
	abstract = {Successful data-driven science requires a complex combination of data engineering pipelines and data modelling techniques. Robust and defensible results can only be achieved when each step in the pipeline that is designed to clean, transform and alter data in preparation for data modelling can be justified, and its effect on the data explained. The DPDS toolkit presented in this paper is designed to make such justification and explanation process an integral part of data science practice, adding value while remaining as un-intrusive as possible to the analyst. Catering to the broad community of python/pandas data engineers, DPDS implements an observer pattern that is able to capture the fine-grained provenance associated with each individual element of a dataframe, across multiple transformation steps. The resulting provenance graph is stored in Neo4j and queried through a UI, with the goal of helping engineers and analysts to justify and explain their choice of data operations, from raw data to model training, by highlighting the details of the changes through each transformation.},
	language = {en},
	number = {12},
	journal = {PVLDB},
	author = {Chapman, Adriane and Missier, Paolo and Lauro, Luca and Torlone, Riccardo},
	year = {2022},
	pages = {3614 -- 3617},
	file = {Chapman et al. - DPDS Assisting Data Science with Data Provenance.pdf:/Users/npm65/Zotero/storage/JKRQ89HE/Chapman et al. - DPDS Assisting Data Science with Data Provenance.pdf:application/pdf},
}

Data-Driven, AI-Based Clinical Practice: Experiences, Challenges, and Research Directions. Ferrari, D.; Mandreoli, F.; Motta, F.; and Missier, P. In Amato, G.; Bartalesi, V.; Bianchini, D.; Gennaro, C.; and Torlone, R., editor(s), Proceedings of the 30th Italian Symposium on Advanced Database Systems, SEBD 2022, Tirrenia (PI), Italy, June 19-22, 2022, volume 3194, of CEUR Workshop Proceedings, pages 392–403, 2022. CEUR-WS.org

Data-Driven, AI-Based Clinical Practice: Experiences, Challenges, and Research Directions [pdf]

Paper link bibtex

@inproceedings{DBLP:conf/sebd/FerrariMMM22,
  author    = {Davide Ferrari and
               Federica Mandreoli and
               Federico Motta and
               Paolo Missier},
  editor    = {Giuseppe Amato and
               Valentina Bartalesi and
               Devis Bianchini and
               Claudio Gennaro and
               Riccardo Torlone},
  title     = {Data-Driven, AI-Based Clinical Practice: Experiences, Challenges,
               and Research Directions},
  booktitle = {Proceedings of the 30th Italian Symposium on Advanced Database Systems,
               {SEBD} 2022, Tirrenia (PI), Italy, June 19-22, 2022},
  series    = {{CEUR} Workshop Proceedings},
  volume    = {3194},
  pages     = {392--403},
  publisher = {CEUR-WS.org},
  year      = {2022},
  url       = {http://ceur-ws.org/Vol-3194/paper47.pdf},
  timestamp = {Wed, 24 Aug 2022 09:26:05 +0200},
  biburl    = {https://dblp.org/rec/conf/sebd/FerrariMMM22.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

Curating a longitudinal research resource using linked primary care EHR data - a UK Biobank case study. Darke, P.; Cassidy, S.; Catt, M.; Taylor, R.; Missier, P.; and Bacardit, J. J. Am. Medical Informatics Assoc., 29(3): 546–552. 2022.

Curating a longitudinal research resource using linked primary care EHR data - a UK Biobank case study [link]

Paper doi link bibtex

@article{DBLP:journals/jamia/DarkeCCTMB22,
  author    = {Philip Darke and
               Sophie Cassidy and
               Michael Catt and
               Roy Taylor and
               Paolo Missier and
               Jaume Bacardit},
  title     = {Curating a longitudinal research resource using linked primary care
               {EHR} data - a {UK} Biobank case study},
  journal   = {J. Am. Medical Informatics Assoc.},
  volume    = {29},
  number    = {3},
  pages     = {546--552},
  year      = {2022},
  url       = {https://doi.org/10.1093/jamia/ocab260},
  doi       = {10.1093/jamia/ocab260},
  timestamp = {Wed, 23 Feb 2022 11:16:49 +0100},
  biburl    = {https://dblp.org/rec/journals/jamia/DarkeCCTMB22.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

The interplay of post-acute COVID-19 syndrome and aging: a biological, clinical and public health approach. Guaraldi, G.; Milic, J.; Cesari, M.; Leibovici, L.; Mandreoli, F.; Missier, P.; Rozzini, R.; Cattelan, A. M.; Motta, F.; Mussini, C.; and Cossarizza, A. Ageing Research Reviews,101686. 2022.

The interplay of post-acute COVID-19 syndrome and aging: a biological, clinical and public health approach [link]

Paper doi link bibtex abstract 1 download

@article{guaraldi_interplay_2022,
	title = {The interplay of post-acute {COVID}-19 syndrome and aging: a biological, clinical and public health approach},
	issn = {1568-1637},
	url = {https://www.sciencedirect.com/science/article/pii/S1568163722001283},
	doi = {https://doi.org/10.1016/j.arr.2022.101686},
	abstract = {ABSTRACT The post-acute COVID-19 syndrome (PACS) is characterized by the persistence of fluctuating symptoms over three months from the onset of the possible or confirmed COVID-19 acute phase. Current data suggests that at least 10\% of people with previously documented infection may develop PACS, and up to 50–80\% of prevalence is reported among survivors after hospital discharge. This viewpoint will discuss various aspects of PACS, particularly in older adults, with a specific hypothesis to describe PACS as the expression of a modified aging trajectory induced by SARS CoV-2. This hypothesis will be argued from biological, clinical and public health view, addressing three main questions: (i) does SARS-CoV-2-induced alterations in aging trajectories play a role in PACS?; (ii) do people with PACS face immuno-metabolic derangements that lead to increased susceptibility to age-related diseases?; (iii) is it possible to restore the healthy aging trajectory followed by the individual before pre-COVID?. A particular focus will be given to the well-being of people with PACS that could be assessed by the intrinsic capacity model and support the definition of the healthy aging trajectory.},
	journal = {Ageing Research Reviews},
	author = {Guaraldi, Giovanni and Milic, Jovana and Cesari, Matteo and Leibovici, Leonard and Mandreoli, Federica and Missier, Paolo and Rozzini, Renzo and Cattelan, Anna Maria and Motta, Federico and Mussini, Cristina and Cossarizza, Andrea},
	year = {2022},
	pages = {101686},
}

Knowledge-Driven Data Ecosystems Toward Data Transparency. Geisler, S.; Vidal, M.; Cappiello, C.; Lóscio, B. F.; Gal, A.; Jarke, M.; Lenzerini, M.; Missier, P.; Otto, B.; Paja, E.; Pernici, B.; and Rehof, J. ACM J. Data Inf. Qual., 14(1): 3:1–3:12. 2022.

Knowledge-Driven Data Ecosystems Toward Data Transparency [link]

Paper doi link bibtex 1 download

@article{DBLP:journals/jdiq/GeislerVCLGJLMO22,
  author    = {Sandra Geisler and
               Maria{-}Esther Vidal and
               Cinzia Cappiello and
               Bernadette Farias L{\'{o}}scio and
               Avigdor Gal and
               Matthias Jarke and
               Maurizio Lenzerini and
               Paolo Missier and
               Boris Otto and
               Elda Paja and
               Barbara Pernici and
               Jakob Rehof},
  title     = {Knowledge-Driven Data Ecosystems Toward Data Transparency},
  journal   = {{ACM} J. Data Inf. Qual.},
  volume    = {14},
  number    = {1},
  pages     = {3:1--3:12},
  year      = {2022},
  url       = {https://doi.org/10.1145/3467022},
  doi       = {10.1145/3467022},
  timestamp = {Sat, 09 Apr 2022 12:27:16 +0200},
  biburl    = {https://dblp.org/rec/journals/jdiq/GeislerVCLGJLMO22.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

2021 (7)

An HMM–ensemble approach to predict severity progression of ICU treatment for hospitalized COVID–19 patients. Mandreoli, F.; Motta, F.; and Missier, P. In 2021 20th IEEE International Conference on Machine Learning and Applications (ICMLA), pages 1299-1306, 2021.
doi link bibtex 1 download

@INPROCEEDINGS{9680211,  author={Mandreoli, Federica and Motta, Federico and Missier, Paolo},  booktitle={2021 20th IEEE International Conference on Machine Learning and Applications (ICMLA)},   title={An HMM–ensemble approach to predict severity progression of ICU treatment for hospitalized COVID–19 patients},   year={2021},  volume={},  number={},  pages={1299-1306},  doi={10.1109/ICMLA52953.2021.00211}}

The Right (Provenance) Hammer for the Job: A Comparison of Data Provenance Instrumentation. Chapman, A.; Sasikant, A.; Simonelli, G.; Missier, P.; and Torlone, R. In Provenance in Data Science: From Data Models to Context-Aware Knowledge Graphs. 2021.

The Right (Provenance) Hammer for the Job: A Comparison of Data Provenance Instrumentation [link]

Paper link bibtex 2 downloads

@incollection{chapman_right_2021,
	title = {The {Right} ({Provenance}) {Hammer} for the {Job}: {A} {Comparison} of {Data} {Provenance} {Instrumentation}},
	url = {https://doi.org/10.1007/978-3-030-67681-0_3},
	booktitle = {Provenance in {Data} {Science}: {From} {Data} {Models} to {Context}-{Aware} {Knowledge} {Graphs}},
	author = {Chapman, Adriane and Sasikant, Abhirami and Simonelli, Giulia and Missier, Paolo and Torlone, Riccardo},
	year = {2021},
}

Optimising Fairness Through Parametrised Data Sampling. González-Zelaya, V.; Salas, J.; Prangle, D.; and Missier, P. In Velegrakis, Y.; Zeinalipour-Yazti, D.; Chrysanthis, P. K.; and Guerra, F., editor(s), Proceedings of the 24th International Conference on Extending Database Technology, EDBT 2021, Nicosia, Cyprus, March 23 - 26, 2021, pages 445–450, 2021. OpenProceedings.org

Optimising Fairness Through Parametrised Data Sampling [link]

Paper doi link bibtex 2 downloads

@inproceedings{gonzalez-zelaya_optimising_2021,
	title = {Optimising {Fairness} {Through} {Parametrised} {Data} {Sampling}},
	url = {https://doi.org/10.5441/002/edbt.2021.49},
	doi = {10.5441/002/edbt.2021.49},
	booktitle = {Proceedings of the 24th {International} {Conference} on {Extending} {Database} {Technology}, {EDBT} 2021, {Nicosia}, {Cyprus}, {March} 23 - 26, 2021},
	publisher = {OpenProceedings.org},
	author = {González-Zelaya, Vladimiro and Salas, Julián and Prangle, Dennis and Missier, Paolo},
	editor = {Velegrakis, Yannis and Zeinalipour-Yazti, Demetris and Chrysanthis, Panos K. and Guerra, Francesco},
	year = {2021},
	pages = {445--450},
}

Using wearable activity trackers to predict Type-2 Diabetes: A machine learning-based cross-sectional study of the UK Biobank accelerometer cohort. Lam, B; Catt, M; Cassidy, S; Bacardit, J; Darke, P; Butterfield, S; Alshabrawy, O; Trenell, M; and Missier, P JMIR Diabetes. January 2021.

Paper doi link bibtex 2 downloads

@article{lam_using_2021,
	title = {Using wearable activity trackers to predict {Type}-2 {Diabetes}: {A} machine learning-based cross-sectional study of the {UK} {Biobank} accelerometer cohort},
	url = {https://preprints.jmir.org/preprint/23364},
	doi = {10.2196/23364},
	journal = {JMIR Diabetes},
	author = {Lam, B and Catt, M and Cassidy, S and Bacardit, J and Darke, P and Butterfield, S and Alshabrawy, O and Trenell, M and Missier, P},
	month = jan,
	year = {2021},
}

Capturing and Querying Fine-grained Provenance of Preprocessing Pipelines in Data Science. Chapman, A.; Missier, P.; Simonelli, G.; and Torlone, R. PVLDB, 14(4): 507–520. January 2021.

Capturing and Querying Fine-grained Provenance of Preprocessing Pipelines in Data Science [pdf]

Paper doi link bibtex 8 downloads

@article{chapman_capturing_2021,
	title = {Capturing and {Querying} {Fine}-grained {Provenance} of {Preprocessing} {Pipelines} in {Data} {Science}},
	volume = {14},
	url = {http://www.vldb.org/pvldb/vol14/p507-chapman.pdf},
	doi = {10.14778/3436905.3436911},
	number = {4},
	journal = {PVLDB},
	author = {Chapman, Adriane and Missier, Paolo and Simonelli, Giulia and Torlone, Riccardo},
	month = jan,
	year = {2021},
	pages = {507--520},
}

A customisable pipeline for the semi-automated discovery of online activists and social campaigns on Twitter. Primo, F.; Romanovsky, A.; de Mello, R.; Garcia, A.; and Missier, P. WWW Journal, in press. 2021.
link bibtex abstract

@article{primo_customisable_2021,
	title = {A customisable pipeline for the semi-automated discovery of online activists and social campaigns on {Twitter}},
	volume = {in press},
	abstract = {Substantial research is available on detecting {\textbackslash}textbackslashtextit\{influencers\} on social media platforms. In contrast, comparatively few studies exists on the role of {\textbackslash}textbackslashtextit\{online activists\}, defined informally as users who actively participate in socially-minded online campaigns. Automatically discovering activists who can potentially be approached by organisations that promote social campaigns is important, but not easy, as they are typically active only locally, and, unlike influencers, they are not central to large social media networks. We make the hypothesis that such interesting users can be found on Twitter within temporally and spatially localised {\textbackslash}textbackslashtextit\{contexts\}. We define these as small but topical fragments of the network, containing interactions about social events or campaigns with a significant online footprint. To explore this hypothesis, we have designed an iterative discovery pipeline consisting of two alternating phases of user discovery and context discovery. Multiple iterations of the pipeline result in a growing dataset of user profiles for activists, as well as growing set of online social contexts. This mode of exploration differs significantly from prior techniques that focus on influencers, and presents unique challenges because of the weak online signal available to detect activists. The paper describes the design and implementation of the pipeline as a customisable software framework, where user-defined operational definitions of online activism can be explored. We present an empirical evaluation on two extensive case studies, one concerning healthcare-related campaigns in the UK during 2018, the other related to online activism in Italy during the COVID-19 pandemic.},
	journal = {WWW Journal},
	author = {Primo, Flavio and Romanovsky, Alexander and de Mello, Rafael and Garcia, Alessandro and Missier, Paolo},
	year = {2021},
}

Latency of Trading Transactions in Brokered IoT Data Marketplace in Ethereum. Bajoudah, S.; and Missier, P. In 2021 IEEE SmartWorld, Ubiquitous Intelligence Computing, Advanced Trusted Computing, Scalable Computing Communications, Internet of People and Smart City Innovation (SmartWorld/SCALCOM/UIC/ATC/IOP/SCI), pages 254–263, 2021.
doi link bibtex 1 download

@inproceedings{bajoudah_latency_2021,
	title = {Latency of {Trading} {Transactions} in {Brokered} {IoT} {Data} {Marketplace} in {Ethereum}},
	doi = {10.1109/SWC50871.2021.00043},
	booktitle = {2021 {IEEE} {SmartWorld}, {Ubiquitous} {Intelligence} {Computing}, {Advanced} {Trusted} {Computing}, {Scalable} {Computing} {Communications}, {Internet} of {People} and {Smart} {City} {Innovation} ({SmartWorld}/{SCALCOM}/{UIC}/{ATC}/{IOP}/{SCI})},
	author = {Bajoudah, Shaimaa and Missier, Paolo},
	year = {2021},
	pages = {254--263},
}

2020 (4)

Machine learning in predicting respiratory failure in patients with COVID-19 pneumonia?Challenges, strengths, and opportunities in a global health emergency. Ferrari, D.; Milic, J.; Tonelli, R.; Ghinelli, F.; Meschiari, M.; Volpi, S.; Faltoni, M.; Franceschi, G.; Iadisernia, V.; Yaacoub, D.; Ciusa, G.; Bacca, E.; Rogati, C.; Tutone, M.; Burastero, G.; Raimondi, A.; Menozzi, M.; Franceschini, E.; Cuomo, G.; Corradi, L.; Orlando, G.; Santoro, A.; Digaetano, M.; Puzzolante, C.; Carli, F.; Borghi, V.; Bedini, A.; Fantini, R.; Tabb�, L.; Castaniere, I.; Busani, S.; Clini, E.; Girardis, M.; Sarti, M.; Cossarizza, A.; Mussini, C.; Mandreoli, F.; Missier, P.; and Guaraldi, G. PLOS ONE, 15(11): 1–14. 2020.

Paper doi link bibtex abstract 6 downloads

@article{ferrari_machine_2020,
	title = {Machine learning in predicting respiratory failure in patients with {COVID}-19 pneumonia?{Challenges}, strengths, and opportunities in a global health emergency},
	volume = {15},
	url = {https://doi.org/10.1371/journal.pone.0239172},
	doi = {10.1371/journal.pone.0239172},
	abstract = {Aims The aim of this study was to estimate a 48 hour prediction of moderate to severe respiratory failure, requiring mechanical ventilation, in hospitalized patients with COVID-19 pneumonia. Methods This was an observational prospective study that comprised consecutive patients with COVID-19 pneumonia admitted to hospital from 21 February to 6 April 2020. The patients? medical history, demographic, epidemiologic and clinical data were collected in an electronic patient chart. The dataset was used to train predictive models using an established machine learning framework leveraging a hybrid approach where clinical expertise is applied alongside a data-driven analysis. The study outcome was the onset of moderate to severe respiratory failure defined as PaO2/FiO2 ratio {\textbackslash}textless150 mmHg in at least one of two consecutive arterial blood gas analyses in the following 48 hours. Shapley Additive exPlanations values were used to quantify the positive or negative impact of each variable included in each model on the predicted outcome. Results A total of 198 patients contributed to generate 1068 usable observations which allowed to build 3 predictive models based respectively on 31-variables signs and symptoms, 39-variables laboratory biomarkers and 91-variables as a composition of the two. A fourth ?boosted mixed model? included 20 variables was selected from the model 3, achieved the best predictive performance (AUC = 0.84) without worsening the FN rate. Its clinical performance was applied in a narrative case report as an example. Conclusion This study developed a machine model with 84\% prediction accuracy, which is able to assist clinicians in decision making process and contribute to develop new analytics to improve care at high technology readiness levels.},
	number = {11},
	journal = {PLOS ONE},
	author = {Ferrari, Davide and Milic, Jovana and Tonelli, Roberto and Ghinelli, Francesco and Meschiari, Marianna and Volpi, Sara and Faltoni, Matteo and Franceschi, Giacomo and Iadisernia, Vittorio and Yaacoub, Dina and Ciusa, Giacomo and Bacca, Erica and Rogati, Carlotta and Tutone, Marco and Burastero, Giulia and Raimondi, Alessandro and Menozzi, Marianna and Franceschini, Erica and Cuomo, Gianluca and Corradi, Luca and Orlando, Gabriella and Santoro, Antonella and Digaetano, Margherita and Puzzolante, Cinzia and Carli, Federica and Borghi, Vanni and Bedini, Andrea and Fantini, Riccardo and Tabb�, Luca and Castaniere, Ivana and Busani, Stefano and Clini, Enrico and Girardis, Massimo and Sarti, Mario and Cossarizza, Andrea and Mussini, Cristina and Mandreoli, Federica and Missier, Paolo and Guaraldi, Giovanni},
	year = {2020},
	pages = {1--14},
}

Predicting respiratory failure in patients with COVID-19 pneumonia: a case study from Northern Italy. Ferrari, D.; Mandreoli, F.; Guaraldi, G.; and Missier, P. In The HELPLINE workshop, co-located with the 24th European Conference on AI (ECAI2020), Online!, 2020. CEUR-WS
link bibtex abstract 2 downloads

@inproceedings{ferrari_predicting_2020,
	address = {Online!},
	title = {Predicting respiratory failure in patients with {COVID}-19 pneumonia: a case study from {Northern} {Italy}},
	abstract = {The Covid-19 crisis caught health care services around the world by surprise, putting unprecedented pressure on Intensive Care Units (ICU). To help clinical staff to manage the limited ICU capacity, we have developed a Machine Learning model to estimate the probability that a patient admitted to hospital with COVID-19 symptoms would develop severe respiratory failure and require Intensive Care within 48 hours of admission. The model was trained on an initial co-hort of 198 patients admitted to the Infectious Disease ward of Mod-ena University Hospital, in Italy, at the peak of the epidemic, and subsequently refined as more patients were admitted. Using the Light-GBM Decision Tree ensemble approach, we were able to achieve good accuracy (AUC = 0.84) despite a high rate of missing values. Furthermore, we have been able to provide clinicians with explanations in the form of personalised ranked lists of features for each prediction , using only 20 out of more than 90 variables, using Shapley values to describe the importance of each feature.},
	booktitle = {The {HELPLINE} workshop, co-located with the 24th {European} {Conference} on {AI} ({ECAI2020})},
	publisher = {CEUR-WS},
	author = {Ferrari, Davide and Mandreoli, Federica and Guaraldi, Giovanni and Missier, Paolo},
	year = {2020},
	keywords = {\#covid, \#machine learning},
}

Data-driven vs knowledge-driven inference of health outcomes in the ageing population: a case study. Ferrari, D; Guaraldi, G; Mandreoli, F; Martoglia, R; Milic, J; and Missier, P. In DARLI workshop - Proceedings of the Workshops of the EDBT/ICDT 2020 Joint Conference, Copenhagen, Denmark, 2020. CEUR-WS
link bibtex abstract

@inproceedings{ferrari_data-driven_2020,
	address = {Copenhagen, Denmark},
	title = {Data-driven vs knowledge-driven inference of health outcomes in the ageing population: a case study},
	abstract = {Preventive, Predictive, Personalised and Participative (P4) medicine has the potential to not only vastly improve people's quality of life, but also to significantly reduce healthcare costs and improve its efficiency. Our research focuses on age-related diseases and explores the opportunities offered by a data-driven approach to predict wellness states of ageing individuals, in contrast to the commonly adopted knowledge-driven approach that relies on easy-to-interpret metrics manually introduced by clinical experts. This is done by means of machine learning models applied on the My Smart Age with HIV (MySAwH) dataset, which is collected through a relatively new approach especially for older HIV patient cohorts. This includes Patient Related Outcomes values from mobile smartphone apps and activity traces from commercial-grade activity loggers. Our results show better predictive performance for the data-driven approach. We also show that a \textit{post hoc} interpretation method applied to the predictive models can provide intelligible explanations that enable new forms of personalised and preventive medicine.},
	booktitle = {{DARLI} workshop - {Proceedings} of the {Workshops} of the {EDBT}/{ICDT} 2020 {Joint} {Conference}},
	publisher = {CEUR-WS},
	author = {Ferrari, D and Guaraldi, G and Mandreoli, F and Martoglia, R and Milic, J and Missier, P.},
	year = {2020},
	keywords = {\#machine learning, \#ageing, \#explainable models},
}

Abstracting PROV provenance graphs: A validity-preserving approach. Missier, P.; Bryans, J.; Gamble, C.; and Curcin, V. Future Generation Computer Systems, 111: 352 – 367. 2020.
doi link bibtex abstract 2 downloads

@article{missier_abstracting_2020,
	title = {Abstracting {PROV} provenance graphs: {A} validity-preserving approach},
	volume = {111},
	issn = {0167-739X},
	doi = {https://doi.org/10.1016/j.future.2020.05.015},
	abstract = {Data provenance is a structured form of metadata designed to record the activities and datasets involved in data production, as well as their dependency relationships. The PROV data model, released by the W3C in 2013, defines a schema and constraints that together provide a structural and semantic foundation for provenance. This enables the interoperable exchange of provenance between data producers and consumers. When the provenance content is sensitive and subject to disclosure restrictions, however, a way of hiding parts of the provenance in a principled way before communicating it to certain parties is required. In this paper we present a provenance abstraction operator that achieves this goal. It maps a graphical representation of a PROV document PG1 to a new abstract version PG2, ensuring that (i) PG2 is a valid PROV graph, and (ii) the dependencies that appear in PG2 are justified by those that appear in PG1. These two properties ensure that further abstraction of abstract PROV graphs is possible. A guiding principle of the work is that of minimum damage: the resultant graph is altered as little as possible, while ensuring that the two properties are maintained. The operator developed is implemented as part of a user tool, described in a separate paper, that lets owners of sensitive provenance information control the abstraction by specifying an abstraction policy.},
	journal = {Future Generation Computer Systems},
	author = {Missier, P. and Bryans, J. and Gamble, C. and Curcin, V.},
	year = {2020},
	keywords = {Provenance, Provenance abstraction, Provenance metadata},
	pages = {352 -- 367},
}

2019 (7)

Increasing phenotypic annotation improves the diagnostic rate of exome sequencing in a rare neuromuscular disorder. Thompson, R.; Papakonstantinou Ntalis, A.; Beltran, S.; Tapf, A.; de Paula Estephan, E.; Polavarapu, K.; ’t Hoen, P. A. C.; Missier, P.; and Lochmuller, H. Human Mutation. 2019.

Increasing phenotypic annotation improves the diagnostic rate of exome sequencing in a rare neuromuscular disorder [link]

Paper doi link bibtex abstract

@article{thompson_increasing_2019,
	title = {Increasing phenotypic annotation improves the diagnostic rate of exome sequencing in a rare neuromuscular disorder},
	url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/humu.23792},
	doi = {10.1002/humu.23792},
	abstract = {Abstract Phenotype-based filtering and prioritization contribute to the interpretation of genetic variants detected in exome sequencing. However, it is currently unclear how extensive this phenotypic annotation should be. In this study, we compare methods for incorporating phenotype into the interpretation process and assess the extent to which phenotypic annotation aids prioritization of the correct variant. Using a cohort of 29 patients with congenital myasthenic syndromes with causative variants in known or newly discovered disease genes, exome data and the Human Phenotype Ontology (HPO)-coded phenotypic profiles, we show that gene-list filters created from phenotypic annotations perform similarly to curated disease-gene virtual panels. We use Exomiser, a prioritization tool incorporating phenotypic comparisons, to rank candidate variants while varying phenotypic annotation. Analyzing 3,712 combinations, we show that increasing phenotypic annotation improved prioritization of the causative variant, from 62\% ranked first on variant alone to 90\% with seven HPO annotations. We conclude that any HPO-based phenotypic annotation aids variant discovery and that annotation with over five terms is recommended in our context. Although focused on a constrained cohort, this provides real-world validation of the utility of phenotypic annotation for variant prioritization. Further research is needed to extend this concept to other diseases and more diverse cohorts.},
	journal = {Human Mutation},
	author = {Thompson, Rachel and Papakonstantinou Ntalis, Anastasios and Beltran, Sergi and Tapf, Ana and de Paula Estephan, Eduardo and Polavarapu, Kiran and ’t Hoen, Peter A. C. and Missier, Paolo and Lochmuller, Hanns},
	year = {2019},
	keywords = {congenital myasthenic syndromes, deep phenotyping, diagnosis, exome sequencing, Exomiser, human phenotype ontology, variant prioritization},
}

Parametrised Data Sampling for Fairness Optimisation. Gonz�lez Zelaya, C. V.; Missier, P.; and Prangle, D. In Proceedings of Explainable AI for Fairness, Accountability & Transparency Workshop (KDD XAI), 2019. ACM
link bibtex

@inproceedings{gonzlez_zelaya_parametrised_2019,
	title = {Parametrised {Data} {Sampling} for {Fairness} {Optimisation}},
	booktitle = {Proceedings of {Explainable} {AI} for {Fairness}, {Accountability} \& {Transparency} {Workshop} ({KDD} {XAI})},
	publisher = {ACM},
	author = {Gonz�lez Zelaya, Carlos Vladimiro and Missier, Paolo and Prangle, Dennis},
	year = {2019},
}

Toward a Decentralized, Trust-less Marketplace for Brokered IoT Data Trading using Blockchain. Bajoudah, S.; Changyu, D.; and Missier, P. In Procs. 2nd IEEE International Conference on Blockchain (Blockchain 2019), Atlanta, USA, 2019. IEEE
link bibtex

@inproceedings{bajoudah_toward_2019,
	address = {Atlanta, USA},
	title = {Toward a {Decentralized}, {Trust}-less {Marketplace} for {Brokered} {IoT} {Data} {Trading} using {Blockchain}},
	booktitle = {Procs. 2nd {IEEE} {International} {Conference} on {Blockchain} ({Blockchain} 2019)},
	publisher = {IEEE},
	author = {Bajoudah, Shaimaa and Changyu, Dong and Missier, Paolo},
	year = {2019},
}

Efficient Re-computation of Big Data Analytics Processes in the Presence of Changes: Computational Framework, Reference Architecture, and Applications. Missier, P.; and Cala, J. In Procs. IEEE Big Data Congress, Milano, Italy, 2019. IEEE
link bibtex

@inproceedings{missier_efficient_2019,
	address = {Milano, Italy},
	title = {Efficient {Re}-computation of {Big} {Data} {Analytics} {Processes} in the {Presence} of {Changes}: {Computational} {Framework}, {Reference} {Architecture}, and {Applications}},
	booktitle = {Procs. {IEEE} {Big} {Data} {Congress}},
	publisher = {IEEE},
	author = {Missier, Paolo and Cala, Jacek},
	year = {2019},
	keywords = {\#provenance, \#re-computation, \#workflow},
}

A customisable pipeline for continuously harvesting socially-minded Twitter users. Primo, F.; Missier, P.; Romanovsky, A.; Mickael, F.; and Cacho, N. In procs. ICWE'19, Daedjeon, Korea, 2019.

Paper link bibtex abstract

@inproceedings{primo_customisable_2019,
	address = {Daedjeon, Korea},
	title = {A customisable pipeline for continuously harvesting socially-minded {Twitter} users},
	url = {https://arxiv.org/abs/1903.07061},
	abstract = {On social media platforms and Twitter in particular, specific classes of users such as influencers have been given satisfactory operational definitions in terms of network and content metrics. Others, for instance online activists, are not less important but their characterisation still requires experimenting. We make the hypothesis that such interesting users can be found within temporally and spatially localised contexts, i.e., small but topical fragments of the network containing interactions about social events or campaigns with a significant footprint on Twitter. To explore this hypothesis, we have designed a continuous user profile discovery pipeline that produces an ever-growing dataset of user profiles by harvesting and analysing contexts from the Twitter stream. The profiles dataset includes key network and content-based users metrics, enabling experimentation with user-defined score functions that characterise specific classes of online users. The paper describes the design and implementation of the pipeline and its empirical evaluation on a case study consisting of healthcare-related campaigns in the UK, showing how it supports the operational definitions of online activism, by comparing three experimental ranking functions. The code is publicly available.},
	booktitle = {procs. {ICWE}'19},
	author = {Primo, Flavio and Missier, Paolo and Romanovsky, Alexander and Mickael, Figueredo and Cacho, Nelio},
	year = {2019},
	keywords = {online activism, twitter analytics},
}

Why-Diff: Exploiting Provenance to Understand Outcome Differences from non-identical Reproduced Workflows. Thavasimani, P.; Cala, J.; and Missier, P. IEEE Access,1–1. 2019.

Why-Diff: Exploiting Provenance to Understand Outcome Differences from non-identical Reproduced Workflows [link]

Paper doi link bibtex abstract 5 downloads

@article{thavasimani_why-diff_2019,
	title = {Why-{Diff}: {Exploiting} {Provenance} to {Understand} {Outcome} {Differences} from non-identical {Reproduced} {Workflows}},
	issn = {2169-3536},
	url = {https://ieeexplore.ieee.org/document/8662612/},
	doi = {10.1109/ACCESS.2019.2903727},
	abstract = {Data analytics processes such as scientific workflows tend to be executed repeatedly, with varying dependencies and input datasets. The case has been made in the past for tracking the provenance of the final information products through the workflow steps, to enable their reproducibility. In this work, we explore the hypothesis that provenance traces recorded during execution are also instrumental to answering questions about the observed differences between sets of results obtained from similar but not identical workflow configurations. Such differences in configurations may be introduced deliberately, i.e., to explore process variations, or accidentally, typically as the result of porting efforts or of changes in the computing environment. Using a commonly used workflow programming model as a reference, we consider both structural variations in the workflows as well as variations within their individual components. Our whydiff algorithm compares the graph representations of two provenance traces derived from two workflow variations. It produces a delta graph that can be used to produce human-readable explanations of the impact of workflow differences on observed output differences. We report on our Neo4j graph database. We also report explanations of difference between workflow results using a suite of synthetic workflows as well as real-world workflows.},
	journal = {IEEE Access},
	author = {Thavasimani, Priyaa and Cala, Jacek and Missier, Paolo},
	year = {2019},
	keywords = {eScience Central, Provenance, Big Data, Why-Diff, Workflow, Reproducibility, Software, Alzheimer's disease, Databases, Genetics, Libraries, Sentiment analysis},
	pages = {1--1},
}

Targeted therapies for congenital myasthenic syndromes: systematic review and steps towards a treatabolome. Thompson, R.; Bonne, G.; Missier, P.; and Lochmüller, H. Emerging Topics in Life Sciences,ETLS20180100. January 2019.

Targeted therapies for congenital myasthenic syndromes: systematic review and steps towards a treatabolome [link]

Paper doi link bibtex abstract

@article{thompson_targeted_2019,
	title = {Targeted therapies for congenital myasthenic syndromes: systematic review and steps towards a treatabolome},
	url = {http://www.emergtoplifesci.org/content/early/2019/01/25/ETLS20180100.abstract},
	doi = {10.1042/ETLS20180100},
	abstract = {Despite recent scientific advances, most rare genetic diseases ? including most neuromuscular diseases ? do not currently have curative gene-based therapies available. However, in some cases, such as vitamin, cofactor or enzyme deficiencies, channelopathies and disorders of the neuromuscular junction, a confirmed genetic diagnosis provides guidance on treatment, with drugs available that may significantly alter the disease course, improve functional ability and extend life expectancy. Nevertheless, many treatable patients remain undiagnosed or do not receive treatment even after genetic diagnosis. The growth of computer-aided genetic analysis systems that enable clinicians to diagnose their undiagnosed patients has not yet been matched by genetics-based decision-support systems for treatment guidance. Generating a ?treatabolome' of treatable variants and the evidence for the treatment has the potential to increase treatment rates for treatable conditions. Here, we use the congenital myasthenic syndromes (CMS), a group of clinically and genetically heterogeneous but frequently treatable neuromuscular conditions, to illustrate the steps in the creation of a treatabolome for rare inherited diseases. We perform a systematic review of the evidence for pharmacological treatment of each CMS type, gathering evidence from 207 studies of over 1000 patients and stratifying by genetic defect, as treatment varies depending on the underlying cause. We assess the strength and quality of the evidence and create a dataset that provides the foundation for a computer-aided system to enable clinicians to gain easier access to information about treatable variants and the evidence they need to consider.3,4-DAP, 3,4-diaminopyridine; AChE, acetylcholinesterase; AChR, acetylcholine receptor; CEBM, Centre for evidence-based medicine; CMS, congenital myasthenic syndrome; NGS, next-generation sequencing; NMJ, neuromuscular junction},
	journal = {Emerging Topics in Life Sciences},
	author = {Thompson, Rachel and Bonne, Gisèle and Missier, Paolo and Lochmüller, Hanns},
	month = jan,
	year = {2019},
	pages = {ETLS20180100},
}

2018 (8)

Selective and Recurring Re-computation of Big Data Analytics Tasks: Insights from a Genomics Case Study. Cal�a, J.; and Missier, P. Big Data Research, 13: 76 – 94. 2018.

Selective and Recurring Re-computation of Big Data Analytics Tasks: Insights from a Genomics Case Study [link]

Paper doi link bibtex abstract 1 download

@article{cala_selective_2018,
	title = {Selective and {Recurring} {Re}-computation of {Big} {Data} {Analytics} {Tasks}: {Insights} from a {Genomics} {Case} {Study}},
	volume = {13},
	issn = {2214-5796},
	url = {http://www.sciencedirect.com/science/article/pii/S2214579617303520},
	doi = {https://doi.org/10.1016/j.bdr.2018.06.001},
	abstract = {The value of knowledge assets generated by analytics processes using Data Science techniques tends to decay over time, as a consequence of changes in the elements the process depends on: external data sources, libraries, and system dependencies. For large-scale problems, refreshing those outcomes through greedy re-computation is both expensive and inefficient, as some changes have limited impact. In this paper we address the problem of refreshing past process outcomes selectively, that is, by trying to identify the subset of outcomes that will have been affected by a change, and by only re-executing fragments of the original process. We propose a technical approach to address the selective re-computation problem by combining multiple techniques, and present an extensive experimental study in Genomics, namely variant calling and their clinical interpretation, to show its effectiveness. In this case study, we are able to decrease the number of required re-computations on a cohort of individuals from 495 (blind) down to 71, and that we can reduce runtime by at least 60\% relative to the naïve blind approach, and in some cases by 90\%. Starting from this experience, we then propose a blueprint for a generic re-computation meta-process that makes use of process history metadata to make informed decisions about selective re-computations in reaction to a variety of changes in the data.},
	journal = {Big Data Research},
	author = {Cal�a, Jacek and Missier, Paolo},
	year = {2018},
	keywords = {Big data analysis, Genomics, Knowledge decay, Re-computation},
	pages = {76 -- 94},
}

Versioned-PROV: A PROV extension to support mutable data entities. Pimentel, J. F.; Missier, P.; Murta, L.; and Braganholo, V. In Procs. IPAW 2018, London, 2018. Springer
link bibtex

@inproceedings{pimentel_versioned-prov_2018,
	address = {London},
	title = {Versioned-{PROV}: {A} {PROV} extension to support mutable data entities},
	booktitle = {Procs. {IPAW} 2018},
	publisher = {Springer},
	author = {Pimentel, Joao Felipe and Missier, Paolo and Murta, Leonardo and Braganholo, Vanessa},
	year = {2018},
	keywords = {\#provenance, \#recomputation, process re-computation, provenance annotations},
}

Editorial: Special Issue on Improving the Veracity and Value of Big Data. Geerts, F.; Missier, P.; and Paton, N. W. J. Data and Information Quality, 9(3): 13:1–13:2. 2018.

Editorial: Special Issue on Improving the Veracity and Value of Big Data [link]

Paper doi link bibtex

@article{geerts_editorial_2018,
	title = {Editorial: {Special} {Issue} on {Improving} the {Veracity} and {Value} of {Big} {Data}},
	volume = {9},
	url = {http://doi.acm.org/10.1145/3174791},
	doi = {10.1145/3174791},
	number = {3},
	journal = {J. Data and Information Quality},
	author = {Geerts, Floris and Missier, Paolo and Paton, Norman W.},
	year = {2018},
	pages = {13:1--13:2},
}

Design and evaluation of a genomics variant analysis pipeline using GATK Spark tools. Tucci, N.; Cala, J.; Steyn, J.; and Missier, P. In Procs. SEBD '18 – 26TH Italian Symposium on Advanced Database Systems, Bari, italy, 2018.
link bibtex abstract

@inproceedings{tucci_design_2018,
	address = {Bari, italy},
	title = {Design and evaluation of a genomics variant analysis pipeline using {GATK} {Spark} tools},
	abstract = {Scalable and efficient processing of genome sequence data, i.e. for variant discovery, is key to the mainstream adoption of High Throughput technology for disease prevention and for clinical use. Achieving scalability, however, requires a significant effort to enable the parallel execution of the analysis tools that make up the pipelines. This is facilitated by the new Spark versions of the well-known GATK toolkit, which offer a black-box approach by transparently exploiting the underlying Map Reduce architecture. In this paper we report on our experience implementing a standard variant discovery pipeline using GATK 4.0 with Docker-based deployment over a cluster. We provide a preliminary performance analysis, comparing the processing times and cost to those of the new Microsoft Genomics Services.},
	booktitle = {Procs. {SEBD} '18 – {26TH} {Italian} {Symposium} on {Advanced} {Database} {Systems}},
	author = {Tucci, Nicholas and Cala, Jacek and Steyn, Jannetta and Missier, Paolo},
	year = {2018},
	keywords = {\#genomics, \#spark},
}

Provenance Annotation and Analysis to Support Process Re-Computation. Cala, J.; and Missier, P. In Procs. IPAW 2018, London, 2018. Springer
link bibtex abstract

@inproceedings{cala_provenance_2018,
	address = {London},
	title = {Provenance {Annotation} and {Analysis} to {Support} {Process} {Re}-{Computation}},
	abstract = {Many resource-intensive analytics processes evolve over time following new versions of the reference datasets and software dependen- cies they use. We focus on scenarios in which any version change has the potential to affect many outcomes, as is the case for instance in high throughput genomics where the same process is used to analyse large cohorts of patient genomes, or cases. As any version change is unlikely to affect the entire population, an efficient strategy for restoring the cur- rency of the outcomes requires first to identify the scope of a change, i.e., the subset of affected data products. In this paper we describe a generic and reusable provenance-based approach to address this scope discovery problem. It applies to a scenario where the process consists of complex hierarchical components, where different input cases are processed using different version configurations of each component, and where separate provenance traces are collected for the executions of each of the com- ponents. We show how a new data structure, called a restart tree, is computed and exploited to manage the change scope discovery problem.},
	booktitle = {Procs. {IPAW} 2018},
	publisher = {Springer},
	author = {Cala, Jacek and Missier, Paolo},
	year = {2018},
	keywords = {\#provenance, \#recomputation, process re-computation, provenance annotations},
}

Analyzing Social Network Images with Deep Learning Models to Fight Zika Virus. Barros, H. P.; Lima, B. G. C.; Crispim, F. C.; Vieira, T.; Missier, P.; and Fonseca, B. In Procs. 15th International Conference on Image Analysis and Recognition (ICIAR'18), 2018.
link bibtex

@inproceedings{barros_analyzing_2018,
	title = {Analyzing {Social} {Network} {Images} with {Deep} {Learning} {Models} to {Fight} {Zika} {Virus}},
	booktitle = {Procs. 15th {International} {Conference} on {Image} {Analysis} and {Recognition} ({ICIAR}'18)},
	author = {Barros, H. Pedro and Lima, Bruno G. C. and Crispim, Felipe C. and Vieira, Tiago and Missier, Paolo and Fonseca, Baldoino},
	year = {2018},
	keywords = {\#zika},
}

VazaDengue: An information system for preventing and combating mosquito-borne diseases with social networks. Sousa, L.; Mello, R. d.; Cedrim, D.; Garcia, A.; Missier, P.; Uch�a, A.; Oliveira, A.; and Romanovsky, A. Information Systems, 75: 26 – 42. 2018.

Paper doi link bibtex

@article{sousa_vazadengue_2018,
	title = {{VazaDengue}: {An} information system for preventing and combating mosquito-borne diseases with social networks},
	volume = {75},
	issn = {0306-4379},
	url = {http://www.sciencedirect.com/science/article/pii/S030643791730618X},
	doi = {10.1016/j.is.2018.02.003},
	journal = {Information Systems},
	author = {Sousa, Leonardo and Mello, Rafael de and Cedrim, Diego and Garcia, Alessandro and Missier, Paolo and Uch�a, Anderson and Oliveira, Anderson and Romanovsky, Alexander},
	year = {2018},
	keywords = {Dengue, Mosquito, Social media, Surveillance, Tweets},
	pages = {26 -- 42},
}

Loom: Query-aware Partitioning of Online Graphs. Firth, H; and Missier, P In Procs. 21st International Conference on Extending Database Technology (EDBT), Vienna, Austria, 2018. EDBT

Loom: Query-aware Partitioning of Online Graphs [link]

Paper link bibtex abstract

@inproceedings{firth_loom_2018,
	address = {Vienna, Austria},
	title = {Loom: {Query}-aware {Partitioning} of {Online} {Graphs}},
	url = {http://edbticdt2018.at/},
	abstract = {As with general graph processing systems, partitioning data over a cluster of machines improves the scalability of graph database management systems. However, these systems will incur additional network cost during the execution of a query workload, due to inter- partition traversals. Workload-agnostic partitioning algorithms typically minimise the likelihood of any edge crossing partition boundaries. However, these partitioners are sub-optimal with re- spect to many workloads, especially queries, which may require more frequent traversal of speci c subsets of inter-partition edges. Furthermore, they largely unsuited to operating incrementally on dynamic, growing graphs. We present a new graph partitioning algorithm, Loom, that op- erates on a stream of graph updates and continuously allocates the new vertices and edges to partitions, taking into account a query workload of graph pattern expressions along with their relative frequencies. First we capture the most common patterns of edge traversals which occur when executing queries. We then compare sub-graphs, which present themselves incrementally in the graph update stream, against these common patterns. Finally we attempt to allocate each match to single partitions, reducing the number of inter-partition edges within frequently traversed sub-graphs and improving average query performance. Loom is extensively evaluated over several large test graphs with realistic query workloads and various orderings of the graph updates. We demonstrate that, given a workload, our prototype produces partitionings of signi cantly better quality than existing streaming graph partitioning algorithms Fennel \& LDG.},
	booktitle = {Procs. 21st {International} {Conference} on {Extending} {Database} {Technology} ({EDBT})},
	publisher = {EDBT},
	author = {Firth, H and Missier, P},
	year = {2018},
	keywords = {distributed graphs, graph partitioning},
}

2017 (8)

Why-Diff: Explaining differences amongst similar workflow runs by exploiting scientific metadata. Thavasimani, P.; Cala, J.; and Missier, P. In 2017 IEEE International Conference on Big Data, BigData 2017, Boston, MA, USA, December 11-14, 2017, pages 3031–3041, 2017.

Why-Diff: Explaining differences amongst similar workflow runs by exploiting scientific metadata [link]

Paper doi link bibtex

@inproceedings{thavasimani_why-diff_2017,
	title = {Why-{Diff}: {Explaining} differences amongst similar workflow runs by exploiting scientific metadata},
	url = {https://doi.org/10.1109/BigData.2017.8258275},
	doi = {10.1109/BigData.2017.8258275},
	booktitle = {2017 {IEEE} {International} {Conference} on {Big} {Data}, {BigData} 2017, {Boston}, {MA}, {USA}, {December} 11-14, 2017},
	author = {Thavasimani, Priyaa and Cala, Jacek and Missier, Paolo},
	year = {2017},
	pages = {3031--3041},
}

Adaptive Incremental Learning for Statistical Relational Models Using Gradient-Based Boosting. Gu, Y.; and Missier, P. In Procs. ILP '17, 27th International Conference on Inductive Logic Programming (late-breaking paper), Orleans, France, 2017. CEUR-WS

Adaptive Incremental Learning for Statistical Relational Models Using Gradient-Based Boosting [pdf]

Paper link bibtex abstract

@inproceedings{gu_adaptive_2017,
	address = {Orleans, France},
	title = {Adaptive {Incremental} {Learning} for {Statistical} {Relational} {Models} {Using} {Gradient}-{Based} {Boosting}},
	url = {https://ilp2017.sciencesconf.org/data/pages/ILP_2017_paper_27.pdf},
	abstract = {We consider the problem of incrementally learning models from relational data. Most existing learning methods for statistical relational models use batch learning, which becomes computationally expensive and eventually infeasible for large datasets. The majority of the previous work in relational incremental learning assumes the model's structure is given and only the model's parameters needed to be learned. In this paper, we propose algorithms that can incrementally learn the model's parameters and structure simultaneously. These algorithms are based on the successful formalisation of the relational functional gradient boosting system (RFGB), and extend the classical propositional ensemble methods to relational learning for handling evolving data streams.},
	booktitle = {Procs. {ILP} '17, 27th {International} {Conference} on {Inductive} {Logic} {Programming} (late-breaking paper)},
	publisher = {CEUR-WS},
	author = {Gu, Yulong and Missier, Paolo},
	year = {2017},
}

Mind My Value: a Decentralized Infrastructure for Fair and Trusted IoT Data Trading. Missier, P.; Bajoudah, S.; Capossele, A.; Gaglione, A.; and Nati, M. In Procs. 7th International Conference on the Internet of Things, Linz,Austria, 2017.

Mind My Value: a Decentralized Infrastructure for Fair and Trusted IoT Data Trading [link]

Paper link bibtex abstract

@inproceedings{missier_mind_2017,
	address = {Linz,Austria},
	title = {Mind {My} {Value}: a {Decentralized} {Infrastructure} for {Fair} and {Trusted} {IoT} {Data} {Trading}},
	url = {http://iot-conference.org/iot2017/},
	abstract = {Internet of Things (IoT) data are increasingly viewed as a new form of massively distributed and large scale digital assets, which are continuously generated by millions of connected devices. The real value of such assets can only be realized by allowing IoT data trading to occur on a marketplace that rewards every single producer and consumer, at a very granular level. Crucially, we believe that such a marketplace should not be owned by anybody, and should instead fairly and transparently self-enforce a well defined set of governance rules. In this paper we address some of the technical challenges involved in realizing such a marketplace. We leverage emerging blockchain technologies to build a decentralized, trusted, transparent and open architecture for IoT traffic metering and contract compliance, on top of the largely adopted IoT brokered data infrastructure. We discuss an Ethereum-based prototype implementation and experimentally evaluate the overhead cost associated with Smart Contract transactions, concluding that a viable business model can indeed be associated with our technical approach.},
	booktitle = {Procs. 7th {International} {Conference} on the {Internet} of {Things}},
	author = {Missier, Paolo and Bajoudah, Shaimaa and Capossele, Angelo and Gaglione, Andrea and Nati, Michele},
	year = {2017},
	keywords = {\#IoT, \#marketplace},
}

Preserving the value of large scale data analytics over time through selective re-computation. Missier, P.; Cala, J.; and Rathi, M. In Procs. 31st British International Conference on Databases - BICOD, 2017.
link bibtex

@inproceedings{missier_preserving_2017,
	title = {Preserving the value of large scale data analytics over time through selective re-computation},
	booktitle = {Procs. 31st {British} {International} {Conference} on {Databases} - {BICOD}},
	author = {Missier, Paolo and Cala, Jacek and Rathi, Manisha},
	year = {2017},
}

Recruiting from the Network: Discovering Twitter Users Who Can Help Combat Zika Epidemics. Missier, P.; McClean, C.; Carlton, J.; Cedrim, D.; Silva, L.; Garcia, A.; Plastino, A.; and Romanovsky, A. In Cabot, J.; De Virgilio, R.; and Torlone, R., editor(s), Web Engineering: 17th International Conference, ICWE 2017, Rome, Italy, June 5-8, 2017, Proceedings, pages 437–445, Roma, Italy, 2017. Springer International Publishing

Paper doi link bibtex

@inproceedings{missier_recruiting_2017,
	address = {Roma, Italy},
	title = {Recruiting from the {Network}: {Discovering} {Twitter} {Users} {Who} {Can} {Help} {Combat} {Zika} {Epidemics}},
	isbn = {978-3-319-60131-1},
	url = {http://dx.doi.org/10.1007/978-3-319-60131-1_30},
	doi = {10.1007/978-3-319-60131-1_30},
	booktitle = {Web {Engineering}: 17th {International} {Conference}, {ICWE} 2017, {Rome}, {Italy}, {June} 5-8, 2017, {Proceedings}},
	publisher = {Springer International Publishing},
	author = {Missier, Paolo and McClean, Callum and Carlton, Jonathan and Cedrim, Diego and Silva, Leonardo and Garcia, Alessandro and Plastino, Alexandre and Romanovsky, Alexander},
	editor = {Cabot, Jordi and De Virgilio, Roberto and Torlone, Riccardo},
	year = {2017},
	pages = {437--445},
}

Provenance Standards. Missier, P. In Liu, L.; and Özsu, M T., editor(s), Encyclopedia of Database Systems, pages 1–8. Springer New York, New York, NY, 2017.

Paper doi link bibtex

@incollection{missier_provenance_2017,
	address = {New York, NY},
	title = {Provenance {Standards}},
	isbn = {978-1-4899-7993-3},
	url = {https://doi.org/10.1007/978-1-4899-7993-3_80749-1},
	booktitle = {Encyclopedia of {Database} {Systems}},
	publisher = {Springer New York},
	author = {Missier, Paolo},
	editor = {Liu, Ling and Özsu, M Tamer},
	year = {2017},
	doi = {10.1007/978-1-4899-7993-3_80749-1},
	pages = {1--8},
}

TAPER: query-aware, partition-enhancement for large, heterogenous graphs. Firth, H.; and Missier, P. Distributed and Parallel Databases,1–31. 2017.

TAPER: query-aware, partition-enhancement for large, heterogenous graphs [link]

Paper doi link bibtex abstract

@article{firth_taper_2017,
	title = {{TAPER}: query-aware, partition-enhancement for large, heterogenous graphs},
	issn = {1573-7578},
	url = {http://dx.doi.org/10.1007/s10619-017-7196-y},
	doi = {10.1007/s10619-017-7196-y},
	abstract = {Graph partitioning has long been seen as a viable approach to addressing Graph DBMS scalability. A partitioning, however, may introduce extra query processing latency unless it is sensitive to a specific query workload, and optimised to minimise inter-partition traversals for that workload. Additionally, it should also be possible to incrementally adjust the partitioning in reaction to changes in the graph topology, the query workload, or both. Because of their complexity, current partitioning algorithms fall short of one or both of these requirements, as they are designed for offline use and as one-off operations. The TAPER system aims to address both requirements, whilst leveraging existing partitioning algorithms. TAPER takes any given initial partitioning as a starting point, and iteratively adjusts it by swapping chosen vertices across partitions, heuristically reducing the probability of inter-partition traversals for a given path queries workload. Iterations are inexpensive thanks to time and space optimisations in the underlying support data structures. We evaluate TAPER on two different large test graphs and over realistic query workloads. Our results indicate that, given a hash-based partitioning, TAPER reduces the number of inter-partition traversals by \$\${\textbackslash}textbackslashbackslashsim \$\$ ? 80\%; given an unweighted Metis partitioning, by \$\${\textbackslash}textbackslashbackslashsim \$\$ ? 30\%. These reductions are achieved within eight iterations and with the additional advantage of being workload-aware and usable online.},
	journal = {Distributed and Parallel Databases},
	author = {Firth, Hugo and Missier, Paolo},
	year = {2017},
	pages = {1--31},
}

Revealing the Detailed Lineage of Script Outputs using Hybrid Provenance. Zhang, Q.; Cao, Y.; Wang, Q.; Vu, D.; Thavasimani, P.; McPhillips, T.; Missier, P.; Slaughter, P.; Jones, C.; Jones, M. B; and Ludascher, B. In Procs. 11th Intl. Digital Curation Conference (IDCC), Edinburgh, Scotland, UK, 2017. Digital Curation Center
link bibtex abstract

@inproceedings{zhang_revealing_2017,
	address = {Edinburgh, Scotland, UK},
	title = {Revealing the {Detailed} {Lineage} of {Script} {Outputs} using {Hybrid} {Provenance}},
	abstract = {We illustrate how combining retrospective and prospective provenance can yield scientifically meaningful hybrid provenance representations of the computational histories of data produced during a script run. We use scripts from multiple disciplines (astrophysics, climate science, biodiversity data curation, and social network analysis), implemented in Python, R, and MATLAB, to highlight the usefulness of diverse forms of retrospective provenance when coupled with prospective provenance. Users provide prospective provenance (i.e., the conceptual workflows latent in scripts) via simple YesWorkflow annotations, embedded as script comments. Runtime observables, hidden in filenames or folder structures, recorded in log-files, or automatically captured using tools such as noWorkflow or the DataONE RunManagers can be linked to prospective provenance via relational views and queries. The YesWorkflow toolkit, example scripts, and demonstration code are available via an open source repository.},
	booktitle = {Procs. 11th {Intl}. {Digital} {Curation} {Conference} ({IDCC})},
	publisher = {Digital Curation Center},
	author = {Zhang, Qian and Cao, Yang and Wang, Qiwen and Vu, Duc and Thavasimani, Priyaa and McPhillips, Tim and Missier, Paolo and Slaughter, Peter and Jones, Christopher and Jones, Matthew B and Ludascher, Bertram},
	year = {2017},
	keywords = {\#provenance},
}

2016 (10)

Facilitating reproducible research by investigating computational metadata. Thavasimani, P.; and Missier, P. In 2016 IEEE International Conference on Big Data, BigData 2016, Washington DC, USA, December 5-8, 2016, pages 3045–3051, 2016.

Facilitating reproducible research by investigating computational metadata [link]

Paper doi link bibtex 1 download

@inproceedings{thavasimani_facilitating_2016,
	title = {Facilitating reproducible research by investigating computational metadata},
	url = {https://doi.org/10.1109/BigData.2016.7840958},
	doi = {10.1109/BigData.2016.7840958},
	booktitle = {2016 {IEEE} {International} {Conference} on {Big} {Data}, {BigData} 2016, {Washington} {DC}, {USA}, {December} 5-8, 2016},
	author = {Thavasimani, Priyaa and Missier, Paolo},
	year = {2016},
	pages = {3045--3051},
}

Clustering Provenance Facilitating Provenance Exploration Through Data Abstraction. Karsai, L.; Fekete, A.; Kay, J.; and Missier, P. In Proceedings of the Workshop on Human-In-the-Loop Data Analytics, of HILDA '16, pages 6:1–6:5, New York, NY, USA, 2016. ACM

Clustering Provenance Facilitating Provenance Exploration Through Data Abstraction [link]

Paper doi link bibtex

@inproceedings{karsai_clustering_2016,
	address = {New York, NY, USA},
	series = {{HILDA} '16},
	title = {Clustering {Provenance} {Facilitating} {Provenance} {Exploration} {Through} {Data} {Abstraction}},
	isbn = {978-1-4503-4207-0},
	url = {http://doi.acm.org/10.1145/2939502.2939508},
	doi = {10.1145/2939502.2939508},
	booktitle = {Proceedings of the {Workshop} on {Human}-{In}-the-{Loop} {Data} {Analytics}},
	publisher = {ACM},
	author = {Karsai, Linus and Fekete, Alan and Kay, Judy and Missier, Paolo},
	year = {2016},
	keywords = {provenance, large-scale graphs, visualisation},
	pages = {6:1--6:5},
}

Alan Turing Intitute Symposium on Reproducibioity for Data-Intensive Research – Final Report. Burgess, L. C; Crotty, D.; de Roure, D.; Gibbons, J.; Goble, C.; Missier, P.; Mortier, R.; Nichols, T. E; and O�Beirne, R. . 2016.

Alan Turing Intitute Symposium on Reproducibioity for Data-Intensive Research – Final Report [link]

Paper link bibtex

@article{burgess_alan_2016,
	title = {Alan {Turing} {Intitute} {Symposium} on {Reproducibioity} for {Data}-{Intensive} {Research} – {Final} {Report}},
	url = {https://dx.doi.org/10.6084/m9.figshare.3487382},
	author = {Burgess, Lucie C and Crotty, David and de Roure, David and Gibbons, Jeremy and Goble, Carole and Missier, Paolo and Mortier, Richard and Nichols, Thomas E and O�Beirne, Richard},
	year = {2016},
}

The lifecycle of provenance metadata and its associated challenges and opportunities. Missier, P. In Lemieux, V., editor(s), Building Trust in Financial Information - Perspectives on the Frontiers of Provenance., volume Springer. Springer, 2016.

The lifecycle of provenance metadata and its associated challenges and opportunities [link]

Paper link bibtex abstract

@incollection{missier_lifecycle_2016,
	title = {The lifecycle of provenance metadata and its associated challenges and opportunities},
	volume = {Springer},
	url = {http://arxiv.org/abs/1605.01229},
	abstract = {This chapter outlines some of the challenges and opportunities associated with adopting provenance principles [CFLV12] and stan- dards [MGC+15] in a variety of disciplines, including data publication and reuse, and information sciences.},
	booktitle = {Building {Trust} in {Financial} {Information} - {Perspectives} on the {Frontiers} of {Provenance}.},
	publisher = {Springer},
	author = {Missier, Paolo},
	editor = {Lemieux, Victoria},
	year = {2016},
	keywords = {\#provenance},
}

The data, they are a-changin'. Missier, P.; Cala, J.; and Wijaya, E. In Cohen-Boulakia, S., editor(s), Proc. TAPP'16 (Theory and Practice of Provenance), Washington D.C., USA, 2016. USENIX Association

Paper link bibtex abstract

@inproceedings{missier_data_2016,
	address = {Washington D.C., USA},
	title = {The data, they are a-changin'},
	url = {https://arxiv.org/abs/1604.06412},
	abstract = {The cost of deriving actionable knowledge from large datasets has been decreasing thanks to a convergence of positive fac- tors: low cost data generation, inexpensively scalable stor- age and processing infrastructure (cloud), software frame- works and tools for massively distributed data processing, and parallelisable data analytics algorithms. One observa- tion that is often overlooked, however, is that each of these elements is not immutable, rather they all evolve over time. This suggests that the value of such derivative knowledge may decay over time, unless it is preserved by reacting to those changes. Our broad research goal is to develop mod- els, methods, and tools for selectively reacting to changes by balancing costs and benefits, i.e. through complete or partial re-computation of some of the underlying processes. In this paper we present an initial model for reasoning about change and re-computations, and show how analysis of detailed provenance of derived knowledge informs re-computation decisions. We illustrate the main ideas through a real-world case study in genomics, namely on the interpretation of hu- man variants in support of genetic diagnosis.},
	booktitle = {Proc. {TAPP}'16 ({Theory} and {Practice} of {Provenance})},
	publisher = {USENIX Association},
	author = {Missier, Paolo and Cala, Jacek and Wijaya, Eldarina},
	editor = {Cohen-Boulakia, Sarah},
	year = {2016},
	keywords = {\#provenance, \#re-computation, \#big data processing, \#data change},
}

Analyzing Provenance across Heterogeneous Provenance Graphs. Oliveira, W.; Missier, P.; Ocana, K.; de Oliveira, D.; and Braganholo, V. In Procs. IPAW 2016, Washington D.C., USA, 2016. Springer
link bibtex abstract

@inproceedings{oliveira_analyzing_2016,
	address = {Washington D.C., USA},
	title = {Analyzing {Provenance} across {Heterogeneous} {Provenance} {Graphs}},
	abstract = {Provenance generated by different workflow systems is generally ex- pressed using different formats. This is not an issue when scientists analyze provenance graphs in isolation, or when they use the same workflow system. However, when analyzing heterogeneous provenance graphs from multiple systems poses a challenge. To address this problem we adopt ProvONE as an integration model, and show how different provenance databases can be con- verted to a global ProvONE schema. Scientists can then query this integrated database, exploring and linking provenance across several different workflows that may represent different implementations of the same experiment. To illus- trate the feasibility of our approach, we developed conceptual mappings be- tween the provenance databases of two workflow systems (e-Science Central and SciCumulus). We provide cartridges that implement these mappings and generate an integrated provenance database expressed as Prolog facts. To demonstrate its usage, we have developed Prolog rules that enable scientists to query the integrated database.},
	booktitle = {Procs. {IPAW} 2016},
	publisher = {Springer},
	author = {Oliveira, Wellington and Missier, Paolo and Ocana, Kary and de Oliveira, Daniel and Braganholo, Vanessa},
	year = {2016},
	keywords = {\#provenance},
}

Tracking Dengue Epidemics using Twitter Content Classification and Topic Modelling. Missier, P.; Romanovsky, A; Miu, T; Pal, A; Daniilakis, M; Garcia, A; Cedrim, D; and Sousa, L In Procs. SoWeMine workshop, co-located with ICWE 2016, Lugano, Switzerland, 2016.

Paper link bibtex abstract

@inproceedings{missier_tracking_2016,
	address = {Lugano, Switzerland},
	title = {Tracking {Dengue} {Epidemics} using {Twitter} {Content} {Classification} and {Topic} {Modelling}},
	url = {http://arxiv.org/abs/1605.00968},
	abstract = {Detecting and preventing outbreaks of mosquito-borne diseases such as Dengue and Zika in Brasil and other tropical regions has long been a priority for governments in affected areas. Streaming social media content, such as Twit- ter, is increasingly being used for health vigilance applications such as flu detec- tion. However, previous work has not addressed the complexity of drastic sea- sonal changes on Twitter a across multiple epidemic outbreaks. In order to address this gap, this paper contrasts two complementary approaches to detecting Twitter content that is relevant for Dengue outbreak detection, namely supervised classification and unsupervised clustering using topic modelling. Each approach has benefits and shortcomings. Our classifier achieves a prediction accuracy of about 80\% based on a small training set of about 1,000 instances, but the need for manual annotation makes it hard to track seasonal changes in the nature of the epidemics, such as the emergence of new types of virus in certain geographical locations. In contrast, LDA-based topic modelling scales well, generating cohe- sive and well-separated clusters from larger samples. While clusters can be easily re-generated following changes in epidemics, however, this approach makes it hard to clearly segregate relevant tweets into well-defined clusters.},
	booktitle = {Procs. {SoWeMine} workshop, co-located with {ICWE} 2016},
	author = {Missier, Paolo and Romanovsky, A and Miu, T and Pal, A and Daniilakis, M and Garcia, A and Cedrim, D and Sousa, L},
	year = {2016},
	keywords = {\#social media analytics, \#twitter analytics},
}

Workload-aware streaming graph partitioning. Firth, H.; and Missier, P. In Procs. GraphQ Workshop, co-located with EDBT'16, Bordeaux, 2016.
link bibtex

@inproceedings{firth_workload-aware_2016,
	address = {Bordeaux},
	title = {Workload-aware streaming graph partitioning},
	booktitle = {Procs. {GraphQ} {Workshop}, co-located with {EDBT}'16},
	author = {Firth, Hugo and Missier, Paolo},
	year = {2016},
}

Data trajectories: tracking reuse of published data for transitive credit attribution. Missier, P. International Journal of Digital Curation, 11(1): 1–16. 2016.

Data trajectories: tracking reuse of published data for transitive credit attribution [link]

Paper doi link bibtex abstract

@article{missier_data_2016-1,
	title = {Data trajectories: tracking reuse of published data for transitive credit attribution},
	volume = {11},
	url = {http://bibbase.org/network/publication/missier-datatrajectoriestrackingreuseofpublisheddatafortransitivecreditattribution-2016},
	doi = {doi:10.2218/ijdc.v11i1.425},
	abstract = {The ability to measure the use and impact of published data sets is key to the success of the open data / open science paradigm. A direct measure of impact would require tracking data (re)use in the wild, which however is difficult to achieve. This is therefore commonly replaced by simpler metrics based on data download and citation counts. In this paper we describe a scenario where it is possible to track the trajectory of a dataset after its publication, and we show how this enables the design of accurate models for ascribing credit to data originators. A Data Trajectory (DT) is a graph that encodes knowledge of how, by whom, and in which context data has been re-used, possibly after several generations. We provide a theoretical model of DTs that is grounded in the W3C PROV data model for provenance, and we show how DTs can be used to automatically propagate a fraction of the credit associated with transitively derived datasets, back to original data contributors. We also show this model of transitive credit in action by means of a Data Reuse Simulator. Ultimately, our hope is that, in the longer term, credit models based on direct measures of data reuse will provide further incentives to data publication. We conclude by outlining a research agenda to address the hard questions of creating, collecting, and using DTs systematically across a large number of data reuse instances, in the wild.},
	number = {1},
	journal = {International Journal of Digital Curation},
	author = {Missier, Paolo},
	year = {2016},
	keywords = {provenance, data reuse, data trajectories},
	pages = {1--16},
}

Scalable and Efficient Whole-exome Data Processing Using Workflows on the Cloud. Cala, J.; Marei, E.; Yu, Y.; Takeda, K.; and Missier, P. Future Generation Computer Systems, In press(Special Issue: Big Data in the Cloud - Best paper award at the FGCS forum 2016). 2016.
link bibtex abstract

@article{cala_scalable_2016,
	title = {Scalable and {Efficient} {Whole}-exome {Data} {Processing} {Using} {Workflows} on the {Cloud}},
	volume = {In press},
	abstract = {Dataflow-style workflows offer a simple, high-level programming model for flexible prototyping of scientific applications as an attractive alternative to low-level scripting. At the same time, workflow management systems (WFMS) may support data parallelism over big datasets by providing scalable, distributed deployment and execution of the workflow over a cloud infrastructure. In theory, the combination of these properties makes workflows a natural choice for implementing Big Data processing pipelines, common for instance in bioinformatics. In practice, however, correct workflow design for parallel Big Data problems can be complex and very time-consuming. In this paper we present our experience in porting a genomics data processing pipeline from an existing scripted implementation deployed on a closed HPC cluster, to a workflow-based design deployed on the Microsoft Azure public cloud. We draw two contrasting and general conclusions from this project. On the positive side, we show that our solution based on the e-Science Central WFMS and deployed in the cloud clearly outperforms the original HPC-based implementation achieving up to 2.3x speed-up. However, in order to deliver such performance we describe the importance of optimising the workflow deployment model to best suit the characteristics of the cloud computing infrastructure. The main reason for the performance gains was the availability of fast, node-local SSD disks delivered by D-series Azure VMs combined with the implicit use of local disk resources by e-Science Central workflow engines. These conclusions suggest that, on parallel Big Data problems, it is important to couple understanding of the cloud computing architecture and its software stack with simplicity of design, and that further efforts in automating parallelisation of complex pipelines are required.},
	number = {Special Issue: Big Data in the Cloud - Best paper award at the FGCS forum 2016},
	journal = {Future Generation Computer Systems},
	author = {Cala, Jacek and Marei, Eyad and Yu, Yaobo and Takeda, Kenji and Missier, Paolo},
	year = {2016},
	keywords = {workflow, Performance analysis, Cloud computing, HPC, Whole-exome sequencing, Workflow-based application, cloud, genomics, ?},
}

2015 (4)

Bootstrapping Personalised Human Activity Recognition Models Using Online Active Learning. Miu, T.; Missier, P.; and Plötz, T. In Proceedings of the 14th IEEE International Conference on Ubiquitous Computing and Communications, 2015.
link bibtex

@inproceedings{miu_bootstrapping_2015,
	title = {Bootstrapping {Personalised} {Human} {Activity} {Recognition} {Models} {Using} {Online} {Active} {Learning}},
	booktitle = {Proceedings of the 14th {IEEE} {International} {Conference} on {Ubiquitous} {Computing} and {Communications}},
	author = {Miu, T. and Missier, P. and Plötz, T.},
	year = {2015},
}

SVI: a simple single-nucleotide Human Variant Interpretation tool for Clinical Use. Missier, P.; Wijaya, E.; Kirby, R.; and Keogh, M. In Procs. 11th International conference on Data Integration in the Life Sciences, Los Angeles, CA, 2015. Springer
link bibtex

@inproceedings{missier_svi_2015,
	address = {Los Angeles, CA},
	title = {{SVI}: a simple single-nucleotide {Human} {Variant} {Interpretation} tool for {Clinical} {Use}},
	booktitle = {Procs. 11th {International} conference on {Data} {Integration} in the {Life} {Sciences}},
	publisher = {Springer},
	author = {Missier, Paolo and Wijaya, Eldarina and Kirby, Ryan and Keogh, Michael},
	year = {2015},
	keywords = {\#NGS, \#variant interpretation},
}

Access control and view generation for provenance graphs. Danger, R.; Curcin, V.; Missier, P.; and Bryans, J. Future Generation Computer Systems, 49: 8–27. February 2015.

Access control and view generation for provenance graphs [link]

Paper doi link bibtex abstract

@article{danger_access_2015,
	title = {Access control and view generation for provenance graphs},
	volume = {49},
	issn = {0167739X},
	url = {http://www.sciencedirect.com/science/article/pii/S0167739X1500031X},
	doi = {10.1016/j.future.2015.01.014},
	abstract = {Data provenance refers to the knowledge about data sources and operations carried out to obtain some piece of data. A provenance-enabled system maintains record of the interoperation of processes across different modules, stages and authorities to capture the full lineage of the resulting data, and typically allows data-focused audits using semantic technologies, such as ontologies, that capture domain knowledge. However, regulating access to captured provenance data is a non-trivial problem, since execution records form complex, overlapping graphs with individual nodes possibly being subject to different access policies. Applying traditional access control to provenance queries can either hide from the user the entire graph with nodes that had access to them denied, reveal too much information, or return a semantically invalid graph. An alternative approach is to answer queries with a new graph that abstracts over the missing nodes and fragments. In this paper, we present TACLP, an access control language for provenance data that supports this approach, together with an algorithm that transforms graphs according to sets of access restrictions. The algorithm produces safe and valid provenance graphs that retain the maximum amount of information allowed by the security model. The approach is demonstrated on an example of restricting access to a clinical trial provenance trace.},
	journal = {Future Generation Computer Systems},
	author = {Danger, Roxana and Curcin, Vasa and Missier, Paolo and Bryans, Jeremy},
	month = feb,
	year = {2015},
	keywords = {Provenance, Access Control Language, Semantic Web},
	pages = {8--27},
}

Recent advances in Scalable Workflow Enactment Engines and Technologies. Hidders, J.; Missier, P.; and Sroka, J. Future Generation Computer Systems, 46: 1–2. May 2015.

Recent advances in Scalable Workflow Enactment Engines and Technologies [link]

Paper doi link bibtex

@article{hidders_recent_2015,
	title = {Recent advances in {Scalable} {Workflow} {Enactment} {Engines} and {Technologies}},
	volume = {46},
	issn = {0167739X},
	url = {http://www.sciencedirect.com/science/article/pii/S0167739X15000047},
	doi = {10.1016/j.future.2015.01.003},
	journal = {Future Generation Computer Systems},
	author = {Hidders, Jan and Missier, Paolo and Sroka, Jacek},
	month = may,
	year = {2015},
	pages = {1--2},
}

2014 (9)

On Strategies for Budget-based Online Annotation in Human Activity Recognition. Miu, T.; Plötz, T.; Missier, P.; and Roggen, D. In Proceedings of the 2014 ACM International Joint Conference on Pervasive and Ubiquitous Computing: Adjunct Publication, of UbiComp '14 Adjunct, pages 767–776, New York, NY, USA, 2014. ACM

On Strategies for Budget-based Online Annotation in Human Activity Recognition [link]

Paper doi link bibtex

@inproceedings{miu_strategies_2014,
	address = {New York, NY, USA},
	series = {{UbiComp} '14 {Adjunct}},
	title = {On {Strategies} for {Budget}-based {Online} {Annotation} in {Human} {Activity} {Recognition}},
	isbn = {978-1-4503-3047-3},
	url = {http://doi.acm.org/10.1145/2638728.2641300},
	doi = {10.1145/2638728.2641300},
	booktitle = {Proceedings of the 2014 {ACM} {International} {Joint} {Conference} on {Pervasive} and {Ubiquitous} {Computing}: {Adjunct} {Publication}},
	publisher = {ACM},
	author = {Miu, Tudor and Plötz, Thomas and Missier, Paolo and Roggen, Daniel},
	year = {2014},
	keywords = {activity recognition, online learning, budget-based annotation},
	pages = {767--776},
}

Forget Dimensions: Define Your Information Quality Using Quality View Patterns. Embury, S.; and Missier, P. In Floridi, L.; and Illari, P., editor(s), The Philosophy of Information Quality SE - 3, volume 358, of Synthese Library, pages 25–41. Springer International Publishing, 2014.

Forget Dimensions: Define Your Information Quality Using Quality View Patterns [link]

Paper doi link bibtex

@incollection{embury_forget_2014,
	series = {Synthese {Library}},
	title = {Forget {Dimensions}: {Define} {Your} {Information} {Quality} {Using} {Quality} {View} {Patterns}},
	volume = {358},
	isbn = {978-3-319-07120-6},
	url = {http://dx.doi.org/10.1007/978-3-319-07121-3_3},
	language = {English},
	booktitle = {The {Philosophy} of {Information} {Quality} {SE} - 3},
	publisher = {Springer International Publishing},
	author = {Embury, SuzanneM. and Missier, Paolo},
	editor = {Floridi, Luciano and Illari, Phyllis},
	year = {2014},
	doi = {10.1007/978-3-319-07121-3_3},
	keywords = {\#information quality},
	pages = {25--41},
}

Measuring the impact of cognitive distractions on driving performance using time series analysis. Garcia-Constantino, M.; Missier, P.; and Guo, P. B. A. W. In Procs. IEEE conference on Intelligent Transport Systems (ITSC'14), August 2014.

Measuring the impact of cognitive distractions on driving performance using time series analysis [link]

Paper link bibtex abstract

@inproceedings{garcia-constantino_measuring_2014,
	title = {Measuring the impact of cognitive distractions on driving performance using time series analysis},
	url = {http://arxiv.org/abs/1408.5573},
	abstract = {Using current sensing technology, a wealth of data on driving sessions is potentially available through a combination of vehicle sensors and drivers' physiology sensors (heart rate, breathing rate, skin temperature, etc.). Our hypothesis is that it should be possible to exploit the combination of time series produced by such multiple sensors during a driving session, in order to (i) learn models of normal driving behaviour, and (ii) use such models to detect important and potentially dangerous deviations from the norm in real-time, and thus enable the generation of appropriate alerts. Crucially, we believe that such models and interventions should and can be personalised and tailor-made for each individual driver. As an initial step towards this goal, in this paper we present techniques for assessing the impact of cognitive distraction on drivers, based on simple time series analysis. We have tested our method on a rich dataset of driving sessions, carried out in a professional simulator, involving a panel of volunteer drivers. Each session included a different type of cognitive distraction, and resulted in multiple time series from a variety of on-board sensors as well as sensors worn by the driver. Crucially, each driver also recorded an initial session with no distractions. In our model, such initial session provides the baseline times series that make it possible to quantitatively assess driver performance under distraction conditions.},
	booktitle = {Procs. {IEEE} conference on {Intelligent} {Transport} {Systems} ({ITSC}'14)},
	author = {Garcia-Constantino, Matias and Missier, Paolo and Guo, Phil Blytheand Amy Weihong},
	month = aug,
	year = {2014},
	keywords = {\#ITS},
}

Tweet My Street: A Cross-Disciplinary Collaboration for the Analysis of Local Twitter Data. Mearns, G.; Simmonds, R.; Richardson, R.; Turner, M.; Watson, P.; and Missier, P. Future Internet, 6(2): 378–396. 2014.

Paper doi link bibtex 1 download

@article{mearns_tweet_2014,
	title = {Tweet {My} {Street}: {A} {Cross}-{Disciplinary} {Collaboration} for the {Analysis} of {Local} {Twitter} {Data}},
	volume = {6},
	issn = {1999-5903},
	url = {http://www.mdpi.com/1999-5903/6/2/378},
	doi = {10.3390/fi6020378},
	number = {2},
	journal = {Future Internet},
	author = {Mearns, Graeme and Simmonds, Rebecca and Richardson, Ranald and Turner, Mark and Watson, Paul and Missier, Paolo},
	year = {2014},
	pages = {378--396},
}

ProvGen: generating synthetic PROV graphs with predictable structure. Firth, H.; and Missier, P. In Procs. IPAW 2014 (Provenance and Annotations), Koln, Germany, 2014. Springer
link bibtex 12 downloads

@inproceedings{firth_provgen_2014,
	address = {Koln, Germany},
	title = {{ProvGen}: generating synthetic {PROV} graphs with predictable structure},
	booktitle = {Procs. {IPAW} 2014 ({Provenance} and {Annotations})},
	publisher = {Springer},
	author = {Firth, Hugo and Missier, Paolo},
	year = {2014},
}

ProvAbs: model, policy, and tooling for abstracting PROV graphs. Missier, P.; Bryans, J.; Gamble, C.; Curcin, V.; and Danger, R. In Procs. IPAW 2014 (Provenance and Annotations), Koln, Germany, 2014. Springer
link bibtex 8 downloads

@inproceedings{missier_provabs_2014,
	address = {Koln, Germany},
	title = {{ProvAbs}: model, policy, and tooling for abstracting {PROV} graphs},
	booktitle = {Procs. {IPAW} 2014 ({Provenance} and {Annotations})},
	publisher = {Springer},
	author = {Missier, Paolo and Bryans, Jeremy and Gamble, Carl and Curcin, Vasa and Danger, Roxana},
	year = {2014},
}

From scripted HPC-based NGS pipelines to workflows on the cloud. Cala, J.; Xu, Y. X.; Wijaya, E. A.; and Missier, P. In Procs. C4Bio workshop, co-located with the 2014 CCGrid conference, Chicago, IL, 2014. IEEE
link bibtex

@inproceedings{cala_scripted_2014,
	address = {Chicago, IL},
	title = {From scripted {HPC}-based {NGS} pipelines to workflows on the cloud},
	booktitle = {Procs. {C4Bio} workshop, co-located with the 2014 {CCGrid} conference},
	publisher = {IEEE},
	author = {Cala, Jacek and Xu, Yaobo Xu and Wijaya, Eldarina Azfar and Missier, Paolo},
	year = {2014},
	keywords = {workflow, scientific workflows, NGS, pipeline},
}

The PBase Scientific Workflow Provenance Repository. Cuevas-Vicenttín, V.; Kianmajd, P.; Ludäscher, B.; Missier, P.; Chirigati, F.; Wei, Y.; Koop, D.; and Dey, S. In Procs. 9th International Digital Curation Conference, San Francisco, CA, USA, 2014.
link bibtex

@inproceedings{cuevas-vicenttin_pbase_2014,
	address = {San Francisco, CA, USA},
	title = {The {PBase} {Scientific} {Workflow} {Provenance} {Repository}},
	booktitle = {Procs. 9th {International} {Digital} {Curation} {Conference}},
	author = {Cuevas-Vicenttín, Víctor and Kianmajd, Parisa and Ludäscher, Bertram and Missier, Paolo and Chirigati, Fernando and Wei, Yaxing and Koop, David and Dey, Saumen},
	year = {2014},
	keywords = {\#provenance, \#workflow, \#DataONE},
}

Distilling structure in Taverna scientific workflows: a refactoring approach. Cohen-Boulakia, S.; Chen, J.; Missier, P.; Goble, C.; Williams, A.; and Froidevaux, C. BMC Bioinformatics, 15(Suppl 1): S12. 2014.

Distilling structure in Taverna scientific workflows: a refactoring approach [link]

Paper doi link bibtex abstract 2 downloads

@article{cohen-boulakia_distilling_2014,
	title = {Distilling structure in {Taverna} scientific workflows: a refactoring approach},
	volume = {15},
	issn = {1471-2105},
	url = {http://www.biomedcentral.com/1471-2105/15/S1/S12},
	doi = {10.1186/1471-2105-15-S1-S12},
	abstract = {BACKGROUND:Scientific workflows management systems are increasingly used to specify and manage bioinformatics experiments. Their programming model appeals to bioinformaticians, who can use them to easily specify complex data processing pipelines. Such a model is underpinned by a graph structure, where nodes represent bioinformatics tasks and links represent the dataflow. The complexity of such graph structures is increasing over time, with possible impacts on scientific workflows reuse. In this work, we propose effective methods for workflow design, with a focus on the Taverna model. We argue that one of the contributing factors for the difficulties in reuse is the presence of "anti-patterns", a term broadly used in program design, to indicate the use of idiomatic forms that lead to over-complicated design. The main contribution of this work is a method for automatically detecting such anti-patterns, and replacing them with different patterns which result in a reduction in the workflow's overall structural complexity. Rewriting workflows in this way will be beneficial both in terms of user experience (easier design and maintenance), and in terms of operational efficiency (easier to manage, and sometimes to exploit the latent parallelism amongst the tasks).RESULTS:We have conducted a thorough study of the workflows structures available in Taverna, with the aim of finding out workflow fragments whose structure could be made simpler without altering the workflow semantics. We provide four contributions. Firstly, we identify a set of anti-patterns that contribute to the structural workflow complexity. Secondly, we design a series of refactoring transformations to replace each anti-pattern by a new semantically-equivalent pattern with less redundancy and simplified structure. Thirdly, we introduce a distilling algorithm that takes in a workflow and produces a distilled semantically-equivalent workflow. Lastly, we provide an implementation of our refactoring approach that we evaluate on both the public Taverna workflows and on a private collection of workflows from the BioVel project.CONCLUSION:We have designed and implemented an approach to improving workflow structure by way of rewriting preserving workflow semantics. Future work includes considering our refactoring approach during the phase of workflow design and proposing guidelines for designing distilled workflows.},
	number = {Suppl 1},
	journal = {BMC Bioinformatics},
	author = {Cohen-Boulakia, Sarah and Chen, Jiuqiang and Missier, Paolo and Goble, Carole and Williams, Alan and Froidevaux, Christine},
	year = {2014},
	keywords = {\#workflow, \#taverna},
	pages = {S12},
}

BACKGROUND:Scientific workflows management systems are increasingly used to specify and manage bioinformatics experiments. Their programming model appeals to bioinformaticians, who can use them to easily specify complex data processing pipelines. Such a model is underpinned by a graph structure, where nodes represent bioinformatics tasks and links represent the dataflow. The complexity of such graph structures is increasing over time, with possible impacts on scientific workflows reuse. In this work, we propose effective methods for workflow design, with a focus on the Taverna model. We argue that one of the contributing factors for the difficulties in reuse is the presence of "anti-patterns", a term broadly used in program design, to indicate the use of idiomatic forms that lead to over-complicated design. The main contribution of this work is a method for automatically detecting such anti-patterns, and replacing them with different patterns which result in a reduction in the workflow's overall structural complexity. Rewriting workflows in this way will be beneficial both in terms of user experience (easier design and maintenance), and in terms of operational efficiency (easier to manage, and sometimes to exploit the latent parallelism amongst the tasks).RESULTS:We have conducted a thorough study of the workflows structures available in Taverna, with the aim of finding out workflow fragments whose structure could be made simpler without altering the workflow semantics. We provide four contributions. Firstly, we identify a set of anti-patterns that contribute to the structural workflow complexity. Secondly, we design a series of refactoring transformations to replace each anti-pattern by a new semantically-equivalent pattern with less redundancy and simplified structure. Thirdly, we introduce a distilling algorithm that takes in a workflow and produces a distilled semantically-equivalent workflow. Lastly, we provide an implementation of our refactoring approach that we evaluate on both the public Taverna workflows and on a private collection of workflows from the BioVel project.CONCLUSION:We have designed and implemented an approach to improving workflow structure by way of rewriting preserving workflow semantics. Future work includes considering our refactoring approach during the phase of workflow design and proposing guidelines for designing distilled workflows.

2013 (5)

Extracting PROV provenance traces from Wikipedia history pages. Missier, P.; and Chen, Z. In EDBT/ICDT Workshops, pages 327–330, 2013.
link bibtex 2 downloads

@inproceedings{missier_extracting_2013,
	title = {Extracting {PROV} provenance traces from {Wikipedia} history pages},
	booktitle = {{EDBT}/{ICDT} {Workshops}},
	author = {Missier, Paolo and Chen, Ziyu},
	year = {2013},
	pages = {327--330},
}

Fundamenta Informaticae – Special issue on Scalable Workflow Enactment Engines and Technology. Hidders, J.; Missier, P.; and Sroka, J., editors. Volume 128 IOS Press, 2013.

Fundamenta Informaticae – Special issue on Scalable Workflow Enactment Engines and Technology [link]

Paper link bibtex

@book{hidders_fundamenta_2013,
	title = {Fundamenta {Informaticae} – {Special} issue on {Scalable} {Workflow} {Enactment} {Engines} and {Technology}},
	volume = {128},
	url = {http://iospress.metapress.com/content/n8802x1448hr/?p=c2c17be2c8c64e1195aaa3c93db188c6&pi=1},
	number = {3},
	publisher = {IOS Press},
	editor = {Hidders, Jan and Missier, Paolo and Sroka, Jacek},
	year = {2013},
	keywords = {\#workflow, \#cloud},
}

Provenance and data differencing for workflow reproducibility analysis. Missier, P.; Woodman, S.; Hiden, H.; and Watson, P. Concurrency and Computation: Practice and Experience. 2013.

Provenance and data differencing for workflow reproducibility analysis [link]

Paper doi link bibtex abstract 8 downloads

@article{missier_provenance_2013,
	title = {Provenance and data differencing for workflow reproducibility analysis},
	issn = {1532-0634},
	url = {http://dx.doi.org/10.1002/cpe.3035},
	doi = {10.1002/cpe.3035},
	abstract = {One of the foundations of science is that researchers must publish the methodology used to achieve their results so that others can attempt to reproduce them. This has the added benefit of allowing methods to be adopted and adapted for other purposes. In the field of e-Science, services – often choreographed through workflow, process data to generate results. The reproduction of results is often not straightforward as the computational objects may not be made available or may have been updated since the results were generated. For example, services are often updated to fix bugs or improve algorithms. This paper addresses these problems in three ways. Firstly, it introduces a new framework to clarify the range of meanings of ‘reproducibility’. Secondly, it describes a new algorithm, PDIFF, that uses a comparison of workflow provenance traces to determine whether an experiment has been reproduced; the main innovation is that if this is not the case then the specific point(s) of divergence are identified through graph analysis, assisting any researcher wishing to understand those differences. One key feature is support for user-defined, semantic data comparison operators. Finally, the paper describes an implementation of PDIFF that leverages the power of the e-Science Central platform that enacts workflows in the cloud. As well as automatically generating a provenance trace for consumption by PDIFF, the platform supports the storage and reuse of old versions of workflows, data and services; the paper shows how this can be powerfully exploited to achieve reproduction and reuse. Copyright © 2013 John Wiley \& Sons, Ltd.},
	journal = {Concurrency and Computation: Practice and Experience},
	author = {Missier, Paolo and Woodman, Simon and Hiden, Hugo and Watson, Paul},
	year = {2013},
	keywords = {provenance, reproducibility, e-science, scientific workflow},
}

The W3C PROV family of specifications for modelling provenance metadata. Missier, P.; Belhajjame, K.; and Cheney, J. In Procs. EDBT'13 (Tutorial), Genova, Italy, 2013. ACM

The W3C PROV family of specifications for modelling provenance metadata [pdf]

Paper link bibtex abstract 2 downloads

@inproceedings{missier_w3c_2013,
	address = {Genova, Italy},
	title = {The {W3C} {PROV} family of specifications for modelling provenance metadata},
	url = {http://www.edbt.org/Proceedings/2013-Genova/papers/edbt/a80-missier.pdf},
	abstract = {Provenance, a form of structured metadata designed to record the origin or source of information, can be instrumental in deciding whether information is to be trusted, how it can be integrated with other diverse information sources, and how to establish attribution of information to authors through- out its history. The PROV set of speci cations, produced by the World Wide Web Consortium (W3C), is designed to pro- mote the publication of provenance information on the Web, and o ers a basis for interoperability across diverse prove- nance management systems. The PROV provenance model is deliberately generic and domain-agnostic, but extension mechanisms are available and can be exploited for modelling speci c domains. This tutorial provides an account of these speci cations. Starting from intuitive and informal exam- ples that present idiomatic provenance patterns, it progres- sively introduces the relational model of provenance along with the constraints model for validation of provenance doc- uments, and concludes with example applications that show the extension points in use.},
	booktitle = {Procs. {EDBT}'13 ({Tutorial})},
	publisher = {ACM},
	author = {Missier, Paolo and Belhajjame, Khalid and Cheney, James},
	year = {2013},
	keywords = {\#provenance, \#PROV},
}

D-PROV: extending the PROV provenance model with workflow structure. Missier, P.; Dey, S.; Belhajjame, K.; Cuevas, V.; and Ludaescher, B. In Procs. TAPP'13, Lombard, IL, 2013.
link bibtex 6 downloads

@inproceedings{missier_d-prov_2013,
	address = {Lombard, IL},
	title = {D-{PROV}: extending the {PROV} provenance model with workflow structure},
	booktitle = {Procs. {TAPP}'13},
	author = {Missier, Paolo and Dey, Saumen and Belhajjame, Khalid and Cuevas, Victor and Ludaescher, Bertram},
	year = {2013},
	keywords = {PROV, workflow-provenance},
}

2012 (8)

Report from the first workshop on Scalable Workflow Enactment Engines and Technology (SWEET'12). Hidders, J.; Sroka, J.; and Missier, P. In SIGMOD Record, volume 41. December 2012.

Report from the first workshop on Scalable Workflow Enactment Engines and Technology (SWEET'12) [pdf]

Paper link bibtex

@incollection{hidders_report_2012,
	title = {Report from the first workshop on {Scalable} {Workflow} {Enactment} {Engines} and {Technology} ({SWEET}'12)},
	volume = {41},
	url = {http://www.sigmod.org/publications/sigmod-record/1212/pdfs/12.report.hidders.pdf},
	booktitle = {{SIGMOD} {Record}},
	author = {Hidders, Jan and Sroka, Jacek and Missier, Paolo},
	month = dec,
	year = {2012},
}

SWEET '12: Proceedings of the 1st ACM SIGMOD Workshop on Scalable Workflow Execution Engines and Technologies. Hidders, J.; Missier, P.; and Sroka, J., editors. SIGMOD Record, New York, NY, USA, December 2012.

SWEET '12: Proceedings of the 1st ACM SIGMOD Workshop on Scalable Workflow Execution Engines and Technologies [link]

Paper link bibtex 1 download

@book{hidders_sweet_2012,
	address = {New York, NY, USA},
	title = {{SWEET} '12: {Proceedings} of the 1st {ACM} {SIGMOD} {Workshop} on {Scalable} {Workflow} {Execution} {Engines} and {Technologies}},
	isbn = {978-1-4503-1876-1},
	url = {http://dl.acm.org/citation.cfm?id=2443416},
	publisher = {SIGMOD Record},
	editor = {Hidders, Jan and Missier, Paolo and Sroka, Jacek},
	month = dec,
	year = {2012},
}

Predicting the Execution Time of Workflow Activities Based on Their Input Features. Miu, T.; and Missier, P. In Taylor, I.; and Montagnat, J., editor(s), Procs. WORKS 2012, Salt Lake City, US, 2012. ACM
link bibtex 3 downloads

@inproceedings{miu_predicting_2012,
	address = {Salt Lake City, US},
	title = {Predicting the {Execution} {Time} of {Workflow} {Activities} {Based} on {Their} {Input} {Features}},
	booktitle = {Procs. {WORKS} 2012},
	publisher = {ACM},
	author = {Miu, Tudor and Missier, Paolo},
	editor = {Taylor, Ian and Montagnat, Johan},
	year = {2012},
}

Modelling Provenance using Structured Occurrence Networks. Missier, P; Randell, B; and Koutny, M In Proc. IPAW'12, Santa Barbara, California, 2012. Springer-Verlag, Lecture Notes in Computer Science
link bibtex 7 downloads

@inproceedings{missier_modelling_2012,
	address = {Santa Barbara, California},
	title = {Modelling {Provenance} using {Structured} {Occurrence} {Networks}},
	booktitle = {Proc. {IPAW}'12},
	publisher = {Springer-Verlag, Lecture Notes in Computer Science},
	author = {Missier, P and Randell, B and Koutny, M},
	year = {2012},
	keywords = {\#provenance},
}

A PROV encoding for provenance analysis using deductive rules. Missier, P; and Belhajjame, K. In Procs. IPAW'12, Santa Barbara, California, 2012. Springer-Verlag, Lecture Notes in Computer Science
link bibtex

@inproceedings{missier_prov_2012,
	address = {Santa Barbara, California},
	title = {A {PROV} encoding for provenance analysis using deductive rules},
	booktitle = {Procs. {IPAW}'12},
	publisher = {Springer-Verlag, Lecture Notes in Computer Science},
	author = {Missier, P and Belhajjame, K.},
	year = {2012},
	keywords = {\#provenance, \#PROV, \#datalog},
}

Detecting Duplicate Records in Scientific Workflow Results. Belhajjame, K.; Missier, P; and Goble, C In Procs. IPAW'12, Santa Barbara, California, 2012. Springer-Verlag, Lecture Notes in Computer Science
link bibtex

@inproceedings{belhajjame_detecting_2012,
	address = {Santa Barbara, California},
	title = {Detecting {Duplicate} {Records} in {Scientific} {Workflow} {Results}},
	booktitle = {Procs. {IPAW}'12},
	publisher = {Springer-Verlag, Lecture Notes in Computer Science},
	author = {Belhajjame, K. and Missier, P and Goble, C},
	year = {2012},
	keywords = {\#provenance, \#PROV},
}

Principles of Provenance (Dagstuhl Seminar 12091). Cheney, J.; Finkelstein, A.; Ludaescher, B.; and Vansummeren, S. Dagstuhl Reports, 2(2): 84–113. 2012.

Principles of Provenance (Dagstuhl Seminar 12091) [link]

Paper doi link bibtex 1 download

@article{cheney_principles_2012,
	title = {Principles of {Provenance} ({Dagstuhl} {Seminar} 12091)},
	volume = {2},
	issn = {2192-5283},
	url = {http://drops.dagstuhl.de/opus/volltexte/2012/3507},
	doi = {http://dx.doi.org/10.4230/DagRep.2.2.84},
	number = {2},
	journal = {Dagstuhl Reports},
	author = {Cheney, James and Finkelstein, Anthony and Ludaescher, Bertram and Vansummeren, Stijn},
	editor = {Cheney, James and Finkelstein, Anthony and Ludaescher, Bertram and Vansummeren, Stijn},
	year = {2012},
	keywords = {\#provenance},
	pages = {84--113},
}

Golden Trail: Retrieving the Data History that Matters from a Comprehensive Provenance Repository. Missier, P.; Ludascher, B.; Bowers, S.; Altintas, I.; Dey, S.; and Agun, M. International Journal of Digital Curation, 7(1). 2012.

Golden Trail: Retrieving the Data History that Matters from a Comprehensive Provenance Repository [link]

Paper link bibtex 1 download

@article{missier_golden_2012,
	title = {Golden {Trail}: {Retrieving} the {Data} {History} that {Matters} from a {Comprehensive} {Provenance} {Repository}},
	volume = {7},
	url = {http://www.dcc.ac.uk/events/idcc11},
	number = {1},
	journal = {International Journal of Digital Curation},
	author = {Missier, Paolo and Ludascher, Bertram and Bowers, Shawn and Altintas, Ilkay and Dey, Saumen and Agun, Michael},
	year = {2012},
	keywords = {\#provenance, \#workflow, \#repository},
}

2011 (8)

Achieving Reproducibility by Combining Provenance with Service and Workflow Versioning. Woodman, S.; Hiden, H.; Watson, P.; and Missier, P. In Procs. WORKS 2011, Seattle, WA, USA, 2011.
link bibtex

@inproceedings{woodman_achieving_2011,
	address = {Seattle, WA, USA},
	title = {Achieving {Reproducibility} by {Combining} {Provenance} with {Service} and {Workflow} {Versioning}},
	booktitle = {Procs. {WORKS} 2011},
	author = {Woodman, Simon and Hiden, Hugo and Watson, Paul and Missier, Paolo},
	year = {2011},
	keywords = {provenance, workflow, cloud, reproducible science},
}

Workflows for Information Integration in the Life Sciences. Missier, P.; Paton, N.; and Li, P. In Ceri, S.; and Brambilla, M., editor(s), Search Computing, volume 6585, of Lecture Notes in Computer Science, pages 215–225. Springer Berlin / Heidelberg, 2011.

Workflows for Information Integration in the Life Sciences [link]

Paper link bibtex abstract

@incollection{missier_workflows_2011,
	series = {Lecture {Notes} in {Computer} {Science}},
	title = {Workflows for {Information} {Integration} in the {Life} {Sciences}},
	volume = {6585},
	isbn = {978-3-642-19667-6},
	url = {http://dx.doi.org/10.1007/978-3-642-19668-3_20},
	abstract = {The increasingly computationally- and data-intensive nature of experimental science motivates recent interest in workflows, as a way to specify complex data processing and integration pipelines in a fairly intuitive way. Such workflows orchestrate the invocation of data retrieval services in a way that resembles, to some extent, Search Computing query plans. While the former are manually specified, however, the latter are the result of an automated translation process. Using lessons learnt from experience in workflow design, in this chapter we discuss some of the requirements on service curation that make automated, on-demand data integration processes possible and realistic.},
	booktitle = {Search {Computing}},
	publisher = {Springer Berlin / Heidelberg},
	author = {Missier, Paolo and Paton, Norman and Li, Peter},
	editor = {Ceri, Stefano and Brambilla, Marco},
	year = {2011},
	pages = {215--225},
}

Towards the preservation of scientific workflows. Roure, D. D.; Belhajjame, K.; Missier, P.; and Al., E. In Procs. of the 8th International Conference on Preservation of Digital Objects (iPRES 2011), Singapore, 2011.
link bibtex

@inproceedings{roure_towards_2011,
	address = {Singapore},
	title = {Towards the preservation of scientific workflows},
	booktitle = {Procs. of the 8th {International} {Conference} on {Preservation} of {Digital} {Objects} ({iPRES} 2011)},
	author = {Roure, David De and Belhajjame, Khalid and Missier, Paolo and Al., Et},
	year = {2011},
}

Why linked data is not enough for scientists. Bechhofer, S.; Buchan, I.; De Roure, D.; Missier, P.; and Al., E. Future Generation Computer Systems (FGCS). 2011.

Why linked data is not enough for scientists [link]

Paper doi link bibtex 2 downloads

@article{bechhofer_why_2011,
	title = {Why linked data is not enough for scientists},
	url = {http://www.sciencedirect.com/science/article/pii/S0167739X11001439},
	doi = {doi:10.1016/j.future.2011.08.004},
	journal = {Future Generation Computer Systems (FGCS)},
	author = {Bechhofer, Sean and Buchan, Iain and De Roure, David and Missier, Paolo and Al., Et},
	year = {2011},
}

Incremental workflow improvement through analysis of its data provenance. Missier, P. In Procs. TAPP'11 (Theory and Practice of Provenance), Heraklyion, Crete, Greece, June 2011.
link bibtex abstract

@inproceedings{missier_incremental_2011,
	address = {Heraklyion, Crete, Greece},
	title = {Incremental workflow improvement through analysis of its data provenance},
	abstract = {Repeated executions of resource-intensive workflows over a large number of runs are commonly observed in e-science practice. We explore the hypothesis that, in some cases, provenance traces recorded for past runs of a workflow can be used to make future runs more efficient. This investigation is an initial step into the systematic study of the role that provenance analysis can play in the broader context of self-managing software systems. We have tested our hypothesis on a concrete case study involving a Chemical Engineering workflow deployed on a cloud infrastructure, where we can measure the cost of its repeated execution. Our approach involves augmenting the workflow with a feedback loop in which incremental analysis of the provenance of past runs is used to control some of the workflow steps in subsequent executions. We present initial experimental results and hint at future improvements as part of ongoing work.},
	booktitle = {Procs. {TAPP}'11 ({Theory} and {Practice} of {Provenance})},
	author = {Missier, Paolo},
	month = jun,
	year = {2011},
}

Simulating Taverna workflows using stochastic process algebras. Curcin, V.; Missier, P.; and De Roure, D. Concurrency and Computation: Practice and Experience, In press.. 2011.
link bibtex

@article{curcin_simulating_2011,
	title = {Simulating {Taverna} workflows using stochastic process algebras},
	volume = {In press.},
	journal = {Concurrency and Computation: Practice and Experience},
	author = {Curcin, Vasa and Missier, Paolo and De Roure, David},
	year = {2011},
}

Workflows to Open Provenance Graphs, round-trip. Missier, P.; and Goble, C. Future Generation Computer Systems (FGCS), 27(6): 812–819. April 2011.
doi link bibtex abstract

@article{missier_workflows_2011-1,
	title = {Workflows to {Open} {Provenance} {Graphs}, round-trip},
	volume = {27},
	doi = {http://dx.doi.org/10.1016/j.future.2010.10.012},
	abstract = {The Open Provenance Model is designed to capture relationships amongst data values, and amongst processors that produce or consume those values. While OPM graphs are able to describe aspects of a workflow execution, capturing the structure of the workflows themselves is understandably beyond the scope of the OPM specification, since the graphs may be generated by a broad variety of processes, which may not be formal workflows at all. \% In particular, OPM does not address two questions: firstly, whether for any OPM graph there exists a \${\textbackslash}textbackslashbackslash\$textit\{plausible\} workflow, in some model, which could have generated the graph. And secondly, which information should be captured as part of an OPM graph that is derived from the execution of some known type of workflow, so that the workflow structure and the execution trace can both be inferred back from the graph. \% Motivated by the need to address the \${\textbackslash}textbackslashbackslash\$textit\{Third Provenance Challenge\} using Taverna workflows and provenance, in this paper we explore such notion of \${\textbackslash}textbackslashbackslash\$textit\{lossless-ness\} of OPM graphs relative to Taverna workflows. \% For the first question, we show that Taverna is a suitable model for representing plausible OPM-generating processes. For the second question, we show how augmenting OPM with two types of annotations makes it lossless with respect to Taverna. We support this claim by presenting a two-way mapping between OPM graphs and Taverna workflows.},
	number = {6},
	journal = {Future Generation Computer Systems (FGCS)},
	author = {Missier, Paolo and Goble, Carole},
	month = apr,
	year = {2011},
	keywords = {\#provenance, \#OPM, OPM},
	pages = {812--819},
}

Extending Semantic Provenance into the Web of Data. Zhao, J.; Sahoo, S. S; Missier, P.; Sheth, A.; and Goble, C. IEEE Internet Computing, 15: 40–48. 2011.

Extending Semantic Provenance into the Web of Data [link]

Paper doi link bibtex

@article{zhao_extending_2011,
	title = {Extending {Semantic} {Provenance} into the {Web} of {Data}},
	volume = {15},
	issn = {1089-7801},
	url = {http://doi.ieeecomputersociety.org/10.1109/MIC.2011.7},
	doi = {http://doi.ieeecomputersociety.org/10.1109/MIC.2011.7},
	journal = {IEEE Internet Computing},
	author = {Zhao, Jun and Sahoo, Satya S and Missier, Paolo and Sheth, Amit and Goble, Carole},
	year = {2011},
	pages = {40--48},
}

2010 (11)

Linking Multiple Workflow Provenance Traces for Interoperable Collaborative Science. Missier, P.; Ludascher, B.; Bowers, S.; Anand, M. K.; Altintas, I.; Dey, S.; Sarkar, A.; Shrestha, B.; and Goble, C. In Proc.s 5th Workshop on Workflows in Support of Large-Scale Science (WORKS), 2010.
link bibtex abstract

@inproceedings{missier_linking_2010,
	title = {Linking {Multiple} {Workflow} {Provenance} {Traces} for {Interoperable} {Collaborative} {Science}},
	abstract = {Scientific collaboration increasingly involves data sharing between separate groups. We consider a scenario where data products of scientific workflows are published and then used by other researchers as inputs to their workflows. For proper interpretation, shared data must be complemented by descriptive metadata. We focus on provenance traces, a prime example of such metadata which describes the genesis and processing history of data products in terms of the computational workflow steps. Through the reuse of published data, virtual, implicitly collaborative experiments emerge, making it desirable to compose the independently generated traces into global ones that describe the combined executions as single, seamless experiments. We present a model for provenance sharing that realizes this holistic view by overcoming the various interoperability problems that emerge from the heterogeneity of workflow systems, data formats, and provenance models. At the heart lie (i) an abstract workflow and provenance model in which (ii) data sharing becomes itself part of the combined workflow. We then describe an implementation of our model that we developed in the context of the Data Observation Network for Earth (DataONE) project and that can “stitch together” traces from different Kepler and Taverna workflow runs. It provides a prototypical framework for seamless cross-system, collaborative provenance management and can be easily extended to include other systems. Our approach also opens the door to new ways of workflow interoperability not only through often elusive workflow standards but through shared provenance information from public repositories.},
	booktitle = {Proc.s 5th {Workshop} on {Workflows} in {Support} of {Large}-{Scale} {Science} ({WORKS})},
	author = {Missier, Paolo and Ludascher, Bertram and Bowers, Shawn and Anand, Manish Kumar and Altintas, Ilkay and Dey, Saumen and Sarkar, Anandarup and Shrestha, Biva and Goble, Carole},
	year = {2010},
}

Why Linked Data is Not Enough for Scientists. Bechhofer, S.; Ainsworth, J.; Bhagat, J.; Buchan, I.; Couch, P.; Cruickshank, D.; Roure, D. D.; Delderfield, M.; Dunlop, I.; Gamble, M.; Goble, C.; Michaelides, D.; Missier, P.; Owen, S.; Newman, D.; and Sufi, S. In e-Science (e-Science), 2010 IEEE Sixth International Conference on, pages 300–307, 2010.

Paper doi link bibtex

@inproceedings{bechhofer_why_2010,
	title = {Why {Linked} {Data} is {Not} {Enough} for {Scientists}},
	url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5693931},
	doi = {10.1109/eScience.2010.21},
	booktitle = {e-{Science} (e-{Science}), 2010 {IEEE} {Sixth} {International} {Conference} on},
	author = {Bechhofer, Sean and Ainsworth, John and Bhagat, Jiten and Buchan, Iain and Couch, Philip and Cruickshank, Don and Roure, David De and Delderfield, Mark and Dunlop, Ian and Gamble, Matthew and Goble, Carole and Michaelides, Danius and Missier, Paolo and Owen, Stuart and Newman, David and Sufi, Shoaib},
	year = {2010},
	pages = {300--307},
}

The Open Provenance Model — Core Specification (v1.1). Moreau, L.; Clifford, B.; Freire, J.; Futrelle, J.; Gil, Y.; Groth, P.; Kwasnikowska, N.; Miles, S.; Missier, P.; Myers, J.; Plale, B.; Simmhan, Y.; Stephan, E.; and Van Den Bussche, J. Future Generation Computer Systems. 2010.
doi link bibtex

@article{moreau_open_2010,
	title = {The {Open} {Provenance} {Model} — {Core} {Specification} (v1.1)},
	doi = {http://dx.doi.org/10.1016/j.future.2010.07.005},
	journal = {Future Generation Computer Systems},
	author = {Moreau, Luc and Clifford, Ben and Freire, Juliana and Futrelle, Joe and Gil, Yolanda and Groth, Paul and Kwasnikowska, Natalia and Miles, Simon and Missier, Paolo and Myers, Jim and Plale, Beth and Simmhan, Yogesh and Stephan, Eric and Van Den Bussche, Jan},
	year = {2010},
}

Seamless Provenance Representation and Use in Collaborative Science Scenarios (Abstract). Missier, P.; Ludascher, B.; Bowers, S.; Anand, M. K.; Altintas, I.; Dey, S.; Sarkar, A.; Shrestha, B.; and Goble, C. In AGU Fall Meeting, San Francisco, CA, USA, 2010.
link bibtex

@inproceedings{missier_seamless_2010,
	address = {San Francisco, CA, USA},
	title = {Seamless {Provenance} {Representation} and {Use} in {Collaborative} {Science} {Scenarios} ({Abstract})},
	booktitle = {{AGU} {Fall} {Meeting}},
	author = {Missier, Paolo and Ludascher, Bertram and Bowers, Shawn and Anand, Manish Kumar and Altintas, Ilkay and Dey, Saumen and Sarkar, Anandarup and Shrestha, Biva and Goble, Carole},
	year = {2010},
}

A comparison of using Taverna and BPEL in building scientific workflows: the case of caGrid. Tan, W.; Missier, P.; Foster, I.; Madduri, R.; De Roure, D.; and Goble, C. Concurrency and Computation: Practice and Experience, 22(9): 1098–1117. 2010.

A comparison of using Taverna and BPEL in building scientific workflows: the case of caGrid [link]

Paper doi link bibtex

@article{tan_comparison_2010,
	title = {A comparison of using {Taverna} and {BPEL} in building scientific workflows: the case of {caGrid}},
	volume = {22},
	issn = {1532-0634},
	url = {http://dx.doi.org/10.1002/cpe.1547},
	doi = {10.1002/cpe.1547},
	number = {9},
	journal = {Concurrency and Computation: Practice and Experience},
	author = {Tan, Wei and Missier, Paolo and Foster, Ian and Madduri, Ravi and De Roure, David and Goble, Carole},
	year = {2010},
	keywords = {scientific workflow, Taverna, BPEL, caGrid, functional programming},
	pages = {1098--1117},
}

Fine-grained and efficient lineage querying of collection-based workflow provenance. Missier, P.; Paton, N.; and Belhajjame, K. In Procs. EDBT, Lausanne, Switzerland, 2010.

Fine-grained and efficient lineage querying of collection-based workflow provenance [link]

Paper link bibtex 1 download

@inproceedings{missier_fine-grained_2010,
	address = {Lausanne, Switzerland},
	title = {Fine-grained and efficient lineage querying of collection-based workflow provenance},
	booktitle = {Procs. {EDBT}},
	url={https://doi.org/10.1145/1739041.1739079},
	author = {Missier, P. and Paton, N. and Belhajjame, K.},
	year = {2010},
}

ERGOT: A Semantic-based System for Service Discovery in Distributed Infrastructures. Pirro', G.; Trunfio, P.; Talia, D.; Missier, P.; and Goble, C. In Procs. CCGRID '10, Melbourne, Australia, 2010.
doi link bibtex

@inproceedings{pirro_ergot_2010,
	address = {Melbourne, Australia},
	title = {{ERGOT}: {A} {Semantic}-based {System} for {Service} {Discovery} in {Distributed} {Infrastructures}},
	doi = {http://dx.doi.org/10.1109/CCGRID.2010.24},
	booktitle = {Procs. {CCGRID} '10},
	author = {Pirro', Giuseppe and Trunfio, Paolo and Talia, Domenico and Missier, Paolo and Goble, Carole},
	year = {2010},
}

Functional Units: Abstractions for Web Service Annotations. Missier, P.; Wolstencroft, K.; Tanoh, F.; Li, P.; Bechhofer, S.; Belhajjame, K.; and Goble, C. In Procs. IEEE 2010 Fourth International Workshop on Scientific Workflows (SWF 2010), Miami, FL, 2010.

Functional Units: Abstractions for Web Service Annotations [link]

Paper link bibtex

@inproceedings{missier_functional_2010,
	address = {Miami, FL},
	title = {Functional {Units}: {Abstractions} for {Web} {Service} {Annotations}},
	url = {http://www.cs.wayne.edu/∼shiyong/swf/swf2010.html},
	booktitle = {Procs. {IEEE} 2010 {Fourth} {International} {Workshop} on {Scientific} {Workflows} ({SWF} 2010)},
	author = {Missier, Paolo and Wolstencroft, Katy and Tanoh, Franck and Li, Peter and Bechhofer, Sean and Belhajjame, Khalid and Goble, Carole},
	year = {2010},
	keywords = {service annotations biocatalogue},
}

Taverna, reloaded. Missier, P.; Soiland-Reyes, S.; Owen, S.; Tan, W.; Nenadic, A.; Dunlop, I.; Williams, A.; Oinn, T.; and Goble, C. In Gertz, M; Hey, T; and Ludaescher, B, editor(s), Procs. SSDBM 2010, Heidelberg, Germany, 2010.

Paper link bibtex 2 downloads

@inproceedings{missier_taverna_2010,
	address = {Heidelberg, Germany},
	title = {Taverna, reloaded},
	url = {https://link.springer.com/chapter/10.1007/978-3-642-13818-8_33},
	booktitle = {Procs. {SSDBM} 2010},
	author = {Missier, Paolo and Soiland-Reyes, Stian and Owen, Stuart and Tan, Wei and Nenadic, Alex and Dunlop, Ian and Williams, Alan and Oinn, Tom and Goble, Carole},
	editor = {Gertz, M and Hey, T and Ludaescher, B},
	year = {2010},
}

Janus: from Workflows to Semantic Provenance and Linked Open Data. Missier, P.; Sahoo, S. S; Zhao, J.; Sheth, A.; and Goble, C. In Procs. IPAW 2010, Troy, NY, 2010.
link bibtex

@inproceedings{missier_janus_2010,
	address = {Troy, NY},
	title = {Janus: from {Workflows} to {Semantic} {Provenance} and {Linked} {Open} {Data}},
	booktitle = {Procs. {IPAW} 2010},
	author = {Missier, Paolo and Sahoo, Satya S and Zhao, Jun and Sheth, Amit and Goble, Carole},
	year = {2010},
	keywords = {provenance Taverna LOD RDF semantics},
}

Understanding Collaborative Studies Through Interoperable Workflow Provenance. Altintas, I.; Anand, M. K.; Crawl, D.; Belloum, A.; Missier, P.; Goble, C.; and Sloot, P. In Procs. IPAW 2010, Troy, NY, 2010.
link bibtex

@inproceedings{altintas_understanding_2010,
	address = {Troy, NY},
	title = {Understanding {Collaborative} {Studies} {Through} {Interoperable} {Workflow} {Provenance}},
	booktitle = {Procs. {IPAW} 2010},
	author = {Altintas, Ilkay and Anand, Manish Kumar and Crawl, Daniel and Belloum, Adam and Missier, Paolo and Goble, Carole and Sloot, Peter},
	year = {2010},
	keywords = {provenance interoperability},
}

2009 (9)

Time-completeness trade-offs in record linkage using Adaptive Query Processing. Lengu, R; Missier, P; Fernandes, A A A; Guerrini, G; and Mesiti, M In Procs. EDBT, St. Petersburg, Russia, March 2009.

Time-completeness trade-offs in record linkage using Adaptive Query Processing [link]

Paper doi link bibtex

@inproceedings{lengu_time-completeness_2009,
	address = {St. Petersburg, Russia},
	title = {Time-completeness trade-offs in record linkage using {Adaptive} {Query} {Processing}},
	url = {http://dx.doi.org/10.1145/1516360.1516458},
	doi = {http://dx.doi.org/10.1145/1516360.1516458},
	booktitle = {Procs. {EDBT}},
	author = {Lengu, R and Missier, P and Fernandes, A A A and Guerrini, G and Mesiti, M},
	month = mar,
	year = {2009},
	keywords = {"Adaptive Query Processing", "Record Linkage"},
}

A Comparison of Using Taverna and BPEL in Building Scientific Workflows: the case of caGrid. Tan, W.; Missier, P.; Foster, I.; Madduri, R.; and Goble, C. Concurrency and Computation Practice and Experience. 2009.
link bibtex

@article{tan_comparison_2009,
	title = {A {Comparison} of {Using} {Taverna} and {BPEL} in {Building} {Scientific} {Workflows}: the case of {caGrid}},
	journal = {Concurrency and Computation Practice and Experience},
	author = {Tan, Wei and Missier, Paolo and Foster, Ian and Madduri, Ravi and Goble, Carole},
	year = {2009},
	keywords = {scientific workflow BPEL Taverna},
}

Combining DHTs and SONs for Semantic-Based Service Discovery. Pirro', G.; Missier, P.; Trunfio, P.; Talia, D.; Falace, G.; and Goble, C. In Procs.International Conference on Intelligent System Design and Applications (ISDA'09), Pisa, Italy, November 2009.
link bibtex

@inproceedings{pirro_combining_2009,
	address = {Pisa, Italy},
	title = {Combining {DHTs} and {SONs} for {Semantic}-{Based} {Service} {Discovery}},
	booktitle = {Procs.{International} {Conference} on {Intelligent} {System} {Design} and {Applications} ({ISDA}'09)},
	author = {Pirro', Giuseppe and Missier, Paolo and Trunfio, Paolo and Talia, Domenico and Falace, Gabriele and Goble, Carole},
	month = nov,
	year = {2009},
	keywords = {"Distributed Systems"},
}

Formal semantics for the Taverna 2 Workflow Model. Sroka, J.; Hidders, J.; Missier, P.; and Goble, C. Journal of Computer and System Sciences. 2009.

Formal semantics for the Taverna 2 Workflow Model [link]

Paper doi link bibtex abstract

@article{sroka_formal_2009,
	title = {Formal semantics for the {Taverna} 2 {Workflow} {Model}},
	url = {http://dx.doi.org/10.1016/j.jcss.2009.11.009},
	doi = {10.1016/j.jcss.2009.11.009},
	abstract = {This paper presents a formal semantics for the Taverna 2 scientific workflow system. Taverna 2 is a successor to Taverna, an open-source workflow system broadly adopted within the e-science community worldwide. The new version improves upon the existing model in two main ways: (i) by adding support for data pipelining, which in turns enables input streams of indefinite length to be processed efficiently; and (ii) by providing new extensibility points that make it possible to add new operators to the workflow model. Consistent with previous work by some of the authors, we use trace semantics to describe the effect of workflow computations, and we show how they can be used to describe the new features in the Taverna 2 model.},
	journal = {Journal of Computer and System Sciences},
	author = {Sroka, Jacek and Hidders, Jan and Missier, Paolo and Goble, Carole},
	year = {2009},
	keywords = {workflow model semantics},
}

The Data Playground: An Intuitive Workflow Specification Environment. Gibson, A; Gamble, M; Wolstencroft, K; Oinn, T; Goble, C; Belajjame, K; and Missier, P. Future Generation Computer Systems, 25: 453–459. April 2009.

The Data Playground: An Intuitive Workflow Specification Environment [link]

Paper doi link bibtex

@article{gibson_data_2009,
	title = {The {Data} {Playground}: {An} {Intuitive} {Workflow} {Specification} {Environment}},
	volume = {25},
	url = {http://dx.doi.org/10.1016/j.future.2008.09.009},
	doi = {http://dx.doi.org/10.1016/j.future.2008.09.009},
	journal = {Future Generation Computer Systems},
	author = {Gibson, A and Gamble, M and Wolstencroft, K and Oinn, T and Goble, C and Belajjame, K and Missier, Paolo},
	month = apr,
	year = {2009},
	keywords = {"Workflow Management", "e-Science"},
	pages = {453--459},
}

Semantically Annotated Provenance in the Life Science Grid. Cao, B; Plale, B; Subramanian, G; Missier, P; Goble, C; and Simmhan, Y In Freire, J.; Missier, P.; and Sahoo, S. S., editor(s), 1st International Workshop on the Role of Semantic Web in Provenance Management, 2009. CEUR Proceedings

Semantically Annotated Provenance in the Life Science Grid [link]

Paper link bibtex

@inproceedings{cao_semantically_2009,
	title = {Semantically {Annotated} {Provenance} in the {Life} {Science} {Grid}},
	url = {http://sunsite.informatik.rwth-aachen.de/Publications/CEUR-WS/Vol-526/},
	booktitle = {1st {International} {Workshop} on the {Role} of {Semantic} {Web} in {Provenance} {Management}},
	publisher = {CEUR Proceedings},
	author = {Cao, B and Plale, B and Subramanian, G and Missier, P and Goble, C and Simmhan, Y},
	editor = {Freire, Juliana and Missier, Paolo and Sahoo, Satya S.},
	year = {2009},
	keywords = {semantics provenance, \#provxg \#provenance},
}

Medical Image Processing Workflow Support on the EGEE Grid with Taverna. Maheshwari, K.; Missier, P.; Goble, C.; and Montagnat, J. In Procs. conference of Computer Based Medical Systems (CBMS), Albuquerque, NM, USA, 2009.
link bibtex

@inproceedings{maheshwari_medical_2009,
	address = {Albuquerque, NM, USA},
	title = {Medical {Image} {Processing} {Workflow} {Support} on the {EGEE} {Grid} with {Taverna}},
	booktitle = {Procs. conference of {Computer} {Based} {Medical} {Systems} ({CBMS})},
	author = {Maheshwari, Ketan and Missier, Paolo and Goble, Carole and Montagnat, Johan},
	year = {2009},
	keywords = {"Workflow Management", "e-Science"},
}

Data Provenance in Scientific Workflows. Belhajjame, K.; Missier, P.; and Goble, C. In Handbook of Research on Computational Grid Technologies for Life Sciences, Biomedicine, and Healthcare. IGI Global, 2009.
link bibtex

@incollection{belhajjame_data_2009,
	title = {Data {Provenance} in {Scientific} {Workflows}},
	booktitle = {Handbook of {Research} on {Computational} {Grid} {Technologies} for {Life} {Sciences}, {Biomedicine}, and {Healthcare}},
	publisher = {IGI Global},
	author = {Belhajjame, Khalid and Missier, Paolo and Goble, Carole},
	year = {2009},
	keywords = {Provenance, "Workflow Management"},
}

Incorporating Domain-Specific Information Quality Constraints into Database Queries. Embury, S. M; Missier, P.; Sampaio, S.; Greenwood, R M.; and Preece, A. D J. Data and Information Quality, 1(2). 2009.
link bibtex

@article{embury_incorporating_2009,
	title = {Incorporating {Domain}-{Specific} {Information} {Quality} {Constraints} into {Database} {Queries}},
	volume = {1},
	number = {2},
	journal = {J. Data and Information Quality},
	author = {Embury, Suzanne M and Missier, Paolo and Sampaio, Sandra and Greenwood, R Mark and Preece, Alun D},
	year = {2009},
}

2008 (7)

Building Scientific Workflow with Taverna and BPEL: a Comparative Study in caGrid. Tan, W.; Missier, P.; Madduri, R.; and Foster, I. In Procs.4th International workshop on Engineering Service-Oriented applications (WESOA), Sydney, Australia, December 2008.

Building Scientific Workflow with Taverna and BPEL: a Comparative Study in caGrid [link]

Paper doi link bibtex

@inproceedings{tan_building_2008,
	address = {Sydney, Australia},
	title = {Building {Scientific} {Workflow} with {Taverna} and {BPEL}: a {Comparative} {Study} in {caGrid}},
	url = {http://dx.doi.org/10.1007/978-3-642-01247-1_11},
	doi = {http://dx.doi.org/10.1007/978-3-642-01247-1_11},
	booktitle = {Procs.4th {International} workshop on {Engineering} {Service}-{Oriented} applications ({WESOA})},
	author = {Tan, Wei and Missier, Paolo and Madduri, Ravi and Foster, Ian},
	month = dec,
	year = {2008},
	keywords = {"e-Science", "Workflow Management "},
}

Scientific Workflows. C.Goble D. De Roure, P. In Yearbook of Science and Technology. McGraw Hill, 2008.
link bibtex

@incollection{cgoble_d_de_roure_scientific_2008,
	title = {Scientific {Workflows}},
	booktitle = {Yearbook of {Science} and {Technology}},
	publisher = {McGraw Hill},
	author = {C.Goble D. De Roure, P.Missier},
	year = {2008},
	keywords = {"Workflow Management", "e-Science"},
}

Exploiting provenance to make sense of automated data acceptance decisions in scientific workflows. Missier, P.; Embury, S.; and Stapenhurst, R. In IPAW, volume 5272/2008, of LNCS series, Salt Lake City, Utah, June 2008. Springer

Exploiting provenance to make sense of automated data acceptance decisions in scientific workflows [link]

Paper doi link bibtex

@inproceedings{missier_exploiting_2008,
	address = {Salt Lake City, Utah},
	series = {{LNCS} series},
	title = {Exploiting provenance to make sense of automated data acceptance decisions in scientific workflows},
	volume = {5272/2008},
	url = {http://www.springerlink.com/content/r07524068770k401/},
	doi = {http://dx.doi.org/10.1007/978-3-540-89965-5_19},
	booktitle = {{IPAW}},
	publisher = {Springer},
	author = {Missier, Paolo and Embury, Suzanne and Stapenhurst, Richard},
	month = jun,
	year = {2008},
	keywords = {"Information Quality Management", provenance data quality, \#provxg \#provenance \#qurator},
}

Information Quality in Proteomics. Stead, D; Paton, N; Missier, P; Embury, S; Hedeler, C; Jin, B; Brown, A; and Preece, A Briefings in Bioinformatics, 9: 174–188. March 2008.

Information Quality in Proteomics [link]

Paper link bibtex

@article{stead_information_2008,
	title = {Information {Quality} in {Proteomics}},
	volume = {9},
	url = {http://bib.oxfordjournals.org/cgi/reprint/9/2/174},
	journal = {Briefings in Bioinformatics},
	author = {Stead, D and Paton, N and Missier, P and Embury, S and Hedeler, C and Jin, B and Brown, A and Preece, A},
	month = mar,
	year = {2008},
	keywords = {"Information Quality Management", "e-Science", "Biological Information Management", information quality proteomics},
	pages = {174--188},
}

Data lineage model for Taverna workflows with lightweight annotation requirements. Missier, P; Belhajjame, K; Zhao, J; and Goble, C In IPAW, volume 5272/2008, of LNCS, Salt Lake City, US, June 2008. Springer

Data lineage model for Taverna workflows with lightweight annotation requirements [link]

Paper doi link bibtex 1 download

@inproceedings{missier_data_2008,
	address = {Salt Lake City, US},
	series = {{LNCS}},
	title = {Data lineage model for {Taverna} workflows with lightweight annotation requirements},
	volume = {5272/2008},
	url = {https://link.springer.com/chapter/10.1007/978-3-540-89965-5_4},
	doi = {http://dx.doi.org/10.1007/978-3-540-89965-5_4},
	booktitle = {{IPAW}},
	publisher = {Springer},
	author = {Missier, P and Belhajjame, K and Zhao, J and Goble, C},
	month = jun,
	year = {2008},
	keywords = {"Workflow Management ", \#provxg \#provenance, provenance lineage Taverna annotations},
}

Brokering infrastructure for minimum cost data procurement based on quality - quantity models. Avenali, A; Bertolazzi, P; Batini, C; and Missier, P Decision Support Systems, 45: 95–109. 2008.

Brokering infrastructure for minimum cost data procurement based on quality - quantity models [link]

Paper doi link bibtex

@article{avenali_brokering_2008,
	title = {Brokering infrastructure for minimum cost data procurement based on quality - quantity models},
	volume = {45},
	url = {http://dx.doi.org/10.1016/j.dss.2007.10.012},
	doi = {http://dx.doi.org/10.1016/j.dss.2007.10.012},
	journal = {Decision Support Systems},
	author = {Avenali, A and Bertolazzi, P and Batini, C and Missier, P},
	year = {2008},
	keywords = {"Distributed Query Processing", "Information Quality Management", "Optimization"},
	pages = {95--109},
}

An ontology-based approach to handling information quality in e-Science. Preece, A; Missier, P; Embury, S; Jin, B; and Greenwood, M Concurrency and Computation: Practice and Experience, 20: 253–264. 2008.

An ontology-based approach to handling information quality in e-Science [link]

Paper doi link bibtex

@article{preece_ontology-based_2008,
	title = {An ontology-based approach to handling information quality in e-{Science}},
	volume = {20},
	url = {http://dx.doi.org/10.1002/cpe.1195},
	doi = {http://dx.doi.org/10.1002/cpe.1195},
	journal = {Concurrency and Computation: Practice and Experience},
	author = {Preece, A and Missier, P and Embury, S and Jin, B and Greenwood, M},
	year = {2008},
	keywords = {\#qurator, "Information Quality Management", "Automated Reasoning", "e-Science", "Knowledge Representation"},
	pages = {253--264},
}

2007 (7)

Taverna Workflows: Syntax and Semantics. Turi, D; Missier, P; Roure, D D.; Goble, C; and Oinn, T In Proceedings of the 3rd e-Science conference, Bangalore, India, December 2007.

Taverna Workflows: Syntax and Semantics [link]

Paper doi link bibtex

@inproceedings{turi_taverna_2007,
	address = {Bangalore, India},
	title = {Taverna {Workflows}: {Syntax} and {Semantics}},
	url = {http://dx.doi.org/10.1109/E-SCIENCE.2007.71},
	doi = {http://dx.doi.org/10.1109/E-SCIENCE.2007.71},
	booktitle = {Proceedings of the 3rd e-{Science} conference},
	author = {Turi, D and Missier, P and Roure, D De and Goble, C and Oinn, T},
	month = dec,
	year = {2007},
	keywords = {Taverna, "Workflow Management", "Language semantics"},
}

Accelerating Disease Gene Identification Through Integrated SNP Data Analysis. Missier, P; Embury, S; Hedeler, C; Greenwood, M; Pennock, J; and Brass, A In Proceedings 4th International Workshop on Data Integration in the Life Sciences, of LNBI, pages 215–230, 2007. Springer

Accelerating Disease Gene Identification Through Integrated SNP Data Analysis [link]

Paper doi link bibtex 2 downloads

@inproceedings{missier_accelerating_2007,
	series = {{LNBI}},
	title = {Accelerating {Disease} {Gene} {Identification} {Through} {Integrated} {SNP} {Data} {Analysis}},
	url = {http://dx.doi.org/10.1007/978-3-540-73255-6_18},
	doi = {http://dx.doi.org/10.1007/978-3-540-73255-6_18},
	booktitle = {Proceedings 4th {International} {Workshop} on {Data} {Integration} in the {Life} {Sciences}},
	publisher = {Springer},
	author = {Missier, P and Embury, S and Hedeler, C and Greenwood, M and Pennock, J and Brass, A},
	year = {2007},
	keywords = {"Distributed Query Processing", "Information Quality Management", "Web Services", Biological Information Management},
	pages = {215--230},
}

Quality management challenges in the post-genomic era. Hedeler, C; and Missier, P In Database Modeling in Biology: Practices and Challenges. Artech House, 2007.
link bibtex

@incollection{hedeler_quality_2007,
	title = {Quality management challenges in the post-genomic era},
	booktitle = {Database {Modeling} in {Biology}: {Practices} and {Challenges}},
	publisher = {Artech House},
	author = {Hedeler, C and Missier, P},
	year = {2007},
	keywords = {"Information Quality Management", "Biological Information Management"},
}

Managing information quality in e-science: the Qurator workbench. Missier, P; Embury, S M; Greenwood, R M; Preece, A D; and Jin, B In SIGMOD '07: Proceedings of the 2007 ACM SIGMOD international conference on Management of data, pages 1150–1152, New York, NY, USA, 2007. ACM

Managing information quality in e-science: the Qurator workbench [link]

Paper doi link bibtex

@inproceedings{missier_managing_2007,
	address = {New York, NY, USA},
	title = {Managing information quality in e-science: the {Qurator} workbench},
	isbn = {978-1-59593-686-8},
	url = {http://dx.doi.org/10.1145/1247480.1247638},
	doi = {http://dx.doi.org/10.1145/1247480.1247638},
	booktitle = {{SIGMOD} '07: {Proceedings} of the 2007 {ACM} {SIGMOD} international conference on {Management} of data},
	publisher = {ACM},
	author = {Missier, P and Embury, S M and Greenwood, R M and Preece, A D and Jin, B},
	year = {2007},
	keywords = {\#qurator, "Workflow Management", "Information Quality Management", "e-Science"},
	pages = {1150--1152},
}

Architectural patterns for the Semantic Grid. Kotsiopoulos, I; Missier, P; Alper, P; Corcho, O; Bechhofer, S; and Goble, C In Talia, D; A.Bilas; and Dikaiakos, M, editor(s), CoreGRID Institute on Knowledge and Data Management, Poznan Workshop, September 2005, volume XVIII, of CoreGRID Series, Knowledge and Data Management in GRIDs. Springer, 2007.
link bibtex

@incollection{kotsiopoulos_architectural_2007,
	series = {{CoreGRID} {Series}, {Knowledge} and {Data} {Management} in {GRIDs}},
	title = {Architectural patterns for the {Semantic} {Grid}},
	volume = {XVIII},
	booktitle = {{CoreGRID} {Institute} on {Knowledge} and {Data} {Management}, {Poznan} {Workshop}, {September} 2005},
	publisher = {Springer},
	author = {Kotsiopoulos, I and Missier, P and Alper, P and Corcho, O and Bechhofer, S and Goble, C},
	editor = {Talia, D and {A.Bilas} and Dikaiakos, M},
	year = {2007},
	keywords = {"Semantic Grid"},
}

Grid Metadata Management: requirements and architecture. Corcho, O.; Alper, P.; Missier, P.; Bechhofer, S.; and Goble, C. In 8th ACM/IEEE International Conference on Grid Computing (\GRID\ 2007), Austin, Texas, September 2007.

Grid Metadata Management: requirements and architecture [link]

Paper doi link bibtex

@inproceedings{corcho_grid_2007,
	address = {Austin, Texas},
	title = {Grid {Metadata} {Management}: requirements and architecture},
	url = {http://dx.doi.org/10.1109/GRID.2007.4354121},
	doi = {http://dx.doi.org/10.1109/GRID.2007.4354121},
	booktitle = {8th {ACM}/{IEEE} {International} {Conference} on {Grid} {Computing} (\{{GRID}\} 2007)},
	author = {Corcho, Oscar and Alper, Pinar and Missier, Paolo and Bechhofer, Sean and Goble, Carole},
	month = sep,
	year = {2007},
	keywords = {"metadata management", "Semantic Grid", "Semantic Web Services"},
}

Requirements and services for Metadata Management. Missier, P; Alper, P; Corcho, O; Dunlop, I; and Goble, C IEEE internet Computing, (Special issue on Semantic-Based Knowledge Management). 2007.

Requirements and services for Metadata Management [link]

Paper doi link bibtex 1 download

@article{missier_requirements_2007,
	title = {Requirements and services for {Metadata} {Management}},
	url = {http://doi.ieeecomputersociety.org/10.1109/MIC.2007.113},
	doi = {http://doi.ieeecomputersociety.org/10.1109/MIC.2007.113},
	number = {Special issue on Semantic-Based Knowledge Management},
	journal = {IEEE internet Computing},
	author = {Missier, P and Alper, P and Corcho, O and Dunlop, I and Goble, C},
	year = {2007},
	keywords = {"metadata management", "Semantic Grid", "Semantic Web Services"},
}

2006 (6)

Towards the Management of Information Quality in Proteomics. Preece, A D; Jin, B; Missier, P; Embury, S M; Stead, D; and Brown, A In Proceedings of 19th IEEE International Symposium on Computer-Based Medical Systems (CBMS'06), pages 936–940, Salt Lake City, US, 2006. IEEE Computer Society Press

Towards the Management of Information Quality in Proteomics [link]

Paper doi link bibtex

@inproceedings{preece_towards_2006,
	address = {Salt Lake City, US},
	title = {Towards the {Management} of {Information} {Quality} in {Proteomics}},
	url = {http://dx.doi.org/10.1109/CBMS.2006.160},
	doi = {http://dx.doi.org/10.1109/CBMS.2006.160},
	booktitle = {Proceedings of 19th {IEEE} {International} {Symposium} on {Computer}-{Based} {Medical} {Systems} ({CBMS}'06)},
	publisher = {IEEE Computer Society Press},
	author = {Preece, A D and Jin, B and Missier, P and Embury, S M and Stead, D and Brown, A},
	year = {2006},
	keywords = {"Workflow Management", "Information Quality Management", "e-Science"},
	pages = {936--940},
}

An overview of S-OGSA: A Reference Semantic Grid Architecture. Corcho, O.; Alper, P.; Kotsiopoulos, I.; Missier, P.; Bechhofer, S.; and Goble, C. Journal of Web Semantics, 4(2): 102–115. 2006.

An overview of S-OGSA: A Reference Semantic Grid Architecture [pdf]

Paper doi link bibtex abstract

@article{corcho_overview_2006,
	title = {An overview of {S}-{OGSA}: {A} {Reference} {Semantic} {Grid} {Architecture}},
	volume = {4},
	issn = {15708268},
	url = {http://www.mendeley.com/download/public/212462/837355462/77dfb167a0b74918da2d3f9fb2ad72eb295e2260/dl.pdf},
	doi = {10.1016/j.websem.2006.03.001},
	abstract = {The Grid's vision, of sharing diverse resources in a flexible, coordinated and secure manner through dynamic formation and disbanding of virtual communities, strongly depends on metadata. Currently, Grid metadata is generated and used in an ad hoc fashion, much of it buried in the Grid middleware's code libraries and database schemas. This ad hoc expression and use of metadata causes chronic dependency on human intervention during the operation of Grid machinery, leading to systems which are brittle when faced with frequent syntactic changes in resource coordination and sharing protocols. The Semantic Grid is an extension of the Grid in which rich resource metadata is exposed and handled explicitly, and shared and managed via Grid protocols. The layering of an explicit semantic infrastructure over the Grid Infrastructure potentially leads to increased interoperability and greater flexibility. In recent years, several projects have embraced the Semantic Grid vision. However, the Semantic Grid lacks a Reference Architecture or any kind of systematic framework for designing Semantic Grid components or applications. The Open Grid Service Architecture (OGSA) aims to define a core set of capabilities and behaviours for Grid systems. We propose a Reference Architecture that extends OGSA to support the explicit handling of semantics, and defines the associated knowledge services to support a spectrum of service capabilities. Guided by a set of design principles, Semantic-OGSA (S-OGSA) defines a model, the capabilities and the mechanisms for the Semantic Grid. We conclude by highlighting the commonalities and differences that the proposed architecture has with respect to other Grid frameworks.},
	number = {2},
	journal = {Journal of Web Semantics},
	author = {Corcho, Oscar and Alper, Pinar and Kotsiopoulos, Ioannis and Missier, Paolo and Bechhofer, Sean and Goble, Carole},
	year = {2006},
	keywords = {Semantics, Grid, Semantic Grid, Architecture, Explicit metadata},
	pages = {102--115},
}

Managing Information Quality in e-Science Using Semantic Web Technology. Preece, A D; Jin, B; Pignotti, E; Missier, P; Embury, S M; Stead, D; and Brown, A In ESWC, pages 472–486, 2006.

Managing Information Quality in e-Science Using Semantic Web Technology. [link]

Paper doi link bibtex

@inproceedings{preece_managing_2006,
	title = {Managing {Information} {Quality} in e-{Science} {Using} {Semantic} {Web} {Technology}.},
	url = {http://dx.doi.org/10.1007/11762256_35},
	doi = {http://dx.doi.org/10.1007/11762256_35},
	booktitle = {{ESWC}},
	author = {Preece, A D and Jin, B and Pignotti, E and Missier, P and Embury, S M and Stead, D and Brown, A},
	year = {2006},
	keywords = {\#qurator, "Information Quality Management", "e-Science", "Knowledge Representation", "Semantic Web", Automated Reasoning},
	pages = {472--486},
}

Practical data quality certification: model, architecture, and experiences. Missier, P; Oliaro, A; and Raffa, S In IQIS, International Workshop on Information Quality in Information Systems, 30 June 2006, Chicago, USA (SIGMOD 2006 Workshop), 2006. ACM
link bibtex

@inproceedings{missier_practical_2006,
	title = {Practical data quality certification: model, architecture, and experiences},
	booktitle = {{IQIS}, {International} {Workshop} on {Information} {Quality} in {Information} {Systems}, 30 {June} 2006, {Chicago}, {USA} ({SIGMOD} 2006 {Workshop})},
	publisher = {ACM},
	author = {Missier, P and Oliaro, A and Raffa, S},
	year = {2006},
	keywords = {"Information Quality Management", "Data Quality"},
}

Quality Views: Capturing and Exploiting the User Perspective on Data Quality. Missier, P; Embury, S M; Greenwood, M; Preece, A D; and Jin, B In Procs. VLDB, pages 977–988, Seoul, Korea, September 2006.

Quality Views: Capturing and Exploiting the User Perspective on Data Quality. [pdf]

Paper link bibtex

@inproceedings{missier_quality_2006,
	address = {Seoul, Korea},
	title = {Quality {Views}: {Capturing} and {Exploiting} the {User} {Perspective} on {Data} {Quality}.},
	url = {http://www.vldb.org/conf/2006/p977-missier.pdf},
	booktitle = {Procs. {VLDB}},
	author = {Missier, P and Embury, S M and Greenwood, M and Preece, A D and Jin, B},
	month = sep,
	year = {2006},
	keywords = {\#qurator, "Workflow Management", "Information Quality Management", "e-Science"},
	pages = {977--988},
}

Semantic Support For Meta-Scheduling in Grids. Missier, P; Wieder, P; and Ziegler, W In Knowledge and Data Management in Grids, volume 3, of CoreGRID. Springer, 2006.
link bibtex

@incollection{missier_semantic_2006,
	series = {{CoreGRID}},
	title = {Semantic {Support} {For} {Meta}-{Scheduling} in {Grids}},
	volume = {3},
	booktitle = {Knowledge and {Data} {Management} in {Grids}},
	publisher = {Springer},
	author = {Missier, P and Wieder, P and Ziegler, W},
	year = {2006},
	keywords = {"Semantic Grid", "Grid Systems", "Scheduling"},
}

2005 (6)

Provider issues in quality-constrained data provisioning. Missier, P; and Embury, S M In IQIS 2005, International Workshop on Information Quality in Information Systems, 17 June 2005, Baltimore, Maryland, USA (SIGMOD 2005 Workshop), pages 5–15, 2005.

Provider issues in quality-constrained data provisioning [link]

Paper doi link bibtex

@inproceedings{missier_provider_2005,
	title = {Provider issues in quality-constrained data provisioning},
	url = {http://dx.doi.org/10.1145/1077501.1077507},
	doi = {http://dx.doi.org/10.1145/1077501.1077507},
	booktitle = {{IQIS} 2005, {International} {Workshop} on {Information} {Quality} in {Information} {Systems}, 17 {June} 2005, {Baltimore}, {Maryland}, {USA} ({SIGMOD} 2005 {Workshop})},
	author = {Missier, P and Embury, S M},
	year = {2005},
	keywords = {"Information Quality Management"},
	pages = {5--15},
}

Clustering Web pages based on their structure. Crescenzi, V.; Merialdo, P.; and Missier, P. Data Knowl. Eng., 54: 279–299. 2005.

Clustering Web pages based on their structure. [link]

Paper doi link bibtex 1 download

@article{crescenzi_clustering_2005,
	title = {Clustering {Web} pages based on their structure.},
	volume = {54},
	url = {http://dx.doi.org/10.1016/j.datak.2004.11.004},
	doi = {http://dx.doi.org/10.1016/j.datak.2004.11.004},
	journal = {Data Knowl. Eng.},
	author = {Crescenzi, Valter and Merialdo, Paolo and Missier, Paolo},
	year = {2005},
	pages = {279--299},
}

An ontology-based approach to handling information quality in e-science. Missier, P; Embury, S; Greenwood, M; Preece, A; and Jin, B In Procs. 4th e-Science All Hands Meeting, Nottingham, UK, 2005.

An ontology-based approach to handling information quality in e-science [pdf]

Paper link bibtex

@inproceedings{missier_ontology-based_2005,
	address = {Nottingham, UK},
	title = {An ontology-based approach to handling information quality in e-science},
	url = {http://www.csd.abdn.ac.uk/∼apreece/qurator/resources/qurator_ahm_8page.pdf},
	booktitle = {Procs. 4th e-{Science} {All} {Hands} {Meeting}},
	author = {Missier, P and Embury, S and Greenwood, M and Preece, A and Jin, B},
	year = {2005},
	keywords = {\#qurator, "Information Quality Management", "Automated Reasoning", "e-Science"},
}

Managing Information Quality in e-Science: A Case Study in Proteomics. Missier, P; Preece, A; Embury, S; Jin, B; Greenwood, M; D.Stead; and Brown, A In 1st Workshop on Quality of Information Systems (QoIS 2005), Lecture Notes in Computer Science., volume 3770, pages 423–432, 2005. Springer

Managing Information Quality in e-Science: A Case Study in Proteomics [link]

Paper doi link bibtex

@inproceedings{missier_managing_2005,
	title = {Managing {Information} {Quality} in e-{Science}: {A} {Case} {Study} in {Proteomics}},
	volume = {3770},
	url = {http://dx.doi.org/10.1007/11568346_45},
	doi = {http://dx.doi.org/10.1007/11568346_45},
	booktitle = {1st {Workshop} on {Quality} of {Information} {Systems} ({QoIS} 2005), {Lecture} {Notes} in {Computer} {Science}.},
	publisher = {Springer},
	author = {Missier, P and Preece, A and Embury, S and Jin, B and Greenwood, M and {D.Stead} and Brown, A},
	year = {2005},
	keywords = {\#qurator, "Information Quality Management", "Automated Reasoning", "e-Science"},
	pages = {423--432},
}

Improving Government-to-business relationships through data reconciliation and process re-engineering. Bertoletti, M.; Missier, P.; Scannapieco, M.; Aimetti, P.; and Batini, C. In Advances in Management Information System Monograph Series, of Advances in Management Information System Monograph Series. April 2005.
link bibtex

@incollection{bertoletti_improving_2005,
	series = {Advances in {Management} {Information} {System} {Monograph} {Series}},
	title = {Improving {Government}-to-business relationships through data reconciliation and process re-engineering},
	booktitle = {Advances in {Management} {Information} {System} {Monograph} {Series}},
	author = {Bertoletti, Marco and Missier, Paolo and Scannapieco, Monica and Aimetti, Pietro and Batini, Carlo},
	month = apr,
	year = {2005},
}

Data Quality at a Glance. Scannapieco, M; Missier, P; and Batini, C Datenbank-Spektrum, 14: 6–14. 2005.

Paper link bibtex

@article{scannapieco_data_2005,
	title = {Data {Quality} at a {Glance}.},
	volume = {14},
	url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.85.555&rep=rep1&type=pdf},
	journal = {Datenbank-Spektrum},
	author = {Scannapieco, M and Missier, P and Batini, C},
	year = {2005},
	keywords = {Data Quality},
	pages = {6--14},
}

2004 (4)

An Automatic Data Grabber for Large Web Sites. Crescenzi, V.; Mecca, G.; Merialdo, P.; and Missier, P. In Procs. VLDB, pages 1321–1324, 2004.

An Automatic Data Grabber for Large Web Sites. [pdf]

Paper link bibtex

@inproceedings{crescenzi_automatic_2004,
	title = {An {Automatic} {Data} {Grabber} for {Large} {Web} {Sites}.},
	url = {http://www.vldb.org/conf/2004/DEMP18.PDF},
	booktitle = {Procs. {VLDB}},
	author = {Crescenzi, Valter and Mecca, Giansalvatore and Merialdo, Paolo and Missier, Paolo},
	year = {2004},
	pages = {1321--1324},
}

QoS in Multichannel IS: The MAIS Approach. Cappiello, C.; Missier, P.; Pernici, B.; Plebani, P.; and Batini, C. In ICWE Workshops, pages 255–268, 2004.

QoS in Multichannel IS: The MAIS Approach. [link]

Paper link bibtex

@inproceedings{cappiello_qos_2004,
	title = {{QoS} in {Multichannel} {IS}: {The} {MAIS} {Approach}.},
	url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.122.3805&rep=rep1&type=pdf},
	booktitle = {{ICWE} {Workshops}},
	author = {Cappiello, Cinzia and Missier, Paolo and Pernici, Barbara and Plebani, Pierluigi and Batini, Carlo},
	year = {2004},
	pages = {255--268},
}

Ontology-Based Question Answering in a Federation of University Sites: The MOSES Case Study. Atzeni, P.; Basili, R.; Hansen, D H; Missier, P.; Paggio, P.; Pazienza, M. T.; and Zanzotto, F. M. In Procs. NLDB, pages 413–420, 2004.

Ontology-Based Question Answering in a Federation of University Sites: The MOSES Case Study. [link]

Paper doi link bibtex

@inproceedings{atzeni_ontology-based_2004,
	title = {Ontology-{Based} {Question} {Answering} in a {Federation} of {University} {Sites}: {The} {MOSES} {Case} {Study}.},
	url = {http://dx.doi.org/10.1007/b98754},
	doi = {http://dx.doi.org/10.1007/b98754},
	booktitle = {Procs. {NLDB}},
	author = {Atzeni, Paolo and Basili, Roberto and Hansen, D H and Missier, Paolo and Paggio, Patrizia and Pazienza, Maria Teresa and Zanzotto, Fabio Massimo},
	year = {2004},
	pages = {413--420},
}

A formulation of the Data Quality Optimization Problem in Cooperative Information Systems. Avenali, A.; Bertolazzi, P.; Batini, C.; and Missier, P. In CAiSE Workshops (2), pages 49–63, 2004.

A formulation of the Data Quality Optimization Problem in Cooperative Information Systems. [link]

Paper link bibtex

@inproceedings{avenali_formulation_2004,
	title = {A formulation of the {Data} {Quality} {Optimization} {Problem} in {Cooperative} {Information} {Systems}.},
	url = {http://dblp.uni-trier.de/db/conf/caise/caisews2004-2.html#AvenaliBBM04},
	booktitle = {{CAiSE} {Workshops} (2)},
	author = {Avenali, Alessandro and Bertolazzi, Paola and Batini, Carlo and Missier, Paolo},
	year = {2004},
	pages = {49--63},
}

2003 (6)

Fine-grain web site structure discovery. Crescenzi, V.; Merialdo, P.; and Missier, P. In Procs. WIDM, pages 15–22, 2003.

Fine-grain web site structure discovery. [link]

Paper doi link bibtex 1 download

@inproceedings{crescenzi_fine-grain_2003,
	title = {Fine-grain web site structure discovery.},
	url = {http://dx.doi.org/10.1145/956699.956703},
	doi = {http://dx.doi.org/10.1145/956699.956703},
	booktitle = {Procs. {WIDM}},
	author = {Crescenzi, Valter and Merialdo, Paolo and Missier, Paolo},
	year = {2003},
	pages = {15--22},
}

The Service to Businesses Project: Improving Government-to-Business Relationships in Italy. Bertoletti, M.; Missier, P.; Scannapieco, M.; Aimetti, P.; and Batini, C. In Procs. EGOV, pages 468–471, 2003.

The Service to Businesses Project: Improving Government-to-Business Relationships in Italy. [link]

Paper doi link bibtex

@inproceedings{bertoletti_service_2003,
	title = {The {Service} to {Businesses} {Project}: {Improving} {Government}-to-{Business} {Relationships} in {Italy}.},
	url = {http://dx.doi.org/10.1007/b11827},
	doi = {http://dx.doi.org/10.1007/b11827},
	booktitle = {Procs. {EGOV}},
	author = {Bertoletti, Marco and Missier, Paolo and Scannapieco, Monica and Aimetti, Pietro and Batini, Carlo},
	year = {2003},
	pages = {468--471},
}

An Information Quality Management Framework for Cooperative Information Systems. Missier, P; and Batini, C In Procs. ISE 2003, Montreal, Canada, July 2003.
link bibtex

@inproceedings{missier_information_2003,
	address = {Montreal, Canada},
	title = {An {Information} {Quality} {Management} {Framework} for {Cooperative} {Information} {Systems}},
	booktitle = {Procs. {ISE} 2003},
	author = {Missier, P and Batini, C},
	month = jul,
	year = {2003},
}

A Multidimensional Model for Information Quality in Cooperative Systems. Missier, P; and Batini, C In Proceedings of 8th International Conference on Information Quality (IQ'03), pages 25–40, 2003.
link bibtex

@inproceedings{missier_multidimensional_2003,
	title = {A {Multidimensional} {Model} for {Information} {Quality} in {Cooperative} {Systems}},
	booktitle = {Proceedings of 8th {International} {Conference} on {Information} {Quality} ({IQ}'03)},
	author = {Missier, P and Batini, C},
	year = {2003},
	pages = {25--40},
}

A model for Information Quality management in Cooperative Information Systems. Missier, P.; and Batini, C. In SEBD, pages 191–206, 2003.
link bibtex

@inproceedings{missier_model_2003,
	title = {A model for {Information} {Quality} management in {Cooperative} {Information} {Systems}.},
	booktitle = {{SEBD}},
	author = {Missier, Paolo and Batini, Carlo},
	year = {2003},
	pages = {191--206},
}

Improving Data Quality in Practice: A Case Study in the Italian Public Administration. Missier, P; Lalk, G; Verykios, V S; Grillo, F; Lorusso, T; and Angeletti, P Distributed and Parallel Databases, 13: 135–160. 2003.

Improving Data Quality in Practice: A Case Study in the Italian Public Administration. [link]

Paper link bibtex

@article{missier_improving_2003,
	title = {Improving {Data} {Quality} in {Practice}: {A} {Case} {Study} in the {Italian} {Public} {Administration}.},
	volume = {13},
	url = {http://www.springerlink.com/content/x80m0245551j0202/},
	journal = {Distributed and Parallel Databases},
	author = {Missier, P and Lalk, G and Verykios, V S and Grillo, F and Lorusso, T and Angeletti, P},
	year = {2003},
	pages = {135--160},
}

2001 (2)

Eguru: a decision support system for the assisted design of e-commerce architectures. Missier, P; Bianchi, M; Zordan, A; and Umar, A In Knowledge Management & Intelligent Enterprises - Industrial Volume. Procs. 9th IFIP 2.6 Working Conference on Database Semantics (DS-9), Hong Kong, April 2001.

Eguru: a decision support system for the assisted design of e-commerce architectures [link]

Paper link bibtex

@inproceedings{missier_eguru_2001,
	address = {Hong Kong},
	title = {Eguru: a decision support system for the assisted design of e-commerce architectures},
	url = {http://books.google.co.uk/books?hl=en&lr=&id=K8Z8GvYrbEcC&oi=fnd&pg=PA69&dq=related:d41Mrh54DIgJ:scholar.google.com/&ots=zF1CSZ73jM&sig=pl3HEwWOt7sQ6HFWqWiNkR5YD5w#v=onepage&q=&f=false},
	booktitle = {Knowledge {Management} \& {Intelligent} {Enterprises} - {Industrial} {Volume}. {Procs}. 9th {IFIP} 2.6 {Working} {Conference} on {Database} {Semantics} ({DS}-9)},
	author = {Missier, P and Bianchi, M and Zordan, A and Umar, A},
	month = apr,
	year = {2001},
}

CitiTime: a system for rapid creation of portable next-generation telephony services. Anjum, F.; Caruso, F.; Jain, R.; Missier, P.; and Zordan, A. Computer Networks, 35: 579–595. 2001.

CitiTime: a system for rapid creation of portable next-generation telephony services. [link]

Paper link bibtex

@article{anjum_cititime_2001,
	title = {{CitiTime}: a system for rapid creation of portable next-generation telephony services.},
	volume = {35},
	url = {http://dx.doi.org/10.1016/S1389-1286(00)00195-X},
	journal = {Computer Networks},
	author = {Anjum, Farooq and Caruso, Francesco and Jain, Ravi and Missier, Paolo and Zordan, Adalberto},
	year = {2001},
	pages = {579--595},
}

2000 (3)

Demonstration of Telcordia's Database Reconciliation and Data Quality Analysis Tool. Caruso, F; Cochinwala, M; Ganapathy, U; Lalk, G; and Missier, P In VLDB 2000, September 10-14, 2000, Cairo, Egypt, pages 615–618, 2000. Morgan Kaufmann

Demonstration of Telcordia's Database Reconciliation and Data Quality Analysis Tool [link]

Paper link bibtex

@inproceedings{caruso_demonstration_2000,
	title = {Demonstration of {Telcordia}'s {Database} {Reconciliation} and {Data} {Quality} {Analysis} {Tool}},
	isbn = {1-55860-715-3},
	url = {http://dblp.uni-trier.de/db/conf/vldb/CarusoCGLM00.html},
	booktitle = {{VLDB} 2000, {September} 10-14, 2000, {Cairo}, {Egypt}},
	publisher = {Morgan Kaufmann},
	author = {Caruso, F and Cochinwala, M and Ganapathy, U and Lalk, G and Missier, P},
	year = {2000},
	pages = {615--618},
}

Java Call Control, Coordination and Transactions. Jain, R; Anjum, F; Missier, P; and Shastry, S IEEE Communications. January 2000.

Java Call Control, Coordination and Transactions [link]

Paper link bibtex

@article{jain_java_2000,
	title = {Java {Call} {Control}, {Coordination} and {Transactions}},
	url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.16.8466&rep=rep1&type=pdf},
	journal = {IEEE Communications},
	author = {Jain, R and Anjum, F and Missier, P and Shastry, S},
	month = jan,
	year = {2000},
}

A Knowledge-Based Decision Support Workbench for Advanced Ecommerce. Umar, A.; Bianchi, M.; Caruso, F.; and Missier, P. In AIWoRC, pages 93–100, 2000.

A Knowledge-Based Decision Support Workbench for Advanced Ecommerce. [link]

Paper link bibtex

@inproceedings{umar_knowledge-based_2000,
	title = {A {Knowledge}-{Based} {Decision} {Support} {Workbench} for {Advanced} {Ecommerce}.},
	url = {http://csdl.computer.org/comp/proceedings/aiworc/2000/0628/00/06280093abs.htm},
	booktitle = {{AIWoRC}},
	author = {Umar, Amjad and Bianchi, Michelle and Caruso, Francesco and Missier, Paolo},
	year = {2000},
	pages = {93--100},
}

1999 (4)

ChaiTime:A System for Rapid Creation of Portable Next-Generation Telephony Services Using Third-Party Software Components. Anjum, F; Caruso, F; Jain, R; Missier, P; and Zordan, A In Procs. Second IEEE Conference on Open Architectures and Network Programming (OpenArch), New York, March 1999.

ChaiTime:A System for Rapid Creation of Portable Next-Generation Telephony Services Using Third-Party Software Components [link]

Paper doi link bibtex

@inproceedings{anjum_chaitimesystem_1999,
	address = {New York},
	title = {{ChaiTime}:{A} {System} for {Rapid} {Creation} of {Portable} {Next}-{Generation} {Telephony} {Services} {Using} {Third}-{Party} {Software} {Components}},
	url = {http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=758431},
	doi = {10.1109/OPNARC.1999.758431},
	booktitle = {Procs. {Second} {IEEE} {Conference} on {Open} {Architectures} and {Network} {Programming} ({OpenArch})},
	author = {Anjum, F and Caruso, F and Jain, R and Missier, P and Zordan, A},
	month = mar,
	year = {1999},
}

Integration of Highly Fragmented Legacy Information Systems Through Object Modeling and Layered Wrappers. Mecella, M; Missier, P; Massari; and Batini, C In Procs. AICA99, Italy, 1999.
link bibtex

@inproceedings{mecella_integration_1999,
	address = {Italy},
	title = {Integration of {Highly} {Fragmented} {Legacy} {Information} {Systems} {Through} {Object} {Modeling} and {Layered} {Wrappers}},
	booktitle = {Procs. {AICA99}},
	author = {Mecella, M and Missier, P and {Massari} and Batini, C},
	year = {1999},
}

A Knowledge-based Decision Support Workbench for Enterprise Resource Integration and Migration. Umar, A; and Missier, P In Procs. First International Workshop on Enterprise Management and Resource Planning Systems (EMRPS99), Venice, Italy, 1999.
link bibtex

@inproceedings{umar_knowledge-based_1999,
	address = {Venice, Italy},
	title = {A {Knowledge}-based {Decision} {Support} {Workbench} for {Enterprise} {Resource} {Integration} and {Migration}},
	booktitle = {Procs. {First} {International} {Workshop} on {Enterprise} {Management} and {Resource} {Planning} {Systems} ({EMRPS99})},
	author = {Umar, A and Missier, P},
	year = {1999},
}

A Framework for Analyzing Virtual Enterprise Infrastructure. Umar, A.; and Missier, P. In RIDE, pages 4–11, 1999.

A Framework for Analyzing Virtual Enterprise Infrastructure. [link]

Paper link bibtex

@inproceedings{umar_framework_1999,
	title = {A {Framework} for {Analyzing} {Virtual} {Enterprise} {Infrastructure}.},
	url = {http://computer.org/conferen/proceed/ride/0119/01190004abs.htm},
	booktitle = {{RIDE}},
	author = {Umar, Amjad and Missier, Paolo},
	year = {1999},
	pages = {4--11},
}

1998 (2)

Multidatabase Languages. Missier, P; Rusinkiewicz, M; and Jin, W In Management of Heterogeneous and Autonomous Database Systems. Morgan Kauffman, 1998.

Paper link bibtex 1 download

@incollection{missier_multidatabase_1998,
	title = {Multidatabase {Languages}},
	url = {http://books.google.co.uk/books?hl=en&lr=&id=BvuTyyMtGbAC&oi=fnd&pg=PA175&dq=IFIP+Conference+Proceedings+1995+missier&ots=ubXv_IiwE0&sig=fUxrDxeLlchFL9ckMRndXZRXH9E},
	booktitle = {Management of {Heterogeneous} and {Autonomous} {Database} {Systems}},
	publisher = {Morgan Kauffman},
	author = {Missier, P and Rusinkiewicz, M and Jin, W},
	year = {1998},
}

Technology for the Copyright Protection of Digital Image. Missier, P In Monography Bullettin on archiving in Art History. Scuola Normale Superiore di Pisa, Centro Ricerche Informatiche, 1998.
link bibtex

@incollection{missier_technology_1998,
	title = {Technology for the {Copyright} {Protection} of {Digital} {Image}},
	booktitle = {Monography {Bullettin} on archiving in {Art} {History}},
	publisher = {Scuola Normale Superiore di Pisa, Centro Ricerche Informatiche},
	author = {Missier, P},
	year = {1998},
}

1995 (2)

Providing Multidatabase Access - an Association Approach. Missier, P; Rusinkiewicz, M; and Silberschatz, A In Procs. 6th International Hong Kong Computer Society Database Workshop on Database Reengineering and Interoperability, Hong Kong, March 1995.

Providing Multidatabase Access - an Association Approach [link]

Paper link bibtex

@inproceedings{missier_providing_1995,
	address = {Hong Kong},
	title = {Providing {Multidatabase} {Access} - an {Association} {Approach}},
	url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.30.4719},
	booktitle = {Procs. 6th {International} {Hong} {Kong} {Computer} {Society} {Database} {Workshop} on {Database} {Reengineering} and {Interoperability}},
	author = {Missier, P and Rusinkiewicz, M and Silberschatz, A},
	month = mar,
	year = {1995},
}

Extending a Multidatabase Manipulation Language to Resolve Schema and Data Conflicts. Missier, P.; and Rusinkiewicz, M. In DS-6, pages 93–115, 1995.

Extending a Multidatabase Manipulation Language to Resolve Schema and Data Conflicts. [link]

Paper link bibtex 1 download

@inproceedings{missier_extending_1995,
	title = {Extending a {Multidatabase} {Manipulation} {Language} to {Resolve} {Schema} and {Data} {Conflicts}.},
	url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.1.5127&rep=rep1&type=pdf},
	booktitle = {{DS}-6},
	author = {Missier, Paolo and Rusinkiewicz, Marek},
	year = {1995},
	pages = {93--115},
}

1992 (1)

Semantic unification in the inference of union types. Missier, P In Procs. GULP'92 (Logic Programming), Tremezzo, Como, Italy, 1992.
link bibtex

@inproceedings{missier_semantic_1992,
	address = {Tremezzo, Como, Italy},
	title = {Semantic unification in the inference of union types},
	booktitle = {Procs. {GULP}'92 ({Logic} {Programming})},
	author = {Missier, P},
	year = {1992},
}

undefined (1)

Optimising Fairness Through Parametrised Data Sampling. González-Zelaya, V.; Prangle, D.; Salas, J.; and Missier, P. ,6. .
link bibtex abstract

@article{gonzalez-zelaya_optimising_nodate,
	title = {Optimising {Fairness} {Through} {Parametrised} {Data} {Sampling}},
	abstract = {Improving machine learning models’ fairness is an active research topic, with most approaches focusing on specific definitions of fairness. In contrast, we propose ParDS, a parametrised data sampling method by which we can optimise the fairness ratios observed on a test set, in a way that is agnostic to both the specific fairness definitions, and the chosen classification model. Given a training set with one binary protected attribute and a binary label, our approach involves correcting the positive rate for both the favoured and unfavoured groups through resampling of the training set. We present experimental evidence showing that the amount of resampling can be optimised to achieve target fairness ratios for a specific training set and fairness definition, while preserving most of the model’s accuracy. We discuss conditions for the method to be viable, and then extend the method to include multiple protected attributes. In our experiments we use three different sampling strategies, and we report results for three commonly used definitions of fairness, and three public benchmark datasets: Adult Income, COMPAS and German Credit.},
	language = {en},
	author = {González-Zelaya, Vladimiro and Prangle, Dennis and Salas, Julián and Missier, Paolo},
	pages = {6},
	file = {González-Zelaya et al. - Optimising Fairness Through Parametrised Data Samp.pdf:/Users/npm65/Zotero/storage/G2VNJ4CU/González-Zelaya et al. - Optimising Fairness Through Parametrised Data Samp.pdf:application/pdf},
}