Content Profiling for Preservation: Improving Scale, Depth and Quality. Kulmukhametov, A. & Becker, C. In Tuamsuk, K., Jatowt, A., & Rasmussen, E., editors, The Emergence of Digital Libraries – Research and Practices, volume 8839 of Lecture Notes in Computer Science, pages 1–11. Springer International Publishing, November, 2014. abstract bibtex Content profiling in digital preservation is a crucial step that enables controlled management of content over time. However, large-scale profiling is facing a set of challenges. As data grows and gets more diverse, the only option to control it is to combine outputs of multiple characterization tools to cover the varieties of formats and extract features of interest. This cooperation of tools introduces conflicting measures and poses challenges on data quality. Sparsity and labeling conflicts make it difficult or impossible to partition, sample and analyze large metadata sets of a content profile. Without this, however, it is virtually impossible to manage heterogeneous collections reliably over time. In this paper, we present the content profiling tool C3PO, which includes rule-based techniques and heuristics designed for conflict reduction. We conduct a set of experiments in which we assess the effect of creating such a mechanisms and rule set on the quality and effectiveness of content profiling. The results show the potential of simple conflict reduction rules to strongly improve data quality of content profiling for analysis and decision support.
% Chapter in an edited collection published in the Lecture Notes in Computer
% Science series; 8839 is the number of the book within that series, recorded
% as the volume so that styles print "volume 8839 of <series>".
% Titles are stored in Title Case without blanket brace protection so the
% bibliography style decides the final casing.
@incollection{kulmukhametov_content_2014,
  author     = {Kulmukhametov, Artur and Becker, Christoph},
  title      = {Content Profiling for Preservation: Improving Scale, Depth and Quality},
  shorttitle = {Content Profiling for Preservation},
  editor     = {Tuamsuk, Kulthida and Jatowt, Adam and Rasmussen, Edie},
  booktitle  = {The Emergence of Digital Libraries -- Research and Practices},
  series     = {Lecture Notes in Computer Science},
  volume     = {8839},
  publisher  = {Springer International Publishing},
  month      = nov,
  year       = {2014},
  pages      = {1--11},
  isbn       = {978-3-319-12822-1 978-3-319-12823-8},
  language   = {en},
  copyright  = {©2014 Springer International Publishing Switzerland},
  urldate    = {2015-07-25},
  keywords   = {Characterization, Conflict Reduction, Content Profiling, Database Management, Document Preparation and Text Processing, Information Storage and Retrieval, Information Systems Applications (incl. Internet), digital preservation},
  abstract   = {Content profiling in digital preservation is a crucial step that enables controlled management of content over time. However, large-scale profiling is facing a set of challenges. As data grows and gets more diverse, the only option to control it is to combine outputs of multiple characterization tools to cover the varieties of formats and extract features of interest. This cooperation of tools introduces conflicting measures and poses challenges on data quality. Sparsity and labeling conflicts make it difficult or impossible to partition, sample and analyze large metadata sets of a content profile. Without this, however, it is virtually impossible to manage heterogeneous collections reliably over time. In this paper, we present the content profiling tool C3PO, which includes rule-based techniques and heuristics designed for conflict reduction. We conduct a set of experiments in which we assess the effect of creating such a mechanisms and rule set on the quality and effectiveness of content profiling. The results show the potential of simple conflict reduction rules to strongly improve data quality of content profiling for analysis and decision support.},
}
Downloads: 0
{"_id":"87oxCkT9F7gXhgZAX","bibbaseid":"kulmukhametov-becker-contentprofilingforpreservationimprovingscaledepthandquality-2014","authorIDs":["5dab7de1c19d46da01000042"],"author_short":["Kulmukhametov, A.","Becker, C."],"bibdata":{"bibtype":"incollection","type":"incollection","series":"Lecture Notes in Computer Science","title":"Content Profiling for Preservation: Improving Scale, Depth and Quality","copyright":"©2014 Springer International Publishing Switzerland","isbn":"978-3-319-12822-1 978-3-319-12823-8","shorttitle":"Content Profiling for Preservation","abstract":"Content profiling in digital preservation is a crucial step that enables controlled management of content over time. However, large-scale profiling is facing a set of challenges. As data grows and gets more diverse, the only option to control it is to combine outputs of multiple characterization tools to cover the varieties of formats and extract features of interest. This cooperation of tools introduces conflicting measures and poses challenges on data quality. Sparsity and labeling conflicts make it difficult or impossible to partition, sample and analyze large metadata sets of a content profile. Without this, however, it is virtually impossible to manage heterogeneous collections reliably over time. In this paper, we present the content profiling tool C3PO, which includes rule-based techniques and heuristics designed for conflict reduction. We conduct a set of experiments in which we assess the effect of creating such a mechanisms and rule set on the quality and effectiveness of content profiling. 
The results show the potential of simple conflict reduction rules to strongly improve data quality of content profiling for analysis and decision support.","language":"en","number":"8839","urldate":"2015-07-25","booktitle":"The Emergence of Digital Libraries – Research and Practices","publisher":"Springer International Publishing","author":[{"propositions":[],"lastnames":["Kulmukhametov"],"firstnames":["Artur"],"suffixes":[]},{"propositions":[],"lastnames":["Becker"],"firstnames":["Christoph"],"suffixes":[]}],"editor":[{"propositions":[],"lastnames":["Tuamsuk"],"firstnames":["Kulthida"],"suffixes":[]},{"propositions":[],"lastnames":["Jatowt"],"firstnames":["Adam"],"suffixes":[]},{"propositions":[],"lastnames":["Rasmussen"],"firstnames":["Edie"],"suffixes":[]}],"month":"November","year":"2014","keywords":"Characterization, Conflict Reduction, Content Profiling, Database Management, Document Preparation and Text Processing, Information Storage and Retrieval, Information Systems Applications (incl. Internet), digital preservation","pages":"1–11","bibtex":"@incollection{kulmukhametov_content_2014,\n\tseries = {Lecture {Notes} in {Computer} {Science}},\n\ttitle = {Content {Profiling} for {Preservation}: {Improving} {Scale}, {Depth} and {Quality}},\n\tcopyright = {©2014 Springer International Publishing Switzerland},\n\tisbn = {978-3-319-12822-1 978-3-319-12823-8},\n\tshorttitle = {Content {Profiling} for {Preservation}},\n\tabstract = {Content profiling in digital preservation is a crucial step that enables controlled management of content over time. However, large-scale profiling is facing a set of challenges. As data grows and gets more diverse, the only option to control it is to combine outputs of multiple characterization tools to cover the varieties of formats and extract features of interest. This cooperation of tools introduces conflicting measures and poses challenges on data quality. 
Sparsity and labeling conflicts make it difficult or impossible to partition, sample and analyze large metadata sets of a content profile. Without this, however, it is virtually impossible to manage heterogeneous collections reliably over time. In this paper, we present the content profiling tool C3PO, which includes rule-based techniques and heuristics designed for conflict reduction. We conduct a set of experiments in which we assess the effect of creating such a mechanisms and rule set on the quality and effectiveness of content profiling. The results show the potential of simple conflict reduction rules to strongly improve data quality of content profiling for analysis and decision support.},\n\tlanguage = {en},\n\tnumber = {8839},\n\turldate = {2015-07-25},\n\tbooktitle = {The {Emergence} of {Digital} {Libraries} – {Research} and {Practices}},\n\tpublisher = {Springer International Publishing},\n\tauthor = {Kulmukhametov, Artur and Becker, Christoph},\n\teditor = {Tuamsuk, Kulthida and Jatowt, Adam and Rasmussen, Edie},\n\tmonth = nov,\n\tyear = {2014},\n\tkeywords = {Characterization, Conflict Reduction, Content Profiling, Database Management, Document Preparation and Text Processing, Information Storage and Retrieval, Information Systems Applications (incl. Internet), digital preservation},\n\tpages = {1--11}\n}\n\n","author_short":["Kulmukhametov, A.","Becker, C."],"editor_short":["Tuamsuk, K.","Jatowt, A.","Rasmussen, E."],"key":"kulmukhametov_content_2014","id":"kulmukhametov_content_2014","bibbaseid":"kulmukhametov-becker-contentprofilingforpreservationimprovingscaledepthandquality-2014","role":"author","urls":{},"keyword":["Characterization","Conflict Reduction","Content Profiling","Database Management","Document Preparation and Text Processing","Information Storage and Retrieval","Information Systems Applications (incl. 
Internet)","digital preservation"],"downloads":0},"bibtype":"incollection","biburl":"https://api.zotero.org/users/1882260/collections/WY5EN7U2/items?key=8lA9TgFr1iNyPVQJeyuedzZ6&format=bibtex&limit=100","creationDate":"2019-10-19T21:19:29.401Z","downloads":0,"keywords":["characterization","conflict reduction","content profiling","database management","document preparation and text processing","information storage and retrieval","information systems applications (incl. internet)","digital preservation"],"search_terms":["content","profiling","preservation","improving","scale","depth","quality","kulmukhametov","becker"],"title":"Content Profiling for Preservation: Improving Scale, Depth and Quality","year":2014,"dataSources":["oeMgiGwhuu6rrzzZD"]}