A Survey on Semi-supervised Learning for Delayed Partially Labelled Data Streams. Gomes, H. M., Grzenda, M., Mello, R., Read, J., Le Nguyen, M. H., & Bifet, A. ACM Computing Surveys, 55(4):75:1–75:42, November, 2022. Paper doi abstract bibtex Unlabelled data appear in many domains and are particularly relevant to streaming applications, where even though data is abundant, labelled data is rare. To address the learning problems associated with such data, one can ignore the unlabelled data and focus only on the labelled data (supervised learning); use the labelled data and attempt to leverage the unlabelled data (semi-supervised learning); or assume some labels will be available on request (active learning). The first approach is the simplest, yet the amount of labelled data available will limit the predictive performance. The second relies on finding and exploiting the underlying characteristics of the data distribution. The third depends on an external agent to provide the required labels in a timely fashion. This survey pays special attention to methods that leverage unlabelled data in a semi-supervised setting. We also discuss the delayed labelling issue, which impacts both fully supervised and semi-supervised methods. We propose a unified problem setting, discuss the learning guarantees and existing methods, and explain the differences between related problem settings. Finally, we review the current benchmarking practices and propose adaptations to enhance them.
@article{gomes_survey_2022,
  author   = {Gomes, Heitor Murilo and Grzenda, Maciej and Mello, Rodrigo and Read, Jesse and Le Nguyen, Minh Huong and Bifet, Albert},
  title    = {A {Survey} on {Semi}-supervised {Learning} for {Delayed} {Partially} {Labelled} {Data} {Streams}},
  journal  = {ACM Computing Surveys},
  volume   = {55},
  number   = {4},
  pages    = {75:1--75:42},
  month    = nov,
  year     = {2022},
  issn     = {0360-0300},
  doi      = {10.1145/3523055},
  url      = {https://dl.acm.org/doi/10.1145/3523055},
  urldate  = {2023-03-31},
  keywords = {Semi-supervised learning, concept drift, data streams, delayed labeling, verification latency},
  abstract = {Unlabelled data appear in many domains and are particularly relevant to streaming applications, where even though data is abundant, labelled data is rare. To address the learning problems associated with such data, one can ignore the unlabelled data and focus only on the labelled data (supervised learning); use the labelled data and attempt to leverage the unlabelled data (semi-supervised learning); or assume some labels will be available on request (active learning). The first approach is the simplest, yet the amount of labelled data available will limit the predictive performance. The second relies on finding and exploiting the underlying characteristics of the data distribution. The third depends on an external agent to provide the required labels in a timely fashion. This survey pays special attention to methods that leverage unlabelled data in a semi-supervised setting. We also discuss the delayed labelling issue, which impacts both fully supervised and semi-supervised methods. We propose a unified problem setting, discuss the learning guarantees and existing methods, and explain the differences between related problem settings. Finally, we review the current benchmarking practices and propose adaptations to enhance them.},
}
Downloads: 0
{"_id":"JTpDDcTSxhreN7oFq","bibbaseid":"gomes-grzenda-mello-read-lenguyen-bifet-asurveyonsemisupervisedlearningfordelayedpartiallylabelleddatastreams-2022","author_short":["Gomes, H. M.","Grzenda, M.","Mello, R.","Read, J.","Le Nguyen, M. H.","Bifet, A."],"bibdata":{"bibtype":"article","type":"article","title":"A Survey on Semi-supervised Learning for Delayed Partially Labelled Data Streams","volume":"55","issn":"0360-0300","url":"https://dl.acm.org/doi/10.1145/3523055","doi":"10.1145/3523055","abstract":"Unlabelled data appear in many domains and are particularly relevant to streaming applications, where even though data is abundant, labelled data is rare. To address the learning problems associated with such data, one can ignore the unlabelled data and focus only on the labelled data (supervised learning); use the labelled data and attempt to leverage the unlabelled data (semi-supervised learning); or assume some labels will be available on request (active learning). The first approach is the simplest, yet the amount of labelled data available will limit the predictive performance. The second relies on finding and exploiting the underlying characteristics of the data distribution. The third depends on an external agent to provide the required labels in a timely fashion. This survey pays special attention to methods that leverage unlabelled data in a semi-supervised setting. We also discuss the delayed labelling issue, which impacts both fully supervised and semi-supervised methods. We propose a unified problem setting, discuss the learning guarantees and existing methods, and explain the differences between related problem settings. 
Finally, we review the current benchmarking practices and propose adaptations to enhance them.","number":"4","urldate":"2023-03-31","journal":"ACM Computing Surveys","author":[{"propositions":[],"lastnames":["Gomes"],"firstnames":["Heitor","Murilo"],"suffixes":[]},{"propositions":[],"lastnames":["Grzenda"],"firstnames":["Maciej"],"suffixes":[]},{"propositions":[],"lastnames":["Mello"],"firstnames":["Rodrigo"],"suffixes":[]},{"propositions":[],"lastnames":["Read"],"firstnames":["Jesse"],"suffixes":[]},{"propositions":[],"lastnames":["Le","Nguyen"],"firstnames":["Minh","Huong"],"suffixes":[]},{"propositions":[],"lastnames":["Bifet"],"firstnames":["Albert"],"suffixes":[]}],"month":"November","year":"2022","keywords":"Semi-supervised learning, concept drift, data streams, delayed labeling, verification latency","pages":"75:1–75:42","bibtex":"@article{gomes_survey_2022,\n\ttitle = {A {Survey} on {Semi}-supervised {Learning} for {Delayed} {Partially} {Labelled} {Data} {Streams}},\n\tvolume = {55},\n\tissn = {0360-0300},\n\turl = {https://dl.acm.org/doi/10.1145/3523055},\n\tdoi = {10.1145/3523055},\n\tabstract = {Unlabelled data appear in many domains and are particularly relevant to streaming applications, where even though data is abundant, labelled data is rare. To address the learning problems associated with such data, one can ignore the unlabelled data and focus only on the labelled data (supervised learning); use the labelled data and attempt to leverage the unlabelled data (semi-supervised learning); or assume some labels will be available on request (active learning). The first approach is the simplest, yet the amount of labelled data available will limit the predictive performance. The second relies on finding and exploiting the underlying characteristics of the data distribution. The third depends on an external agent to provide the required labels in a timely fashion. 
This survey pays special attention to methods that leverage unlabelled data in a semi-supervised setting. We also discuss the delayed labelling issue, which impacts both fully supervised and semi-supervised methods. We propose a unified problem setting, discuss the learning guarantees and existing methods, and explain the differences between related problem settings. Finally, we review the current benchmarking practices and propose adaptations to enhance them.},\n\tnumber = {4},\n\turldate = {2023-03-31},\n\tjournal = {ACM Computing Surveys},\n\tauthor = {Gomes, Heitor Murilo and Grzenda, Maciej and Mello, Rodrigo and Read, Jesse and Le Nguyen, Minh Huong and Bifet, Albert},\n\tmonth = nov,\n\tyear = {2022},\n\tkeywords = {Semi-supervised learning, concept drift, data streams, delayed labeling, verification latency},\n\tpages = {75:1--75:42},\n}\n\n\n\n","author_short":["Gomes, H. M.","Grzenda, M.","Mello, R.","Read, J.","Le Nguyen, M. H.","Bifet, A."],"key":"gomes_survey_2022","id":"gomes_survey_2022","bibbaseid":"gomes-grzenda-mello-read-lenguyen-bifet-asurveyonsemisupervisedlearningfordelayedpartiallylabelleddatastreams-2022","role":"author","urls":{"Paper":"https://dl.acm.org/doi/10.1145/3523055"},"keyword":["Semi-supervised learning","concept drift","data streams","delayed labeling","verification latency"],"metadata":{"authorlinks":{}},"html":""},"bibtype":"article","biburl":"https://bibbase.org/zotero/mh_lenguyen","dataSources":["iwKepCrWBps7ojhDx"],"keywords":["semi-supervised learning","concept drift","data streams","delayed labeling","verification latency"],"search_terms":["survey","semi","supervised","learning","delayed","partially","labelled","data","streams","gomes","grzenda","mello","read","le nguyen","bifet"],"title":"A Survey on Semi-supervised Learning for Delayed Partially Labelled Data Streams","year":2022}