doi abstract bibtex

An increasing number of real-world applications are associated with streaming data drawn from drifting and nonstationary distributions that change over time. These applications demand new algorithms that can learn and adapt to such changes, also known as concept drift. Proper characterization of such data with existing approaches typically requires substantial amount of labeled instances, which may be difficult, expensive, or even impractical to obtain. In this paper, we introduce compacted object sample extraction (COMPOSE), a computational geometry-based framework to learn from nonstationary streaming data, where labels are unavailable (or presented very sporadically) after initialization. We introduce the algorithm in detail, and discuss its results and performances on several synthetic and real-world data sets, which demonstrate the ability of the algorithm to learn under several different scenarios of initially labeled streaming environments. On carefully designed synthetic data sets, we compare the performance of COMPOSE against the optimal Bayes classifier, as well as the arbitrary subpopulation tracker algorithm, which addresses a similar environment referred to as extreme verification latency. Furthermore, using the real-world National Oceanic and Atmospheric Administration weather data set, we demonstrate that COMPOSE is competitive even with a well-established and fully supervised nonstationary learning algorithm that receives labeled data in every batch.

@comment{
  Entry Dyer2014: journal article introducing the COMPOSE framework.
  Conventions applied in this record:
    - the acronym COMPOSE is brace-protected in Title so sentence-casing
      styles do not lowercase it;
    - Month uses the predefined jan macro (unquoted) so the style decides
      how the month is rendered;
    - Pages uses a double hyphen for the range;
    - Review and Timestamp are non-standard fields that standard styles
      ignore; they carry the reader's notes and the date the record was added.
}
@Article{Dyer2014,
  Title     = {{COMPOSE}: A Semisupervised Learning Framework for Initially Labeled Nonstationary Streaming Data},
  Author    = {Dyer, K. B. and Capo, R. and Polikar, R.},
  Journal   = {IEEE Transactions on Neural Networks and Learning Systems},
  Year      = {2014},
  Month     = jan,
  Volume    = {25},
  Number    = {1},
  Pages     = {12--26},
  Doi       = {10.1109/TNNLS.2013.2277712},
  ISSN      = {2162-237X},
  Abstract  = {An increasing number of real-world applications are associated with streaming data drawn from drifting and nonstationary distributions that change over time. These applications demand new algorithms that can learn and adapt to such changes, also known as concept drift. Proper characterization of such data with existing approaches typically requires substantial amount of labeled instances, which may be difficult, expensive, or even impractical to obtain. In this paper, we introduce compacted object sample extraction (COMPOSE), a computational geometry-based framework to learn from nonstationary streaming data, where labels are unavailable (or presented very sporadically) after initialization. We introduce the algorithm in detail, and discuss its results and performances on several synthetic and real-world data sets, which demonstrate the ability of the algorithm to learn under several different scenarios of initially labeled streaming environments. On carefully designed synthetic data sets, we compare the performance of COMPOSE against the optimal Bayes classifier, as well as the arbitrary subpopulation tracker algorithm, which addresses a similar environment referred to as extreme verification latency. Furthermore, using the real-world National Oceanic and Atmospheric Administration weather data set, we demonstrate that COMPOSE is competitive even with a well-established and fully supervised nonstationary learning algorithm that receives labeled data in every batch.},
  Keywords  = {Bayes methods;computational geometry;data handling;geophysics computing;learning (artificial intelligence);meteorology;pattern classification;COMPOSE;compacted object sample extraction;computational geometry-based framework;concept drift;drifting distributions;extreme verification latency;initially labeled nonstationary streaming data;labeled instances;nonstationary distributions;optimal Bayes classifier;real-world national oceanic and atmospheric administration weather data set;semisupervised learning framework;Algorithm design and analysis;Classification algorithms;Data mining;Semisupervised learning;Shape;Signal processing algorithms;Alpha shape;concept drift;nonstationary environment;semisupervised learning (SSL);verification latency},
  Review    = {A form of clustering approach, where they attempt to track slowly moving distributions: - by constructing an alpha shape (a n-dimensional convex hull) around the data points of the two classes - classify unknown data points with this classifier - remove outlying data points to form a core shape - the data points in the core shape is used train the next iteration of the classifier This way, the classifier is pruning out outlier points, and can move the distribution according to the input data, assuming the jumps are not dramatic.},
  Timestamp = {2015.04.14},
}

Downloads: 0