Volume 8726 LNAI, 2014.

abstract bibtex

abstract bibtex

We propose a set of novel methodologies which enable valid statistical hypothesis testing when we have only positive and unlabelled (PU) examples. This type of problem, a special case of semi-supervised data, is common in text mining, bioinformatics, and computer vision. Focusing on a generalised likelihood ratio test, we have 3 key contributions: (1) a proof that assuming all unlabelled examples are negative cases is sufficient for independence testing, but not for power analysis activities; (2) a new methodology that compensates this and enables power analysis, allowing sample size determination for observing an effect with a desired power; and finally, (3) a new capability, supervision determination, which can determine a-priori the number of labelled examples the user must collect before being able to observe a desired statistical effect. Beyond general hypothesis testing, we suggest the tools will additionally be useful for information theoretic feature selection, and Bayesian Network structure learning. © 2014 Springer-Verlag.

% Reference-manager export for "Statistical hypothesis testing in positive
% unlabelled data" (Sechidis, Calvo, Brown; 2014).
% Changes relative to the raw export:
%   - added a citation key (the export had none, so the title field was
%     swallowed as the key and the entry could not be cited);
%   - pages use the BibTeX range form with a double hyphen;
%   - the series abbreviation was moved out of the volume field;
%   - publisher added, taken from the copyright line of the abstract.
% NOTE(review): a page range of 66--81 suggests this is a contribution inside
% volume 8726 rather than a whole book. Confirm against the publisher record
% and, if so, switch the entry type to inproceedings/incollection and add the
% booktitle. The type is left as exported because the booktitle is not
% available here.
% NOTE(review): series is expanded from the exported abbreviation "LNAI";
% verify the exact series name. "issue" is kept as exported (PART 3).
@book{sechidis2014statistical,
  author              = {Sechidis, K and Calvo, B and Brown, G},
  title               = {Statistical hypothesis testing in positive unlabelled data},
  type                = {book},
  year                = {2014},
  pages               = {66--81},
  volume              = {8726},
  series              = {Lecture Notes in Artificial Intelligence},
  issue               = {PART 3},
  publisher           = {Springer-Verlag},
  abstract            = {We propose a set of novel methodologies which enable valid statistical hypothesis testing when we have only positive and unlabelled (PU) examples. This type of problem, a special case of semi-supervised data, is common in text mining, bioinformatics, and computer vision. Focusing on a generalised likelihood ratio test, we have 3 key contributions: (1) a proof that assuming all unlabelled examples are negative cases is sufficient for independence testing, but not for power analysis activities; (2) a new methodology that compensates this and enables power analysis, allowing sample size determination for observing an effect with a desired power; and finally, (3) a new capability, supervision determination, which can determine a-priori the number of labelled examples the user must collect before being able to observe a desired statistical effect. Beyond general hypothesis testing, we suggest the tools will additionally be useful for information theoretic feature selection, and Bayesian Network structure learning. © 2014 Springer-Verlag.},
  id                  = {6f06d9c1-29b5-39f0-a0fe-53ded68ac897},
  created             = {2021-11-12T08:30:19.550Z},
  last_modified       = {2021-11-12T08:30:19.550Z},
  file_attached       = {false},
  profile_id          = {789246de-927b-32cc-ae4f-1b7e2b31674c},
  group_id            = {e3c82d43-35db-3bbb-b28a-0fd521d70498},
  read                = {false},
  starred             = {false},
  authored            = {false},
  confirmed           = {false},
  hidden              = {false},
  source_type         = {book},
  private_publication = {false},
  bibtype             = {book},
}

Downloads: 0