BKinD-3D: Self-Supervised 3D Keypoint Discovery from Multi-View Videos. Sun, J. J., Karashchuk, P., Dravid, A., Ryou, S., Fereidooni, S., Tuthill, J., Katsaggelos, A., Brunton, B. W., Gkioxari, G., Kennedy, A., Yue, Y., & Perona, P. arXiv preprint arXiv:2212.07401, dec, 2022.
BKinD-3D: Self-Supervised 3D Keypoint Discovery from Multi-View Videos [link]Paper  abstract   bibtex   
Quantifying motion in 3D is important for studying the behavior of humans and other animals, but manual pose annotations are expensive and time-consuming to obtain. Self-supervised keypoint discovery is a promising strategy for estimating 3D poses without annotations. However, current keypoint discovery approaches commonly process single 2D views and do not operate in the 3D space. We propose a new method to perform self-supervised keypoint discovery in 3D from multi-view videos of behaving agents, without any keypoint or bounding box supervision in 2D or 3D. Our method uses an encoder-decoder architecture with a 3D volumetric heatmap, trained to reconstruct spatiotemporal differences across multiple views, in addition to joint length constraints on a learned 3D skeleton of the subject. In this way, we discover keypoints without requiring manual supervision in videos of humans and rats, demonstrating the potential of 3D keypoint discovery for studying behavior.
% NOTE(review): cleaned auto-export artifacts — month macro unquoted, title braces
% limited to case-sensitive tokens, nonstandard `arxivId` duplicate of `eprint` removed.
@article{Jennifer2022,
  abstract      = {Quantifying motion in 3D is important for studying the behavior of humans and other animals, but manual pose annotations are expensive and time-consuming to obtain. Self-supervised keypoint discovery is a promising strategy for estimating 3D poses without annotations. However, current keypoint discovery approaches commonly process single 2D views and do not operate in the 3D space. We propose a new method to perform self-supervised keypoint discovery in 3D from multi-view videos of behaving agents, without any keypoint or bounding box supervision in 2D or 3D. Our method uses an encoder-decoder architecture with a 3D volumetric heatmap, trained to reconstruct spatiotemporal differences across multiple views, in addition to joint length constraints on a learned 3D skeleton of the subject. In this way, we discover keypoints without requiring manual supervision in videos of humans and rats, demonstrating the potential of 3D keypoint discovery for studying behavior.},
  archivePrefix = {arXiv},
  author        = {Sun, Jennifer J. and Karashchuk, Pierre and Dravid, Amil and Ryou, Serim and Fereidooni, Sonia and Tuthill, John and Katsaggelos, Aggelos and Brunton, Bingni W. and Gkioxari, Georgia and Kennedy, Ann and Yue, Yisong and Perona, Pietro},
  eprint        = {2212.07401},
  journal       = {arXiv preprint arXiv:2212.07401},
  month         = dec,
  title         = {{BKinD-3D}: Self-Supervised {3D} Keypoint Discovery from Multi-View Videos},
  url           = {https://arxiv.org/abs/2212.07401},
  year          = {2022}
}

Downloads: 0