To embed this publication list on a webpage, use one of the following snippets.

<!-- Option 1: JavaScript embed (BibBase renders the list directly in the visitor's browser) -->
<script src="https://bibbase.org/service/mendeley/bfbbf840-4c42-3914-a463-19024f50b30c/group/1ff583c0-be37-34fa-9c04-73c69437d354?jsonp=1"></script>

<!-- Option 2: server-side PHP (fetches the rendered HTML and prints it into the page; requires allow_url_fopen) -->
<?php
  $contents = file_get_contents("https://bibbase.org/service/mendeley/bfbbf840-4c42-3914-a463-19024f50b30c/group/1ff583c0-be37-34fa-9c04-73c69437d354");
  print_r($contents); // $contents is a string of HTML, so this simply prints it
?>

<!-- Option 3: iframe embed -->
<iframe src="https://bibbase.org/service/mendeley/bfbbf840-4c42-3914-a463-19024f50b30c/group/1ff583c0-be37-34fa-9c04-73c69437d354"></iframe>
For more details, see the BibBase documentation.
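If allow_url_fopen is disabled on the server, the same page can be fetched with cURL instead. The sketch below is an illustrative alternative to Option 2, not part of the BibBase instructions above; the timeout value and the fallback message are assumptions.

<?php
// Minimal cURL-based alternative to Option 2 (illustrative sketch, not official BibBase code).
$url = "https://bibbase.org/service/mendeley/bfbbf840-4c42-3914-a463-19024f50b30c/group/1ff583c0-be37-34fa-9c04-73c69437d354";

$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // return the response as a string instead of printing it
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); // follow redirects, if any
curl_setopt($ch, CURLOPT_TIMEOUT, 15);          // assumed timeout so the page does not hang on a slow response

$contents = curl_exec($ch);
if ($contents === false) {
    // Degrade gracefully if BibBase is unreachable; the message is a placeholder.
    error_log("BibBase fetch failed: " . curl_error($ch));
    $contents = "<p>The publication list is temporarily unavailable.</p>";
}
curl_close($ch);

echo $contents;
?>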
@article{ title = {Fast evaluation of real spherical harmonics and their derivatives in Cartesian coordinates}, type = {article}, year = {2023}, pages = {1-8}, websites = {http://arxiv.org/abs/2302.08381}, id = {fc9fc073-ca3e-3cb3-ac41-61f2af9c079e}, created = {2023-05-03T13:16:39.666Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.693Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Bigi2023}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Spherical harmonics provide a smooth, orthogonal, and symmetry-adapted basis to expand functions on a sphere, and they are used routinely in computer graphics, signal processing and different fields of science, from geology to quantum chemistry. More recently, spherical harmonics have become a key component of rotationally equivariant models for geometric deep learning, where they are used in combination with distance-dependent functions to describe the distribution of neighbors within local spherical environments within a point cloud. We present a fast and elegant algorithm for the evaluation of the real-valued spherical harmonics. Our construction integrates many of the desirable features of existing schemes and allows to compute Cartesian derivatives in a numerically stable and computationally efficient manner. We provide an efficient C implementation of the proposed algorithm, along with easy-to-use Python bindings.}, bibtype = {article}, author = {Bigi, Filippo and Ceriotti, Michele} }
@article{ title = {Reducing SO(3) Convolutions to SO(2) for Efficient Equivariant GNNs}, type = {article}, year = {2023}, websites = {http://arxiv.org/abs/2302.03655}, id = {edcf5002-2e33-347a-bdc8-380683d5a8b0}, created = {2023-05-03T13:16:40.626Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:25.626Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Passaro2023}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Graph neural networks that model 3D data, such as point clouds or atoms, are typically desired to be $SO(3)$ equivariant, i.e., equivariant to 3D rotations. Unfortunately equivariant convolutions, which are a fundamental operation for equivariant networks, increase significantly in computational complexity as higher-order tensors are used. In this paper, we address this issue by reducing the $SO(3)$ convolutions or tensor products to mathematically equivalent convolutions in $SO(2)$ . This is accomplished by aligning the node embeddings' primary axis with the edge vectors, which sparsifies the tensor product and reduces the computational complexity from $O(L^6)$ to $O(L^3)$, where $L$ is the degree of the representation. We demonstrate the potential implications of this improvement by proposing the Equivariant Spherical Channel Network (eSCN), a graph neural network utilizing our novel approach to equivariant convolutions, which achieves state-of-the-art results on the large-scale OC-20 dataset.}, bibtype = {article}, author = {Passaro, Saro and Zitnick, C. Lawrence}, number = {3} }
@article{ title = {3D Spectral Domain Registration-Based Visual Servoing}, type = {article}, year = {2023}, websites = {http://arxiv.org/abs/2303.15857}, id = {c52d0131-474c-337a-8f32-b5cf4c113f79}, created = {2023-05-03T13:16:40.844Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.388Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Adjigble2023}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {This paper presents a spectral domain registration-based visual servoing scheme that works on 3D point clouds. Specifically, we propose a 3D model/point cloud alignment method, which works by finding a global transformation between reference and target point clouds using spectral analysis. A 3D Fast Fourier Transform (FFT) in R3 is used for the translation estimation, and the real spherical harmonics in SO(3) are used for the rotations estimation. Such an approach allows us to derive a decoupled 6 degrees of freedom (DoF) controller, where we use gradient ascent optimisation to minimise translation and rotational costs. We then show how this methodology can be used to regulate a robot arm to perform a positioning task. In contrast to the existing state-of-the-art depth-based visual servoing methods that either require dense depth maps or dense point clouds, our method works well with partial point clouds and can effectively handle larger transformations between the reference and the target positions. Furthermore, the use of spectral data (instead of spatial data) for transformation estimation makes our method robust to sensor-induced noise and partial occlusions. We validate our approach by performing experiments using point clouds acquired by a robot-mounted depth camera. Obtained results demonstrate the effectiveness of our visual servoing approach.}, bibtype = {article}, author = {Adjigble, Maxime and Tamadazte, Brahim and de Farias, Cristiana and Stolkin, Rustam and Marturi, Naresh} }
@article{ title = {Evaluate Geometry of Radiance Field with Low-frequency Color Prior}, type = {article}, year = {2023}, websites = {http://arxiv.org/abs/2304.04351}, id = {2993599e-9859-3b83-8936-d71f252f2aa8}, created = {2023-05-03T13:16:40.951Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.361Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Fang2023}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Radiance field is an effective representation of 3D scenes, which has been widely adopted in novel-view synthesis and 3D reconstruction. It is still an open and challenging problem to evaluate the geometry, i.e., the density field, as the ground-truth is almost impossible to be obtained. One alternative indirect solution is to transform the density field into a point-cloud and compute its Chamfer Distance with the scanned ground-truth. However, many widely-used datasets have no point-cloud ground-truth since the scanning process along with the equipment is expensive and complicated. To this end, we propose a novel metric, named Inverse Mean Residual Color (IMRC), which can evaluate the geometry only with the observation images. Our key insight is that the better the geometry is, the lower-frequency the computed color field is. From this insight, given reconstructed density field and the observation images, we design a closed-form method to approximate the color field with low-frequency spherical harmonics and compute the inverse mean residual color. Then the higher the IMRC, the better the geometry. Qualitative and quantitative experimental results verify the effectiveness of our proposed IMRC metric. We also benchmark several state-of-the-art methods using IMRC to promote future related research.}, bibtype = {article}, author = {Fang, Qihang and Song, Yafei and Li, Keqiang and Shen, Li and Wu, Huaiyu and Xiong, Gang and Bo, Liefeng} }
@article{ title = {MACARONS: Mapping And Coverage Anticipation with RGB Online Self-Supervision}, type = {article}, year = {2023}, pages = {940-951}, websites = {http://arxiv.org/abs/2303.03315}, id = {3fbfd8cc-0cb7-3712-830a-5a9b02b198f0}, created = {2023-06-22T10:06:22.408Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-06-22T10:06:44.120Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {cbcad584-0c50-48fe-a0d7-5b4c781cba83}, private_publication = {false}, abstract = {We introduce a method that simultaneously learns to explore new large environments and to reconstruct them in 3D from color images only. This is closely related to the Next Best View problem (NBV), where one has to identify where to move the camera next to improve the coverage of an unknown scene. However, most of the current NBV methods rely on depth sensors, need 3D supervision and/or do not scale to large scenes. Our method requires only a color camera and no 3D supervision. It simultaneously learns in a self-supervised fashion to predict a "volume occupancy field" from color images and, from this field, to predict the NBV. Thanks to this approach, our method performs well on new scenes as it is not biased towards any training 3D data. We demonstrate this on a recent dataset made of various 3D scenes and show it performs even better than recent methods requiring a depth sensor, which is not a realistic assumption for outdoor scenes captured with a flying drone.}, bibtype = {article}, author = {Guédon, Antoine and Monnier, Tom and Monasse, Pascal and Lepetit, Vincent} }
@article{ title = {Change detection of urban objects using 3D point clouds: A review}, type = {article}, year = {2023}, keywords = {Applications,Change detection,Point clouds,Urban objects}, pages = {228-255}, volume = {197}, websites = {https://doi.org/10.1016/j.isprsjprs.2023.01.010}, publisher = {Elsevier B.V.}, id = {918a225a-8193-3641-872c-67f3fb8feac9}, created = {2023-06-22T10:06:22.411Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-06-22T10:06:55.898Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {cbcad584-0c50-48fe-a0d7-5b4c781cba83}, private_publication = {false}, abstract = {Over recent decades, 3D point clouds have been a popular data source applied in automatic change detection in a wide variety of applications. Compared with 2D images, using 3D point clouds for change detection can provide an alternative solution offering different modalities and enabling a highly detailed 3D geometric and attribute analysis. This article provides a comprehensive review of point-cloud-based 3D change detection for urban objects. Specifically, in this study, we had two primary aims: (i) to ascertain the critical techniques in change detection, as well as their strengths and weaknesses, including data registration, variance estimation, and change analysis; (ii) to contextualize the up-to-date uses of point clouds in change detection and to explore representative applications of land cover and land use monitoring, vegetation surveys, construction automation, building and indoor investigations, and traffic and transportation monitoring. A workflow following the PRISMA 2020 rules was applied for the search and selection of reviewed articles, with a brief statistical analysis of the selected articles. Additionally, we examined the limitations of current change detection technology and discussed current research gaps between state-of-the-art techniques and engineering demands. Several remaining issues, such as the reliability of datasets, uncertainty in results, and contribution of semantics in change detection, have been identified and discussed. Ultimately, this review sheds light on prospective research directions to meet the urgent needs of anticipated applications.}, bibtype = {article}, author = {Stilla, Uwe and Xu, Yusheng}, doi = {10.1016/j.isprsjprs.2023.01.010}, journal = {ISPRS Journal of Photogrammetry and Remote Sensing}, number = {February} }
@article{ title = {Make-It-3D: High-Fidelity 3D Creation from A Single Image with Diffusion Prior}, type = {article}, year = {2023}, websites = {http://arxiv.org/abs/2303.14184}, id = {b38659d8-07cd-3bee-a402-4775596c3600}, created = {2023-06-22T10:06:22.417Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-06-22T10:06:50.529Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {cbcad584-0c50-48fe-a0d7-5b4c781cba83}, private_publication = {false}, abstract = {In this work, we investigate the problem of creating high-fidelity 3D content from only a single image. This is inherently challenging: it essentially involves estimating the underlying 3D geometry while simultaneously hallucinating unseen textures. To address this challenge, we leverage prior knowledge from a well-trained 2D diffusion model to act as 3D-aware supervision for 3D creation. Our approach, Make-It-3D, employs a two-stage optimization pipeline: the first stage optimizes a neural radiance field by incorporating constraints from the reference image at the frontal view and diffusion prior at novel views; the second stage transforms the coarse model into textured point clouds and further elevates the realism with diffusion prior while leveraging the high-quality textures from the reference image. Extensive experiments demonstrate that our method outperforms prior works by a large margin, resulting in faithful reconstructions and impressive visual quality. Our method presents the first attempt to achieve high-quality 3D creation from a single image for general objects and enables various applications such as text-to-3D creation and texture editing.}, bibtype = {article}, author = {Tang, Junshu and Wang, Tengfei and Zhang, Bo and Zhang, Ting and Yi, Ran and Ma, Lizhuang and Chen, Dong} }
@article{ title = {A Rapid Water Region Reconstruction Scheme in 3D Watershed Scene Generated by UAV Oblique Photography}, type = {article}, year = {2023}, keywords = {3D reconstruction,oblique photography,real-scene 3D environment,twin watershed,water region}, pages = {1-19}, volume = {15}, id = {d12bc583-1847-3672-a356-a2443ee891e3}, created = {2023-06-22T10:06:22.562Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-06-22T10:07:04.855Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {cbcad584-0c50-48fe-a0d7-5b4c781cba83}, private_publication = {false}, abstract = {Oblique photography technology based on UAV (unmanned aerial vehicle) provides an effective means for the rapid, real-scene 3D reconstruction of geographical objects on a watershed scale. However, existing research cannot achieve the automatic and high-precision reconstruction of water regions due to the sensitivity of water surface patterns to wind and waves, reflections of objects on the shore, etc. To solve this problem, a novel rapid reconstruction scheme for water regions in 3D models of oblique photography is proposed in this paper. It extracts the boundaries of water regions firstly using a designed eight-neighborhood traversal algorithm, and then reconstructs the triangulated irregular network (TIN) of water regions. Afterwards, the corresponding texture images of water regions are intelligently selected and processed using a designed method based on coordinate matching, image stitching and clipping. Finally, the processed texture images are mapped to the obtained TIN, and the real information about water regions can be reconstructed, visualized and integrated into the original real-scene 3D environment. Experimental results have shown that the proposed scheme can rapidly and accurately reconstruct water regions in 3D models of oblique photography. The outcome of this work can refine the current technical system of 3D modeling by UAV oblique photography and expand its application in the construction of twin watershed, twin city, etc.}, bibtype = {article}, author = {Qiu, Yinguo and Jiao, Yaqin and Luo, Juhua and Tan, Zhenyu and Huang, Linsheng and Zhao, Jinling and Xiao, Qitao and Duan, Hongtao}, doi = {10.3390/rs15051211}, journal = {Remote Sensing}, number = {5} }
@article{ title = {An FPGA smart camera implementation of segmentation models for drone wildfire imagery}, type = {article}, year = {2023}, websites = {http://arxiv.org/abs/2309.01318}, month = {9}, day = {3}, id = {5bb35d83-61db-35bf-9393-8f6d34449d6e}, created = {2023-11-07T10:04:20.036Z}, file_attached = {true}, profile_id = {78e67dcc-28e6-3300-a4ed-85434b13f01f}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-12-06T13:18:33.393Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, notes = {Use Vitis AI for DL image processing<br/>FPGA used : Xilinx Ultra96-V2}, folder_uuids = {1bffc8fa-4e6e-48c8-b694-323af7fbd0e8}, private_publication = {false}, abstract = {Wildfires represent one of the most relevant natural disasters worldwide, due to their impact on various societal and environmental levels. Thus, a significant amount of research has been carried out to investigate and apply computer vision techniques to address this problem. One of the most promising approaches for wildfire fighting is the use of drones equipped with visible and infrared cameras for the detection, monitoring, and fire spread assessment in a remote manner but in close proximity to the affected areas. However, implementing effective computer vision algorithms on board is often prohibitive since deploying full-precision deep learning models running on GPU is not a viable option, due to their high power consumption and the limited payload a drone can handle. Thus, in this work, we posit that smart cameras, based on low-power consumption field-programmable gate arrays (FPGAs), in tandem with binarized neural networks (BNNs), represent a cost-effective alternative for implementing onboard computing on the edge. Herein we present the implementation of a segmentation model applied to the Corsican Fire Database. We optimized an existing U-Net model for such a task and ported the model to an edge device (a Xilinx Ultra96-v2 FPGA). By pruning and quantizing the original model, we reduce the number of parameters by 90%. Furthermore, additional optimizations enabled us to increase the throughput of the original model from 8 frames per second (FPS) to 33.63 FPS without loss in the segmentation performance: our model obtained 0.912 in Matthews correlation coefficient (MCC),0.915 in F1 score and 0.870 in Hafiane quality index (HAF), and comparable qualitative segmentation results when contrasted to the original full-precision model. The final model was integrated into a low-cost FPGA, which was used to implement a neural network accelerator.}, bibtype = {article}, author = {Guarduño-Martinez, Eduardo and Ciprian-Sanchez, Jorge and Valente, Gerardo and Vazquez-Garcia, undefined and Rodriguez-Hernandez, Gerardo and Palacios-Rosas, Adriana and Rossi-Tisson, Lucile and Ochoa-Ruiz, Gilberto} }
@inproceedings{ title = {Convolutional Neural Networks on the Edge: A Comparison Between FPGA and GPU}, type = {inproceedings}, year = {2023}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, id = {e0fd97ea-4e2b-36c4-9f56-0fb0e3190e79}, created = {2023-11-07T10:04:43.974Z}, file_attached = {true}, profile_id = {78e67dcc-28e6-3300-a4ed-85434b13f01f}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-12-06T13:16:30.682Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {1bffc8fa-4e6e-48c8-b694-323af7fbd0e8}, private_publication = {false}, abstract = {With more computation tasks being pushed towards the edge, it becomes increasingly challenging to decide the right hardware approach for running edge AI tasks. Besides customized AI accelerators, existing hardware platforms such as FPGAs have been employed to support edge computing. Recently, small GPUs have also been developed to run AI tasks that are tailored for edge. While each platform exhibits its own advantages, the shortcomings are also identified. In this work, we present a study where a head to head comparison is conducted by running the same CNN on two popular edge platforms, FPGAs and GPUs. We compare multiple dimensions such as power, inference speed, ease of development and accuracy. The comparison results can provide an initial guidance for edge computing developers who don't have immediate access to both platforms.}, bibtype = {inproceedings}, author = {Wei, Yichen and Gong, Siyi and Mei, Hongfei and Shi, Longxing and Guo, Xinfei}, doi = {10.1109/CSTIC58779.2023.10219304}, booktitle = {2023 China Semiconductor Technology International Conference, CSTIC 2023} }
@article{ title = {Benchmarking edge computing devices for grape bunches and trunks detection using accelerated object detection single shot multibox deep learning models}, type = {article}, year = {2023}, keywords = {Embedded systems,Heterogeneous platforms,Object detection,RetinaNet resNet,SSD resNet}, volume = {117}, month = {1}, publisher = {Elsevier Ltd}, day = {1}, id = {a252719c-4909-323a-bc20-37001c166cf5}, created = {2023-11-16T13:12:41.041Z}, file_attached = {true}, profile_id = {78e67dcc-28e6-3300-a4ed-85434b13f01f}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2024-02-07T11:37:17.518Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, notes = {Benchmarking on GPU FPGA devices<br/>Tested on Vine grapes images - measure FPS<br/><br/>Materials Methods:<br/>DL was built and trained in <b>TensorFlow 2.8 Keras</b>.<br/><b>TF-TRT</b> NVIDIA library that operates with TensorFlow and TensorRT (TRT)<br/><b>Vitis-AI</b>: quantised model INT8 weights and converted to a readable <b>DPU format</b><br/><br/><b>RetinaNet ResNet-50</b> archs used}, folder_uuids = {1bffc8fa-4e6e-48c8-b694-323af7fbd0e8}, private_publication = {false}, abstract = {Purpose: Visual perception enables robots to perceive the environment. Visual data is processed using computer vision algorithms that are usually time-expensive and require powerful devices to process the visual data in real-time, which is unfeasible for open-field robots with limited energy. This work benchmarks the performance of different heterogeneous platforms for object detection in real-time. This research benchmarks three architectures: embedded GPU—Graphical Processing Units (such as NVIDIA Jetson Nano 2GB and 4GB, and NVIDIA Jetson TX2), TPU—Tensor Processing Unit (such as Coral Dev Board TPU), and DPU—Deep Learning Processor Unit (such as in AMD/Xilinx ZCU104 Development Board, and AMD/Xilinx Kria KV260 Starter Kit). Methods: The authors used the RetinaNet ResNet-50 fine-tuned using the natural VineSet dataset. After the trained model was converted and compiled for target-specific hardware formats to improve the execution efficiency. Conclusions and Results: The platforms were assessed in terms of performance of the evaluation metrics and efficiency (time of inference). Graphical Processing Units (GPUs) were the slowest devices, running at 3FPS to 5FPS, and Field Programmable Gate Arrays (FPGAs) were the fastest devices, running at 14FPS to 25FPS. The efficiency of the Tensor Processing Unit (TPU) is irrelevant and similar to NVIDIA Jetson TX2. TPU and GPU are the most power-efficient, consuming about 5W. The performance differences, in the evaluation metrics, across devices are irrelevant and have an F1 of about 70% and mean Average Precision (mAP) of about 60%.}, bibtype = {article}, author = {Magalhães, Sandro Costa and dos Santos, Filipe Neves and Machado, Pedro and Moreira, António Paulo and Dias, Jorge}, doi = {10.1016/j.engappai.2022.105604}, journal = {Engineering Applications of Artificial Intelligence} }
@article{ title = {Graph-based deep learning for communication networks: A survey}, type = {article}, year = {2022}, keywords = {Communication network,Deep learning,Graph,Graph Neural Network,Software Defined Networking}, pages = {40-54}, volume = {185}, id = {00ef2aa4-8ad1-380b-9827-56c04267fc2c}, created = {2022-03-01T12:39:37.991Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-01T12:39:41.915Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {84eaadea-8864-4baf-9a7a-b5a2f5b96449}, private_publication = {false}, abstract = {Communication networks are important infrastructures in contemporary society. There are still many challenges that are not fully solved and new solutions are proposed continuously in this active research area. In recent years, to model the network topology, graph-based deep learning has achieved the state-of-the-art performance in a series of problems in communication networks. In this survey, we review the rapidly growing body of research using different graph-based deep learning models, e.g. graph convolutional and graph attention networks, in various problems from different types of communication networks, e.g. wireless networks, wired networks, and software defined networks. We also present a well-organized list of the problem and solution for each study and identify future research directions. To the best of our knowledge, this paper is the first survey that focuses on the application of graph-based deep learning methods in communication networks involving both wired and wireless scenarios. To track the follow-up research, a public GitHub repository is created, where the relevant papers will be updated continuously.}, bibtype = {article}, author = {Jiang, Weiwei}, doi = {10.1016/j.comcom.2021.12.015}, journal = {Computer Communications} }
@article{ title = {Overhead Reduction for Graph-Based Point Cloud Delivery Using Non-Uniform Quantization}, type = {article}, year = {2022}, id = {13089d7b-32dc-3ac3-b429-8838b67f77fd}, created = {2022-03-02T07:02:50.194Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-02T07:03:04.305Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {84eaadea-8864-4baf-9a7a-b5a2f5b96449}, private_publication = {false}, bibtype = {article}, author = {{Mitsubishi Electric}} }
@article{ title = {BottleFit: Learning Compressed Representations in Deep Neural Networks for Effective and Efficient Split Computing}, type = {article}, year = {2022}, id = {6777c76c-e9a9-3352-8502-8a24e7d4133f}, created = {2022-03-04T09:00:49.998Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-09T07:18:42.939Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {84eaadea-8864-4baf-9a7a-b5a2f5b96449,a6e140dd-a959-4148-9ed8-b29d0c7966c6}, private_publication = {false}, bibtype = {article}, author = {Callegaro, Davide and Levorato, Marco} }
@article{ title = {Unsupervised Learning on 3D Point Clouds by Clustering and Contrasting}, type = {article}, year = {2022}, pages = {1-11}, volume = {14}, websites = {http://arxiv.org/abs/2202.02543}, id = {2a89ccc6-008a-3a8d-94ec-567f6ce83b06}, created = {2022-03-10T14:04:37.039Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-18T10:02:58.606Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {1e7b477c-c241-48c3-a542-ad06e3d39dd5,5041aa39-a3cf-45bd-ada3-df1401e124f1}, private_publication = {false}, abstract = {Learning from unlabeled or partially labeled data to alleviate human labeling remains a challenging research topic in 3D modeling. Along this line, unsupervised representation learning is a promising direction to auto-extract features without human intervention. This paper proposes a general unsupervised approach, named \textbfConClu, to perform the learning of point-wise and global features by jointly leveraging point-level clustering and instance-level contrasting. Specifically, for one thing, we design an Expectation-Maximization (EM) like soft clustering algorithm that provides local supervision to extract discriminating local features based on optimal transport. We show that this criterion extends standard cross-entropy minimization to an optimal transport problem, which we solve efficiently using a fast variant of the Sinkhorn-Knopp algorithm. For another, we provide an instance-level contrasting method to learn the global geometry, which is formulated by maximizing the similarity between two augmentations of one point cloud. Experimental evaluations on downstream applications such as 3D object classification and semantic segmentation demonstrate the effectiveness of our framework and show that it can outperform state-of-the-art techniques.}, bibtype = {article}, author = {Mei, Guofeng and Yu, Litao and Wu, Qiang and Zhang, Jian and Bennamoun, Mohammed}, number = {8} }
@article{ title = {Geometric Transformer for Fast and Robust Point Cloud Registration}, type = {article}, year = {2022}, websites = {https://arxiv.org/abs/2202.06688v2}, month = {2}, day = {14}, id = {d3d1576f-8b97-3f6c-ab4f-614194058b2e}, created = {2022-03-28T07:19:05.800Z}, accessed = {2022-03-28}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-31T06:33:43.733Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {We study the problem of extracting accurate correspondences for point cloud registration. Recent keypoint-free methods bypass the detection of repeatable keypoints which is difficult in low-overlap scenarios, showing great potential in registration. They seek correspondences over downsampled superpoints, which are then propagated to dense points. Superpoints are matched based on whether their neighboring patches overlap. Such sparse and loose matching requires contextual features capturing the geometric structure of the point clouds. We propose Geometric Transformer to learn geometric feature for robust superpoint matching. It encodes pair-wise distances and triplet-wise angles, making it robust in low-overlap cases and invariant to rigid transformation. The simplistic design attains surprisingly high matching accuracy such that no RANSAC is required in the estimation of alignment transformation, leading to $100$ times acceleration. Our method improves the inlier ratio by $17\sim30$ percentage points and the registration recall by over $7$ points on the challenging 3DLoMatch benchmark. Our code and models are available at \urlhttps://github.com/qinzheng93/GeoTransformer.}, bibtype = {article}, author = {Qin, Zheng and Yu, Hao and Wang, Changjian and Guo, Yulan and Peng, Yuxing and Xu, Kai}, doi = {10.48550/arxiv.2202.06688} }
@article{ title = {Anytime 3D Object Reconstruction Using Multi-Modal Variational Autoencoder}, type = {article}, year = {2022}, keywords = {3D object reconstruction,Decoding,Estimation,Real-time systems,Shape,Three-dimensional displays,Training,Visualization,anytime algorithm,data imputation,multi-modal variational autoencoder}, pages = {2162-2169}, volume = {7}, month = {4}, id = {c0654fc0-7760-378e-98bf-49a4a3500953}, created = {2022-03-28T09:45:04.459Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:07:46.713Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {yuAnytime3DObject2022}, source_type = {article}, notes = {Conference Name: IEEE Robotics and Automation Letters}, private_publication = {false}, abstract = {For effective human-robot teaming, it is important for the robots to be able to share their visual perception with the human operators. In a harsh remote collaboration setting, data compression techniques such as autoencoder can be utilized to obtain and transmit the data in terms of latent variables in a compact form. In addition, to ensure real-time runtime performance even under unstable environments, an anytime estimation approach is desired that can reconstruct the full contents from incomplete information. In this context, we propose a method for imputation of latent variables whose elements are partially lost. To achieve the anytime property with only a few dimensions of variables, exploiting prior information of the category-level is essential. A prior distribution used in variational autoencoders is simply assumed to be isotropic Gaussian regardless of the labels of each training datapoint. This type of flattened prior makes it difficult to perform imputation from the category-level distributions. We overcome this limitation by exploiting a category-specific multi-modal prior distribution in the latent space. The missing elements of the partially transferred data can be sampled, by finding a specific modal according to the remaining elements. Since the method is designed to use partial elements for anytime estimation, it can also be applied for data over-compression. Based on the experiments on the ModelNet and Pascal3D datasets, the proposed approach shows consistently superior performance over autoencoder and variational autoencoder up to 70\% data loss. The software is open source and is available from our repository1.}, bibtype = {article}, author = {Yu, Hyeonwoo and Oh, Jean}, doi = {10.1109/LRA.2022.3142439}, journal = {IEEE Robotics and Automation Letters}, number = {2} }
@article{ title = {Hierarchical Graph-Convolutional Variational AutoEncoding for Generative Modelling of Human Motion}, type = {article}, year = {2022}, keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Re,Computer Science - Machine Learning,Mathematics - Probability}, websites = {http://arxiv.org/abs/2111.12602}, month = {1}, id = {fdf7fca7-30dc-3a22-8fd2-54432c3635a5}, created = {2022-03-28T09:45:06.188Z}, accessed = {2022-03-22}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-01T09:16:25.004Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {bourachedHierarchicalGraphConvolutionalVariational2022}, source_type = {article}, notes = {arXiv: 2111.12602}, private_publication = {false}, abstract = {Models of human motion commonly focus either on trajectory prediction or action classification but rarely both. The marked heterogeneity and intricate compositionality of human motion render each task vulnerable to the data degradation and distributional shift common to real-world scenarios. A sufficiently expressive generative model of action could in theory enable data conditioning and distributional resilience within a unified framework applicable to both tasks. Here we propose a novel architecture based on hierarchical variational autoencoders and deep graph convolutional neural networks for generating a holistic model of action over multiple time-scales. We show this Hierarchical Graph-convolutional Variational Autoencoder (HG-VAE) to be capable of generating coherent actions, detecting out-of-distribution data, and imputing missing data by gradient ascent on the model's posterior. Trained and evaluated on H3.6M and the largest collection of open source human motion data, AMASS, we show HG-VAE can facilitate downstream discriminative learning better than baseline models.}, bibtype = {article}, author = {Bourached, Anthony and Gray, Robert and Griffiths, Ryan-Rhys and Jha, Ashwani and Nachev, Parashkev}, journal = {arXiv:2111.12602 [cs, math]} }
@article{ title = {A proposal of edge detection in images with multiplicative noise using the Ant Colony System algorithm}, type = {article}, year = {2022}, keywords = {Ant Colony System,Coefficient of variation,Edge detection,Multiplicative noise}, pages = {104715}, volume = {110}, websites = {https://doi.org/10.1016/j.engappai.2022.104715}, publisher = {Elsevier Ltd}, id = {6d2fa72f-c12c-3608-b7ba-d85c1d5b9fe9}, created = {2022-04-05T05:35:07.814Z}, file_attached = {false}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-05T05:35:07.814Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Multiplicative noise is one of the most aggressive types of noise present in various types of images: Synthetic Aperture Radar, Ultrasound images, Ultrasonic Imaging, among others. Edges detectors such as Canny or Sobel are not very efficient for processing an image with multiplicative noise, they also require filtering algorithms, a preprocessing, which is why bio-inspired algorithms are an alternative for processing images with the presence of multiplicative noise, due to its efficiency in finding an approximate solution. This article proposes a method for the edges detection in images with multiplicative noise using the Ant Colony System algorithm. For which we must adapt the Ant Colony System algorithm to detect contours, this we define the calculation of a global pheromone matrix between several edge detection equations, gradient, and the coefficient of variation, these equations are compared for their edge detection performance using a visual inspection and a performance function. The results of the experiments show a correct implementation of the algorithm proposed to the images with multiplicative noise even for high noise levels.}, bibtype = {article}, author = {Baltierra, Sergio and Valdebenito, Jonathan and Mora, Marco}, doi = {10.1016/j.engappai.2022.104715}, journal = {Engineering Applications of Artificial Intelligence}, number = {February} }
@article{ title = {Deep Architectures for Image Compression: A Critical Review}, type = {article}, year = {2022}, keywords = {CNN,DNN,Deep learning,Image compression,Review,Survey}, pages = {108346}, volume = {191}, websites = {https://doi.org/10.1016/j.sigpro.2021.108346}, publisher = {Elsevier B.V.}, id = {69c71055-7c3d-3584-9266-0b4d85a31944}, created = {2022-04-05T05:35:07.934Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-07T06:10:54.609Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d44c1c58-0149-4360-9eaf-8e2a9b657b50}, private_publication = {false}, abstract = {Deep learning architectures are now pervasive and filled almost all applications under image processing, computer vision, and biometrics. The attractive property of feature extraction of CNN has solved a lot of conventional image processing problems with much-improved performance & efficiency. The paper aimed to review over a hundred recent state-of-the-art techniques exploiting mostly lossy image compression using deep learning architectures. These deep learning algorithms consists of various architectures like CNN, RNN, GAN, autoencoders and variational autoencoders. We have classified all the algorithms under certain categories for the better and deep understanding. The review is written keeping in mind the contributions of researchers & the challenges faced by them. Various findings for the researchers along with some future directions for a new researcher have been significantly highlighted. Most of the papers reviewed in the compression domain are from the last four years using different methodologies. The review has been summarized by dropping a new outlook for researchers in the realm of image compression.}, bibtype = {article}, author = {Mishra, Dipti and Singh, Satish Kumar and Singh, Rajat Kumar}, doi = {10.1016/j.sigpro.2021.108346}, journal = {Signal Processing} }
@article{ title = {A comprehensive survey of clustering algorithms: State-of-the-art machine learning applications, taxonomy, challenges, and future research prospects}, type = {article}, year = {2022}, keywords = {Automatic clustering,Clustering,Clustering algorithms, partitioning,Data mining,Hierarchical clustering,K-Means,Optimization algorithms, Machine learning,Supervised learning,Unsupervised learning}, pages = {104743}, volume = {110}, websites = {https://doi.org/10.1016/j.engappai.2022.104743}, publisher = {Elsevier Ltd}, id = {cebed498-d956-3357-8a0e-6cdd88e9fa4f}, created = {2022-04-05T05:35:07.964Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-07T06:10:54.809Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d44c1c58-0149-4360-9eaf-8e2a9b657b50}, private_publication = {false}, abstract = {Clustering is an essential tool in data mining research and applications. It is the subject of active research in many fields of study, such as computer science, data science, statistics, pattern recognition, artificial intelligence, and machine learning. Several clustering techniques have been proposed and implemented, and most of them successfully find excellent quality or optimal clustering results in the domains mentioned earlier. However, there has been a gradual shift in the choice of clustering methods among domain experts and practitioners alike, which is precipitated by the fact that most traditional clustering algorithms still depend on the number of clusters provided a priori. These conventional clustering algorithms cannot effectively handle real-world data clustering analysis problems where the number of clusters in data objects cannot be easily identified. Also, they cannot effectively manage problems where the optimal number of clusters for a high-dimensional dataset cannot be easily determined. Therefore, there is a need for improved, flexible, and efficient clustering techniques. Recently, a variety of efficient clustering algorithms have been proposed in the literature, and these algorithms produced good results when evaluated on real-world clustering problems. This study presents an up-to-date systematic and comprehensive review of traditional and state-of-the-art clustering techniques for different domains. This survey considers clustering from a more practical perspective. It shows the outstanding role of clustering in various disciplines, such as education, marketing, medicine, biology, and bioinformatics. It also discusses the application of clustering to different fields attracting intensive efforts among the scientific community, such as big data, artificial intelligence, and robotics. This survey paper will be beneficial for both practitioners and researchers. It will serve as a good reference point for researchers and practitioners to design improved and efficient state-of-the-art clustering algorithms.}, bibtype = {article}, author = {Ezugwu, Absalom E. and Ikotun, Abiodun M. and Oyelade, Olaide O. and Abualigah, Laith and Agushaka, Jeffery O. and Eke, Christopher I. and Akinyelu, Andronicus A.}, doi = {10.1016/j.engappai.2022.104743}, journal = {Engineering Applications of Artificial Intelligence}, number = {February} }
@article{ title = {IntroVAC: Introspective Variational Classifiers for learning interpretable latent subspaces}, type = {article}, year = {2022}, keywords = {Attribute manipulation,Autoencoders,Deep learning,Interpretability}, pages = {104658}, volume = {109}, websites = {https://doi.org/10.1016/j.engappai.2021.104658}, publisher = {Elsevier Ltd}, id = {8d07907f-4b5d-362f-bee4-61a1c7419a06}, created = {2022-04-05T05:35:08.068Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-05T05:35:31.615Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Learning useful representations of complex data has been the subject of extensive research for many years. In particular, with the diffusion of complex Deep Learning-based approaches in engineering applications, the possibility to interpret, to a certain degree, model predictions is of fundamental importance for both the model users and developers. In the context of Deep Neural Networks, Variational Autoencoders have gained lots of attention since they provide an explicit model of the data distribution based on an encoder/decoder architecture which is able to both generate images and encode them in a low-dimensional subspace. However, the latent space is not easily interpretable and the generation capabilities show some limitations since images typically look blurry and lack details. In this paper, we propose the Introspective Variational Classifier (IntroVAC), a model that learns interpretable latent subspaces by exploiting information from an additional label and provides improved image quality thanks to an adversarial training strategy. We show that IntroVAC is able to learn meaningful directions in the latent space enabling fine-grained manipulation of image attributes. We validated our approach on the CelebA dataset. When compared with standard Variational Autoencoder Classifiers, the proposed approach outperform them by achieving a Frechét Inception Distance of 25.5 versus a value of 63.9.}, bibtype = {article}, author = {Maggipinto, Marco and Terzi, Matteo and Susto, Gian Antonio}, doi = {10.1016/j.engappai.2021.104658}, journal = {Engineering Applications of Artificial Intelligence}, number = {April 2021} }
@article{ title = {A comprehensive survey on 3D face recognition methods}, type = {article}, year = {2022}, keywords = {3D face recognition,Deep learning,Expression,Occlusion,Pose,Survey}, pages = {104669}, volume = {110}, websites = {https://doi.org/10.1016/j.engappai.2022.104669}, publisher = {Elsevier Ltd}, id = {f6985205-3628-37d9-bcf7-bb3329bd323e}, created = {2022-04-05T05:35:08.081Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-05T05:35:37.818Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d44c1c58-0149-4360-9eaf-8e2a9b657b50}, private_publication = {false}, abstract = {3D face recognition (3DFR) has emerged as an effective means of characterizing facial identity over the past several decades. Depending on the types of techniques used in recognition, these methods are categorized into traditional and modern. The former generally extract distinctive facial features (e.g. global, local, and hybrid features) for matching, whereas the latter rely primarily on deep learning to perform 3DFR in an end-to-end way. Many literature surveys have been carried out reviewing either traditional or modern methods alone, while only a few studies are conducted simultaneously on both of them. This survey presents a state-of-the-art for 3DFR covering both traditional and modern methods, focusing on the techniques used in face processing, feature extraction, and classification. In addition, we review some specific face recognition challenges, including pose, illumination, expression variations, self-occlusion, and spoofing attack. The commonly used 3D face datasets have been summarized as well.}, bibtype = {article}, author = {Li, Menghan and Huang, Bin and Tian, Guohui}, doi = {10.1016/j.engappai.2022.104669}, journal = {Engineering Applications of Artificial Intelligence}, number = {October 2021} }
@article{ title = {A novel vision-based weakly supervised framework for autonomous yield estimation in agricultural applications}, type = {article}, year = {2022}, keywords = {Automatic yield estimation,Autonomous systems,Visual learning,Weakly-supervised learning}, pages = {104615}, volume = {109}, websites = {https://doi.org/10.1016/j.engappai.2021.104615}, publisher = {Elsevier Ltd}, id = {8d884a27-cef4-339e-952a-134a888f727f}, created = {2022-04-05T05:35:08.119Z}, file_attached = {false}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-05T05:35:08.119Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Autonomous systems have been established as a ground-breaking technology in agriculture, particularly for resource optimization and labor savings. However, even those solutions that are limited to monitoring activities, such as yield estimation, rely on costly robotic platforms equipped with a series of range devices (e.g., LIDAR and GPS-RTK). Recently, vision-based strategies have gained considerable attention as a less expensive and more efficient alternative, capable to be on par with or even surpass approaches that benefit from range sensors. Nonetheless, they exploit deep learning methodologies, which require burdensome labeling procedures to perform training. To address these shortcomings, we present a novel approach that performs yield estimation requiring only a monocular camera and needs a limited amount of supervision information. It detects, locates and maps fruits and tree canopies to estimate the total yield of a specific crop. To keep the image labeling effort to a minimum, we propose a weakly-supervision paradigm that only requires a simple binary label encoding the presence or the absence of fruits in the training images. Our approach does not make any assumptions on the underlying platform, i.e., it can be used by collecting images either with a hand-held camera or with an autonomous robot. Therefore, we are able to considerably reduce the deployment time, the energy and the cost of the overall yield estimation system. At the same time, we keep the performance comparable to both vision-based fully supervised baselines (which require costly labeling operations) and classical systems that rely on more expensive and power-demanding sensors.}, bibtype = {article}, author = {Bellocchio, Enrico and Crocetti, Francesco and Costante, Gabriele and Fravolini, Mario Luca and Valigi, Paolo}, doi = {10.1016/j.engappai.2021.104615}, journal = {Engineering Applications of Artificial Intelligence}, number = {April 2021} }
@article{ title = {DeltaConv: Anisotropic Geometric Deep Learning with Exterior Calculus}, type = {article}, year = {2022}, pages = {1-12}, volume = {1}, id = {bec52d02-416e-3ad6-9aa5-11ce0f2a5f22}, created = {2022-05-02T08:14:58.363Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-05-02T08:14:58.363Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Wiersma, Ruben}, number = {1} }
@article{ title = {OverlapNet: a siamese network for computing LiDAR scan similarity with applications to loop closing and localization}, type = {article}, year = {2022}, keywords = {Localization,Loop closing,SLAM}, pages = {61-81}, volume = {46}, websites = {https://doi.org/10.1007/s10514-021-09999-0}, publisher = {Springer US}, id = {6e49a62f-206c-329e-8b27-f8cac3de6f67}, created = {2022-07-05T12:32:33.765Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-07-18T15:28:25.353Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ed605795-f522-465c-a0b8-5f8a05f7fd5f}, private_publication = {false}, abstract = {Localization and mapping are key capabilities of autonomous systems. In this paper, we propose a modified Siamese network to estimate the similarity between pairs of LiDAR scans recorded by autonomous cars. This can be used to address both, loop closing for SLAM and global localization. Our approach utilizes a deep neural network exploiting different cues generated from LiDAR data. It estimates the similarity between pairs of scans using the concept of image overlap generalized to range images and furthermore provides a relative yaw angle estimate. Based on such predictions, our method is able to detect loop closures in a SLAM system or to globally localize in a given map. For loop closure detection, we use the overlap prediction as the similarity measurement to find loop closure candidates and integrate the candidate selection into an existing SLAM system to improve the mapping performance. For global localization, we propose a novel observation model using the predictions provided by OverlapNet and integrate it into a Monte-Carlo localization framework. We evaluate our approach on multiple datasets collected using different LiDAR scanners in various environments. The experimental results show that our method can effectively detect loop closures surpassing the detection performance of state-of-the-art methods and that it generalizes well to different environments. Furthermore, our method reliably localizes a vehicle in typical urban environments globally using LiDAR data collected in different seasons.}, bibtype = {article}, author = {Chen, Xieyuanli and Läbe, Thomas and Milioto, Andres and Röhling, Timo and Behley, Jens and Stachniss, Cyrill}, doi = {10.1007/s10514-021-09999-0}, journal = {Autonomous Robots}, number = {1} }
@book{ title = {Attention, please! A survey of neural attention models in deep learning}, type = {book}, year = {2022}, source = {Artificial Intelligence Review}, keywords = {Attention mechanism,Attention models,Deep learning,Neural networks,Survey}, issue = {0123456789}, websites = {https://doi.org/10.1007/s10462-022-10148-x}, publisher = {Springer Netherlands}, id = {f50a7c3c-e1d3-3cca-8cae-7eb7ab1008e6}, created = {2022-07-22T12:20:02.925Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-07-22T12:20:14.866Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, abstract = {In humans, Attention is a core property of all perceptual and cognitive operations. Given our limited ability to process competing sources, attention mechanisms select, modulate, and focus on the information most relevant to behavior. For decades, concepts and functions of attention have been studied in philosophy, psychology, neuroscience, and computing. For the last 6 years, this property has been widely explored in deep neural networks. Currently, the state-of-the-art in Deep Learning is represented by neural attention models in several application domains. This survey provides a comprehensive overview and analysis of developments in neural attention models. We systematically reviewed hundreds of architectures in the area, identifying and discussing those in which attention has shown a significant impact. We also developed and made public an automated methodology to facilitate the development of reviews in the area. By critically analyzing 650 works, we describe the primary uses of attention in convolutional, recurrent networks, and generative models, identifying common subgroups of uses and applications. Furthermore, we describe the impact of attention in different application domains and their impact on neural networks’ interpretability. Finally, we list possible trends and opportunities for further research, hoping that this review will provide a succinct overview of the main attentional models in the area and guide researchers in developing future approaches that will drive further improvements.}, bibtype = {book}, author = {de Santana Correia, Alana and Colombini, Esther Luna}, doi = {10.1007/s10462-022-10148-x} }
@book{ title = {Machine Learning in Drug Discovery: A Review}, type = {book}, year = {2022}, source = {Artificial Intelligence Review}, keywords = {Artificial intelligence,Digital pathology,Drug discovery,Machine learning,Prognostic biomarkers,Target validation}, pages = {1947-1999}, volume = {55}, issue = {3}, websites = {https://doi.org/10.1007/s10462-021-10058-4}, publisher = {Springer Netherlands}, id = {897e9a5e-ffe1-306e-9b62-d457de4b87ff}, created = {2022-07-22T12:20:02.931Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-07-22T12:20:12.962Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, abstract = {This review provides the feasible literature on drug discovery through ML tools and techniques that are enforced in every phase of drug development to accelerate the research process and deduce the risk and expenditure in clinical trials. Machine learning techniques improve the decision-making in pharmaceutical data across various applications like QSAR analysis, hit discoveries, de novo drug architectures to retrieve accurate outcomes. Target validation, prognostic biomarkers, digital pathology are considered under problem statements in this review. ML challenges must be applicable for the main cause of inadequacy in interpretability outcomes that may restrict the applications in drug discovery. In clinical trials, absolute and methodological data must be generated to tackle many puzzles in validating ML techniques, improving decision-making, promoting awareness in ML approaches, and deducing risk failures in drug discovery.}, bibtype = {book}, author = {Dara, Suresh and Dhamercherla, Swetha and Jadav, Surender Singh and Babu, Ch Madhu and Ahsan, Mohamed Jawed}, doi = {10.1007/s10462-021-10058-4} }
@article{ title = {3D CAD model retrieval based on sketch and unsupervised variational autoencoder}, type = {article}, year = {2022}, keywords = {Deep learning,Model retrieval,Sketch,Structural semantics,Unsupervised learning}, pages = {101427}, volume = {51}, websites = {https://doi.org/10.1016/j.aei.2021.101427}, publisher = {Elsevier Ltd}, id = {2a058129-e787-38c5-b3b9-18766e8711ea}, created = {2022-07-22T12:20:03.047Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-07-22T12:20:17.324Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, abstract = {How to quickly, accurately retrieve and effectively reuse 3D CAD models that conform to user's design intention has become an urgent problem in product design. However, there are several problems with the existing retrieval methods, like not being fast, or accurate, or hard to use. Hence it is difficult to meet the actual needs of the industry. In this paper, we propose a 3D CAD model retrieval approach that considers the speed, accuracy and ease of use at the same time, based on sketches and unsupervised learning. Firstly, the loop is used as the fundamental element of sketch/view, and automatic structural semantics capture algorithms are proposed to extract and construct attributed loop relation tree; Secondly, the recursive neural network based deep variational autoencoders is constructed and optimized to transform arbitrary shapes and sizes of loop relation tree into fixed length descriptor; Finally, based on the fixed length vector descriptor, the sketches and views of 3D CAD models are embedded into the same target feature space, and k-nearest neighbors algorithm is adopted to conduct fast CAD model matching on the feature space. In this manner, a prototype 3D CAD model retrieval system is developed. Experiments on the dataset containing about two thousand 3D CAD models validate the feasibility and effectiveness of the proposed approach.}, bibtype = {article}, author = {Qin, Feiwei and Qiu, Shi and Gao, Shuming and Bai, Jing}, doi = {10.1016/j.aei.2021.101427}, journal = {Advanced Engineering Informatics}, number = {August 2021} }
@article{ title = {Rotation-Invariant Point Cloud Representation for 3-D Model Recognition}, type = {article}, year = {2022}, keywords = {3-D point cloud,Convolutional neural networks,Data models,Group theory,Harmonic analysis,Point cloud compression,Robot kinematics,Solid modeling,Task analysis,rotation invariant,three-dimensional (3-D) model recognition}, pages = {1-9}, publisher = {IEEE}, id = {52a3dcdf-e7e1-3442-b996-cb7aa4528d17}, created = {2022-07-28T12:39:24.664Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-07-28T12:39:41.938Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {353ce2e2-5e70-48e5-951f-78dc31fa40d2}, private_publication = {false}, abstract = {Three-dimensional (3-D) data have many applications in the field of computer vision and a point cloud is one of the most popular modalities. Therefore, how to establish a good representation for a point cloud is a core issue in computer vision, especially for 3-D object recognition tasks. Existing approaches mainly focus on the invariance of representation under the group of permutations. However, for point cloud data, it should also be rotation invariant. To address such invariance, in this article, we introduce a relation of equivalence under the action of rotation group, through which the representation of point cloud is located in a homogeneous space. That is, two point clouds are regarded as equivalent when they are only different from a rotation. Our network is flexibly incorporated into existing frameworks for point clouds, which guarantees the proposed approach to be rotation invariant. Besides, a sufficient analysis on how to parameterize the group SO(3) into a convolutional network, which captures a relation with all rotations in 3-D Euclidean space R³. We select the optimal rotation as the best representation of point cloud and propose a solution for minimizing the problem on the rotation group SO(3) by using its geometric structure. To validate the rotation invariance, we combine it with two existing deep models and evaluate them on ModelNet40 dataset and its subset ModelNet10. Experimental results indicate that the proposed strategy improves the performance of those existing deep models when the data involve arbitrary rotations.}, bibtype = {article}, author = {Wang, Yan and Zhao, Yining and Ying, Shihui and Du, Shaoyi and Gao, Yue}, doi = {10.1109/TCYB.2022.3157593}, journal = {IEEE Transactions on Cybernetics} }
@article{ title = {Rotation invariant point cloud analysis: Where local geometry meets global topology}, type = {article}, year = {2022}, keywords = {Classification,Deep learning,Point cloud analysis,Rotation invariance,Segmentation}, pages = {108626}, volume = {127}, websites = {https://doi.org/10.1016/j.patcog.2022.108626}, publisher = {Elsevier Ltd}, id = {b0417028-8e58-3572-90e5-b6604cf6d59b}, created = {2022-07-28T12:39:24.696Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-07-28T12:39:30.952Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {353ce2e2-5e70-48e5-951f-78dc31fa40d2}, private_publication = {false}, abstract = {Point cloud analysis is a fundamental task in 3D computer vision. Most previous works have conducted experiments on synthetic datasets with well-aligned data; while real-world point clouds are often not pre-aligned. How to achieve rotation invariance remains an open problem in point cloud analysis. To meet this challenge, we propose an approach toward achieving rotation-invariant (RI) representations by combining local geometry with global topology. In our local-global-representation (LGR)-Net, we have designed a two-branch network where one stream encodes local geometric RI features and the other encodes global topology-preserving RI features. Motivated by the observation that local geometry and global topology have different yet complementary RI responses in varying regions, two-branch RI features are fused by an innovative multi-layer perceptron (MLP) based attention module. To the best of our knowledge, this work is the first principled approach toward adaptively combining global and local information under the context of RI point cloud analysis. Extensive experiments have demonstrated that our LGR-Net achieves the state-of-the-art performance on various rotation-augmented versions of ModelNet40, ShapeNet, ScanObjectNN, and S3DIS.}, bibtype = {article}, author = {Zhao, Chen and Yang, Jiaqi and Xiong, Xin and Zhu, Angfan and Cao, Zhiguo and Li, Xin}, doi = {10.1016/j.patcog.2022.108626}, journal = {Pattern Recognition} }
@article{ title = {AGNet: An Attention-Based Graph Network for Point Cloud Classification and Segmentation}, type = {article}, year = {2022}, keywords = {3D point clouds,Geometric features,Graph attention mechanism,Neural network,Shape analysis}, pages = {1-18}, volume = {14}, id = {1917ee21-b0d6-3556-ada4-6c4ebb122ad4}, created = {2022-08-18T10:53:48.603Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T11:25:08.564Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ed28e2a4-1a3c-4903-8492-0d356dde91f1,244f8db2-6bd4-47d9-8abf-425a263fd4d1}, private_publication = {false}, abstract = {Classification and segmentation of point clouds have attracted increasing attention in recent years. On the one hand, it is difficult to extract local features with geometric information. On the other hand, how to select more important features correctly also brings challenges to the research. Therefore, the main challenge in classifying and segmenting the point clouds is how to locate the attentional region. To tackle this challenge, we propose a graph-based neural network with an attention pooling strategy (AGNet). In particular, local feature information can be extracted by constructing a topological structure. Compared to existing methods, AGNet can better extract the spatial information with different distances, and the attentional pooling strategy is capable of selecting the most important features of the topological structure. Therefore, our model can aggregate more information to better represent different point cloud features. We conducted extensive experiments on challenging benchmark datasets including ModelNet40 for object classification, as well as ShapeNet Part and S3DIS for segmentation. Both the quantitative and qualitative experiments demonstrated a consistent advantage for the tasks of point set classification and segmentation.}, bibtype = {article}, author = {Jing, Weipeng and Zhang, Wenjun and Li, Linhui and Di, Donglin and Chen, Guangsheng and Wang, Jian}, doi = {10.3390/rs14041036}, journal = {Remote Sensing}, number = {4} }
@article{ title = {Enhancing Local Feature Learning Using Diffusion for 3D Point Cloud Understanding}, type = {article}, year = {2022}, websites = {http://arxiv.org/abs/2207.01174}, id = {153d090e-04a7-36bb-9386-4d3e5be12c16}, created = {2022-08-18T10:53:48.905Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:54:09.149Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Learning point clouds is challenging due to the lack of connectivity information, i.e., edges. Although existing edge-aware methods can improve the performance by modeling edges, how edges contribute to the improvement is unclear. In this study, we propose a method that automatically learns to enhance/suppress edges while keeping its working mechanism clear. First, we theoretically figure out how edge enhancement/suppression works. Second, we experimentally verify the edge enhancement/suppression behavior. Third, we empirically show that this behavior improves performance. In general, we observe that the proposed method achieves competitive performance in point cloud classification and segmentation tasks.}, bibtype = {article}, author = {Xiu, Haoyi and Liu, Xin and Wang, Weimin and Kim, Kyoung-Sook and Shinohara, Takayuki and Chang, Qiong and Matsuoka, Masashi} }
@article{ title = {Point3D: tracking actions as moving points with 3D CNNs}, type = {article}, year = {2022}, pages = {1-14}, websites = {http://arxiv.org/abs/2203.10584}, id = {a85ba577-590d-38ac-95c5-8f0302f58651}, created = {2022-08-18T10:53:48.940Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:54:11.420Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Spatio-temporal action recognition has been a challenging task that involves detecting where and when actions occur. Current state-of-the-art action detectors are mostly anchor-based, requiring sensitive anchor designs and huge computations due to calculating large numbers of anchor boxes. Motivated by nascent anchor-free approaches, we propose Point3D, a flexible and computationally efficient network with high precision for spatio-temporal action recognition. Our Point3D consists of a Point Head for action localization and a 3D Head for action classification. Firstly, Point Head is used to track center points and knot key points of humans to localize the bounding box of an action. These location features are then piped into a time-wise attention to learn long-range dependencies across frames. The 3D Head is later deployed for the final action classification. Our Point3D achieves state-of-the-art performance on the JHMDB, UCF101-24, and AVA benchmarks in terms of frame-mAP and video-mAP. Comprehensive ablation studies also demonstrate the effectiveness of each module proposed in our Point3D.}, bibtype = {article}, author = {Mo, Shentong and Xia, Jingfei and Tan, Xiaoqing and Raj, Bhiksha} }
@article{ title = {LPF-Defense: 3D Adversarial Defense based on Frequency Analysis}, type = {article}, year = {2022}, pages = {1-19}, websites = {http://arxiv.org/abs/2202.11287}, id = {7c999778-74c5-32eb-a8ca-b59c9be711e2}, created = {2022-08-29T14:18:24.788Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-04-24T15:41:56.349Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Naderi2022}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143,f4d8f1ef-fdcb-4a5b-a626-6e2fea47fb6d,ed28e2a4-1a3c-4903-8492-0d356dde91f1,244f8db2-6bd4-47d9-8abf-425a263fd4d1,8c0c38c1-fb54-417d-a224-97fc3b1afba9}, private_publication = {false}, abstract = {Although 3D point cloud classification has recently been widely deployed in different application scenarios, it is still very vulnerable to adversarial attacks. This increases the importance of robust training of 3D models in the face of adversarial attacks. Based on our analysis on the performance of existing adversarial attacks, more adversarial perturbations are found in the mid and high-frequency components of input data. Therefore, by suppressing the high-frequency content in the training phase, the model's robustness against adversarial examples is improved. Experiments showed that the proposed defense method decreases the success rate of six attacks on PointNet, PointNet++, and DGCNN models. In particular, improvements are achieved with an average increase of classification accuracy by 3.8% on the drop100 attack and 4.26% on the drop200 attack compared to the state-of-the-art methods. The method also improves model accuracy on the original dataset compared to other available methods.}, bibtype = {article}, author = {Naderi, Hanieh and Etemadi, Arian and Noorbakhsh, Kimia and Kasaei, Shohreh} }
@article{ title = {NormalAttack: Curvature-Aware Shape Deformation along Normals for Imperceptible Point Cloud Attack}, type = {article}, year = {2022}, volume = {2022}, id = {b5e88401-a7d8-3857-ab33-03c70da738ee}, created = {2022-08-29T14:18:24.794Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-12-05T14:13:52.483Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {db36ed60-3b58-424a-b9a4-a9c7322975f3,8c0c38c1-fb54-417d-a224-97fc3b1afba9}, private_publication = {false}, bibtype = {article}, author = {Tang, Keke and Shi, Yawen and Wu, Jianpeng and Peng, Weilong and Khan, Asad and Zhu, Peican and Gu, Zhaoquan}, journal = {Security and Communication Networks} }
@article{ title = {Point Cloud Attacks in Graph Spectral Domain: When 3D Geometry Meets Graph Signal Processing}, type = {article}, year = {2022}, pages = {1-15}, volume = {14}, websites = {http://arxiv.org/abs/2207.13326}, id = {c85aeb37-a718-321d-a3e8-3f172ae14d5e}, created = {2022-08-29T14:18:24.873Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-10-03T13:31:11.008Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {8f8f5505-1a28-42dd-a82c-92b5738465f1,ed28e2a4-1a3c-4903-8492-0d356dde91f1,244f8db2-6bd4-47d9-8abf-425a263fd4d1,8c0c38c1-fb54-417d-a224-97fc3b1afba9}, private_publication = {false}, abstract = {With the increasing attention in various 3D safety-critical applications, point cloud learning models have been shown to be vulnerable to adversarial attacks. Although existing 3D attack methods achieve high success rates, they delve into the data space with point-wise perturbation, which may neglect the geometric characteristics. Instead, we propose point cloud attacks from a new perspective -- the graph spectral domain attack, aiming to perturb graph transform coefficients in the spectral domain that corresponds to varying certain geometric structure. Specifically, leveraging on graph signal processing, we first adaptively transform the coordinates of points onto the spectral domain via graph Fourier transform (GFT) for compact representation. Then, we analyze the influence of different spectral bands on the geometric structure, based on which we propose to perturb the GFT coefficients via a learnable graph spectral filter. Considering the low-frequency components mainly contribute to the rough shape of the 3D object, we further introduce a low-frequency constraint to limit perturbations within imperceptible high-frequency components. Finally, the adversarial point cloud is generated by transforming the perturbed spectral representation back to the data domain via the inverse GFT. Experimental results demonstrate the effectiveness of the proposed attack in terms of both the imperceptibility and attack success rates.}, bibtype = {article}, author = {Liu, Daizong and Hu, Wei and Li, Xin}, number = {8} }
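A rough Python sketch of the graph-spectral machinery described in the abstract above (this is an editor's illustration, not the authors' code: the k-NN construction, Gaussian edge weights, and the name gft_of_point_cloud are assumptions made for the example):

import numpy as np
from scipy.spatial import cKDTree

def gft_of_point_cloud(points, k=10):
    # Build a symmetric k-NN graph with Gaussian edge weights, form the
    # combinatorial Laplacian L = D - W, and project the xyz coordinates
    # onto its eigenbasis (the graph Fourier transform of the point cloud).
    n = len(points)
    dist, idx = cKDTree(points).query(points, k=k + 1)  # first neighbor is the point itself
    W = np.zeros((n, n))
    for i in range(n):
        for j, d in zip(idx[i, 1:], dist[i, 1:]):
            W[i, j] = W[j, i] = max(W[i, j], np.exp(-d ** 2))
    L = np.diag(W.sum(axis=1)) - W
    eigvals, eigvecs = np.linalg.eigh(L)
    coeffs = eigvecs.T @ points        # GFT of each coordinate channel
    return eigvals, eigvecs, coeffs    # perturb coeffs, then eigvecs @ coeffs inverts the GFT

An attack in this style would perturb coeffs (e.g., only in selected frequency bands) and map the result back to coordinates with eigvecs @ coeffs.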
@article{ title = {Geometry-Aware Generation of Adversarial Point Clouds}, type = {article}, year = {2022}, pages = {2984-2999}, volume = {44}, publisher = {IEEE}, id = {e716209c-61e4-3a3d-9639-e873ebe3e1c1}, created = {2022-09-01T14:14:15.402Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-01T14:14:28.435Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Wen, Yuxin and Lin, Jiehong and Chen, Ke and Chen, C L Philip and Jia, Kui}, number = {6} }
@article{ title = {ART-Point: Improving Rotation Robustness of Point Cloud Classifiers via Adversarial Rotation}, type = {article}, year = {2022}, pages = {14371-14380}, id = {b6832215-ca93-34aa-a2db-e7a42d4a253e}, created = {2022-09-01T14:14:15.404Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T11:25:04.100Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {db36ed60-3b58-424a-b9a4-a9c7322975f3}, private_publication = {false}, bibtype = {article}, author = {Wang, Ruibin and Yang, Yibo and Tao, Dacheng} }
@article{ title = {SCONE: Surface Coverage Optimization in Unknown Environments by Volumetric Integration}, type = {article}, year = {2022}, pages = {1-24}, id = {18e7f89a-67ab-3d2c-9e94-93b86fc79929}, created = {2022-09-06T11:37:21.049Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-04-24T15:41:56.546Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Guedon2022}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143,f4d8f1ef-fdcb-4a5b-a626-6e2fea47fb6d,2a64087a-cd32-494a-8140-2abf0b1356c6,40587c46-f934-43d3-a07b-43b15ce182ed}, private_publication = {false}, bibtype = {article}, author = {Guédon, Antoine and Monasse, Pascal and Lepetit, Vincent} }
@article{ title = {Boosting 3D Adversarial Attacks With Attacking on Frequency}, type = {article}, year = {2022}, pages = {50974-50984}, volume = {10}, publisher = {IEEE}, id = {bac6be16-2ba7-3d69-8bc2-1dd7b7da7131}, created = {2022-09-06T14:02:41.606Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T11:25:07.708Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {db36ed60-3b58-424a-b9a4-a9c7322975f3,8c0c38c1-fb54-417d-a224-97fc3b1afba9}, private_publication = {false}, bibtype = {article}, author = {Liu, Binbin and Zhang, Jinlai and Zhu, Jihong}, doi = {10.1109/ACCESS.2022.3171659}, journal = {IEEE Access} }
@article{ title = {Comprehensive Review of Deep Learning-Based 3D Point Cloud Completion Processing and Analysis}, type = {article}, year = {2022}, pages = {1-22}, id = {a06ac853-087d-3f56-80f6-099da9844268}, created = {2022-09-13T13:57:40.428Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-26T08:55:23.175Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {f4318b9a-a9ed-4f3f-8c48-09937099013b}, private_publication = {false}, abstract = {Point cloud completion is a generation and estimation issue derived from the partial point clouds, which plays a vital role in the applications in 3D computer vision. The progress of deep learning (DL) has impressively improved the capability and robustness of point cloud completion. However, the quality of completed point clouds is still needed to be further enhanced to meet the practical utilization. Therefore, this work aims to conduct a comprehensive survey on various methods, including point-based, convolution-based, graph-based, and generative model-based approaches, etc. And this survey summarizes the comparisons among these methods to provoke further research insights. Besides, this review sums up the commonly used datasets and illustrates the applications of point cloud completion. Eventually, we also discussed possible research trends in this promptly expanding field.}, bibtype = {article}, author = {Fei, Ben and Yang, Weidong and Chen, Wen-Ming and Li, Zhijun and Li, Yikang and Ma, Tao and Hu, Xing and Ma, Lipeng}, doi = {10.1109/tits.2022.3195555}, journal = {IEEE Transactions on Intelligent Transportation Systems} }
@article{ title = {PUFA-GAN: A Frequency-Aware Generative Adversarial Network for 3D Point Cloud Upsampling}, type = {article}, year = {2022}, pages = {1-13}, websites = {http://arxiv.org/abs/2203.00914}, id = {63dd1f06-4b7e-3d34-807d-d4fcdbd3b90a}, created = {2022-09-13T13:57:40.429Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-19T07:34:03.912Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {f4318b9a-a9ed-4f3f-8c48-09937099013b,b6d75013-efe2-4ddc-b3db-65496bd4db9f}, private_publication = {false}, abstract = {We propose a generative adversarial network for point cloud upsampling, which can not only make the upsampled points evenly distributed on the underlying surface but also efficiently generate clean high frequency regions. The generator of our network includes a dynamic graph hierarchical residual aggregation unit and a hierarchical residual aggregation unit for point feature extraction and upsampling, respectively. The former extracts multiscale point-wise descriptive features, while the latter captures rich feature details with hierarchical residuals. To generate neat edges, our discriminator uses a graph filter to extract and retain high frequency points. The generated high resolution point cloud and corresponding high frequency points help the discriminator learn the global and high frequency properties of the point cloud. We also propose an identity distribution loss function to make sure that the upsampled points remain on the underlying surface of the input low resolution point cloud. To assess the regularity of the upsampled points in high frequency regions, we introduce two evaluation metrics. Objective and subjective results demonstrate that the visual quality of the upsampled point clouds generated by our method is better than that of the state-of-the-art methods.}, bibtype = {article}, author = {Liu, Hao and Yuan, Hui and Hou, Junhui and Hamzaoui, Raouf and Gao, Wei} }
@article{ title = {PU-Refiner: A Geometry Refiner with Adversarial Learning for Point Cloud Upsampling}, type = {article}, year = {2022}, pages = {2270-2274}, publisher = {IEEE}, id = {6b8b41dd-12b5-3f22-8ec5-7cc4295bce61}, created = {2022-09-19T07:34:03.592Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-19T07:34:15.018Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {f4318b9a-a9ed-4f3f-8c48-09937099013b,b6d75013-efe2-4ddc-b3db-65496bd4db9f}, private_publication = {false}, bibtype = {article}, author = {Liu, Hao and Yuan, Hui and Hamzaoui, Raouf and Gao, Wei and Li, Shuai} }
@article{ title = {Geometry-Aware Generation of Adversarial Point Clouds}, type = {article}, year = {2022}, keywords = {Adversarial example,object surface geometry,point cloud}, pages = {2984-2999}, volume = {44}, id = {08e418b9-ab8c-3a58-a370-77839efc1d9f}, created = {2022-10-10T13:41:15.050Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-10-12T07:53:07.477Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {8410cff5-f764-42b2-a9b5-ead8d2dee5c8,8c0c38c1-fb54-417d-a224-97fc3b1afba9}, private_publication = {false}, abstract = {Machine learning models have been shown to be vulnerable to adversarial examples. While most of the existing methods for adversarial attack and defense work on the 2D image domain, a few recent attempts have been made to extend them to 3D point cloud data. However, adversarial results obtained by these methods typically contain point outliers, which are both noticeable and easy to defend against using the simple techniques of outlier removal. Motivated by the different mechanisms by which humans perceive 2D images and 3D shapes, in this paper we propose the new design of geometry-aware objectives, whose solutions favor (the discrete versions of) the desired surface properties of smoothness and fairness. To generate adversarial point clouds, we use a targeted attack misclassification loss that supports continuous pursuit of increasingly malicious signals. Regularizing the targeted attack loss with our proposed geometry-aware objectives results in our proposed method, Geometry-Aware Adversarial Attack (GeoA3). The results of GeoA3 tend to be more harmful, arguably harder to defend against, and of the key adversarial characterization of being imperceptible to humans. While the main focus of this paper is to learn to generate adversarial point clouds, we also present a simple but effective algorithm termed Geo+A3-IterNormPro, with Iterative Normal Projection (IterNormPro) that solves a new objective function Geo+A3, towards surface-level adversarial attacks via generation of adversarial point clouds. We quantitatively evaluate our methods on both synthetic and physical objects in terms of attack success rate and geometric regularity. For a qualitative evaluation, we conduct subjective studies by collecting human preferences from Amazon Mechanical Turk. Comparative results in comprehensive experiments confirm the advantages of our proposed methods. Our source codes are publicly available at https://github.com/Yuxin-Wen/GeoA3.}, bibtype = {article}, author = {Wen, Yuxin and Lin, Jiehong and Chen, Ke and Chen, C. L.Philip and Jia, Kui}, doi = {10.1109/TPAMI.2020.3044712}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, number = {6} }
@article{ title = {Robust Object Classification Approach Using Spherical Harmonics}, type = {article}, year = {2022}, keywords = {Object recognition,point cloud classification,robust classification,spherical harmonics}, pages = {21541-21553}, volume = {10}, publisher = {IEEE}, id = {167d8e3b-4b47-3207-9c15-b458b3231c03}, created = {2023-04-24T07:38:01.276Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-04-24T15:41:54.977Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Mukhaimar2022}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143,f4d8f1ef-fdcb-4a5b-a626-6e2fea47fb6d}, private_publication = {false}, abstract = {Point clouds produced by either 3D scanners or multi-view images are often imperfect and contain noise or outliers. This paper presents an end-to-end robust spherical harmonics approach to classifying 3D objects. The proposed framework first uses the voxel grid of concentric spheres to learn features over the unit ball. We then limit the spherical harmonics order level to suppress the effect of noise and outliers. In addition, the entire classification operation is performed in the Fourier domain. As a result, our proposed model learned features that are less sensitive to data perturbations and corruptions. We tested our proposed model against several types of data perturbations and corruptions, such as noise and outliers. Our results show that the proposed model has fewer parameters, competes with state-of-art networks in terms of robustness to data inaccuracies, and is faster than other robust methods. Our implementation code is also publicly available at https://github.com/AymanMukh/R-SCNN}, bibtype = {article}, author = {Mukhaimar, Ayman and Tennakoon, Ruwan and Lai, Chow Yin and Hoseinnezhad, Reza and Bab-Hadiashar, Alireza}, doi = {10.1109/ACCESS.2022.3151350}, journal = {IEEE Access} }
@article{ title = {Rotation-Invariant Point Cloud Representation for 3-D Model Recognition}, type = {article}, year = {2022}, keywords = {3-D point cloud,Group theory,rotation invariant,three-dimensional (3-D) model recognition}, pages = {10948-10956}, volume = {52}, publisher = {IEEE}, id = {097a15cb-e368-3d47-9484-63a1f1cd03de}, created = {2023-05-03T13:16:39.125Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:27.228Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Wang2022}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Three-dimensional (3-D) data have many applications in the field of computer vision and a point cloud is one of the most popular modalities. Therefore, how to establish a good representation for a point cloud is a core issue in computer vision, especially for 3-D object recognition tasks. Existing approaches mainly focus on the invariance of representation under the group of permutations. However, for point cloud data, it should also be rotation invariant. To address such invariance, in this article, we introduce a relation of equivalence under the action of rotation group, through which the representation of point cloud is located in a homogeneous space. That is, two point clouds are regarded as equivalent when they are only different from a rotation. Our network is flexibly incorporated into existing frameworks for point clouds, which guarantees the proposed approach to be rotation invariant. Besides, a sufficient analysis on how to parameterize the group SO(3) into a convolutional network, which captures a relation with all rotations in 3-D Euclidean space R3. We select the optimal rotation as the best representation of point cloud and propose a solution for minimizing the problem on the rotation group SO(3) by using its geometric structure. To validate the rotation invariance, we combine it with two existing deep models and evaluate them on ModelNet40 dataset and its subset ModelNet10. Experimental results indicate that the proposed strategy improves the performance of those existing deep models when the data involve arbitrary rotations.}, bibtype = {article}, author = {Wang, Yan and Zhao, Yining and Ying, Shihui and Du, Shaoyi and Gao, Yue}, doi = {10.1109/TCYB.2022.3157593}, journal = {IEEE Transactions on Cybernetics}, number = {10} }
@article{ title = {Z2P: Instant Visualization of Point Clouds}, type = {article}, year = {2022}, pages = {461-471}, volume = {41}, id = {e5bdbade-8158-3922-aa66-ff7167b0b785}, created = {2023-05-03T13:16:39.335Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-15T08:10:14.544Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Metzer2022}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {We present a technique for visualizing point clouds using a neural network. Our technique allows for an instant preview of any point cloud, and bypasses the notoriously difficult surface reconstruction problem or the need to estimate oriented normals for splat-based rendering. We cast the preview problem as a conditional image-to-image translation task, and design a neural network that translates point depth-map directly into an image, where the point cloud is visualized as though a surface was reconstructed from it. Furthermore, the resulting appearance of the visualized point cloud can be, optionally, conditioned on simple control variables (e.g., color and light). We demonstrate that our technique instantly produces plausible images, and can, on-the-fly effectively handle noise, non-uniform sampling, and thin surfaces sheets.}, bibtype = {article}, author = {Metzer, G. and Hanocka, R. and Giryes, R. and Mitra, N. J. and Cohen-Or, D.}, doi = {10.1111/cgf.14487}, journal = {Computer Graphics Forum}, number = {2} }
@article{ title = {Using Spherical Harmonics for Navigating in Dynamic and Uncertain Environments}, type = {article}, year = {2022}, keywords = {Collision Avoidance,Path Planning,Spherical Harmonics}, pages = {567-572}, volume = {55}, websites = {https://doi.org/10.1016/j.ifacol.2022.11.243}, publisher = {Elsevier Ltd}, id = {964195eb-adfe-3d05-84bd-5ed94c9e37ef}, created = {2023-05-03T13:16:39.459Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-15T08:10:14.720Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Patrick2022}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {In this paper, we present a novel approach for local motion planning in an unknown environment populated by static and dynamic obstacles. A key component of our planner is the use of harmonic basis functions defined over the unit sphere, known as spherical harmonics (SH), to represent a collision-free space near an agent. In contrast with our previous approach, which only applies to unknown static environments, the approach proposed herein handles dynamic obstacles as well. It is done by efficiently calculating the coefficients of a spherical harmonics approximation of the local collision-free space for each step of a given planning horizon. Our method for approximating the collision-free space with spherical harmonics allows us to plan trajectories in challenging environments (e.g., planning through narrow passages) where other methods for generating search-spaces fail. To accurately approximate the collision-free space for future time steps along a planning horizon, we use a Kalman Filter (KF) to propagate the measured point cloud through time. Our use of the KF allows us to better approximate obstacles moving in space in comparison to inaccurate constant obstacle velocity assumptions used by other methods. Finally, we show the effectiveness of our proposed trajectory planner using two different non-trivial scenarios.}, bibtype = {article}, author = {Patrick, Steven D. and Bakolas, Efstathios}, doi = {10.1016/j.ifacol.2022.11.243}, journal = {IFAC-PapersOnLine}, number = {37} }
@article{ title = {Equivalence Between SE(3) Equivariant Networks via Steerable Kernels and Group Convolution}, type = {article}, year = {2022}, pages = {1-23}, websites = {http://arxiv.org/abs/2211.15903}, id = {498d0053-3454-37ed-9bfa-d0916f795def}, created = {2023-05-03T13:16:39.505Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.901Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Poulenard2022}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {A wide range of techniques have been proposed in recent years for designing neural networks for 3D data that are equivariant under rotation and translation of the input. Most approaches for equivariance under the Euclidean group $\mathrm{SE}(3)$ of rotations and translations fall within one of the two major categories. The first category consists of methods that use $\mathrm{SE}(3)$-convolution which generalizes classical $\mathbb{R}^3$-convolution on signals over $\mathrm{SE}(3)$. Alternatively, it is possible to use \textit{steerable} convolution which achieves $\mathrm{SE}(3)$-equivariance by imposing constraints on $\mathbb{R}^3$-convolution of tensor fields. It is known by specialists in the field that the two approaches are equivalent, with steerable convolution being the Fourier transform of $\mathrm{SE}(3)$ convolution. Unfortunately, these results are not widely known and moreover the exact relations between deep learning architectures built upon these two approaches have not been precisely described in the literature on equivariant deep learning. In this work we provide an in-depth analysis of both methods and their equivalence and relate the two constructions to multiview convolutional networks. Furthermore, we provide theoretical justifications of separability of $\mathrm{SE}(3)$ group convolution, which explain the applicability and success of some recent approaches. Finally, we express different methods using a single coherent formalism and provide explicit formulas that relate the kernels learned by different methods. In this way, our work helps to unify different previously-proposed techniques for achieving roto-translational equivariance, and helps to shed light on both the utility and precise differences between various alternatives. We also derive new TFN non-linearities from our equivalence principle and test them on practical benchmark datasets.}, bibtype = {article}, author = {Poulenard, Adrien and Ovsjanikov, Maks and Guibas, Leonidas J.}, number = {3} }
@article{ title = {Solid waste shape description and generation based on spherical harmonics and probability density function}, type = {article}, year = {2022}, keywords = {DEM,probability function,shape description,shape generation,solid waste particle,spherical harmonics}, id = {338d6a9c-ff7f-34cd-a87f-9b1db9b6507e}, created = {2023-05-03T13:16:39.646Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.721Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Li2022}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, bibtype = {article}, author = {Li, Yifeng and Qin, Xunpeng and Zhang, Zhenyuan and Dong, Huanyu}, doi = {10.1177/0734242X211045003} }
@article{ title = {PSE-Match: A Viewpoint-Free Place Recognition Method With Parallel Semantic Embedding}, type = {article}, year = {2022}, keywords = {3D place recognition,divergence learning,global localization,place feature learning,semantic embedding}, pages = {11249-11260}, volume = {23}, publisher = {IEEE}, id = {ec37ddb7-e054-30e1-819e-dfa56d29fb92}, created = {2023-05-03T13:16:39.811Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.192Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Yin2022}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Accurate localization on the autonomous driving cars is essential for autonomy and driving safety, especially for complex urban streets and search-and-rescue subterranean environments where high-accurate GPS is not available. However current odometry estimation may introduce the drifting problems in long-term navigation without robust global localization. The main challenges involve scene divergence under the interference of dynamic environments and effective perception of observation and object layout variance from different viewpoints. To tackle these challenges, we present PSE-Match, a viewpoint-free place recognition method based on parallel semantic analysis of isolated semantic attributes from 3D point-cloud models. Compared with the original point cloud, the observed variance of semantic attributes is smaller. PSE-Match incorporates a divergence place learning network to capture different semantic attributes parallelly through the spherical harmonics domain. Using both existing benchmark datasets and two in-field collected datasets, our experiments show that the proposed method achieves above 70% average recall with top one retrieval and above 95% average recall with top ten retrieval cases. And PSE-Match has also demonstrated an obvious generalization ability with limited training dataset.}, bibtype = {article}, author = {Yin, Peng and Xu, Lingyun and Feng, Ziyue and Egorov, Anton and Li, Bing}, doi = {10.1109/TITS.2021.3102429}, journal = {IEEE Transactions on Intelligent Transportation Systems}, number = {8} }
@article{ title = {Spherical harmonics to quantify cranial asymmetry in deformational plagiocephaly}, type = {article}, year = {2022}, pages = {1-10}, volume = {12}, websites = {https://doi.org/10.1038/s41598-021-04181-z}, publisher = {Nature Publishing Group UK}, id = {62fd082a-1da8-3167-b5ce-bebed3dec3b4}, created = {2023-05-03T13:16:39.843Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-15T08:10:14.858Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Grieb2022}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Cranial deformation and deformational plagiocephaly (DP) in particular affect an important percentage of infants. The assessment and diagnosis of the deformation are commonly carried by manual measurements that provide low interuser accuracy. Another approach is the use of three-dimensional (3D) models. Nevertheless, in most cases, deformation measurements are carried out manually on the 3D model. It is necessary to develop methodologies for the detection of DP that are automatic, accurate and take profit on the high quantity of information of the 3D models. Spherical harmonics are proposed as a new methodology to identify DP from head 3D models. The ideal fitted ellipsoid for each head is computed and the orthogonal distances between head and ellipsoid are obtained. Finally, the distances are modelled using spherical harmonics. Spherical harmonic coefficients of degree 2 and order − 2 are identified as the correct ones to represent the asymmetry characteristic of DP. The obtained coefficient is compared to other anthropometric deformation indexes, such as Asymmetry Index, Oblique Cranial Length Ratio, Posterior Asymmetry Index and Anterior Asymmetry Index. The coefficient of degree 2 and order − 2 with a maximum degree of 4 is found to provide better results than the commonly computed anthropometric indexes in the detection of DP.}, bibtype = {article}, author = {Grieb, Jonas and Barbero-García, Inés and Lerma, José Luis}, doi = {10.1038/s41598-021-04181-z}, journal = {Scientific Reports}, number = {1} }
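To make the asymmetry coefficient described above concrete, the sketch below is an editor's illustration, not the authors' implementation: the helper names real_sh and fit_sh_coefficients are assumptions, scipy's complex spherical harmonics are converted to the real basis, and the sampling directions are taken as given.

import numpy as np
from scipy.special import sph_harm

def real_sh(m, l, theta, phi):
    # Real spherical harmonic Y_{l,m} built from scipy's complex sph_harm
    # (scipy convention: theta = azimuth in [0, 2*pi), phi = colatitude in [0, pi]).
    if m > 0:
        return np.sqrt(2.0) * (-1) ** m * sph_harm(m, l, theta, phi).real
    if m < 0:
        return np.sqrt(2.0) * (-1) ** m * sph_harm(-m, l, theta, phi).imag
    return sph_harm(0, l, theta, phi).real

def fit_sh_coefficients(distances, theta, phi, l_max=4):
    # Least-squares fit of the sampled head-to-ellipsoid distances onto the
    # real SH basis up to degree l_max; the entry for (l=2, m=-2) is the
    # asymmetry measure discussed in the abstract above.
    basis = np.column_stack([real_sh(m, l, theta, phi)
                             for l in range(l_max + 1)
                             for m in range(-l, l + 1)])
    coeffs, *_ = np.linalg.lstsq(basis, distances, rcond=None)
    return coeffs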
@article{ title = {NPBG++: Accelerating Neural Point-Based Graphics}, type = {article}, year = {2022}, keywords = {3D from multi-view and sensors,Deep learning architectures and techniques,Image and video synthesis and generation,Machine learning,Representation learning,Scene analysis and understanding}, pages = {15948-15958}, volume = {2022-June}, id = {63f9f918-2d68-376d-a9f2-119e312b86d9}, created = {2023-05-03T13:16:39.849Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.102Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Rakhimov2022}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {We present a new system (NPBG++) for the novel view synthesis (NVS) task that achieves high rendering realism with low scene fitting time. Our method efficiently leverages the multiview observations and the point cloud of a static scene to predict a neural descriptor for each point, improving upon the pipeline of Neural Point-Based Graphics [1] in several important ways. By predicting the descriptors with a single pass through the source images, we lift the requirement of per-scene optimization while also making the neural descriptors view-dependent and more suitable for scenes with strong non-Lambertian effects. In our comparisons, the proposed system outperforms previous NVS approaches in terms of fitting and rendering runtimes while producing images of similar quality. Project page: https://rakhimovv.github.io/npbgpp/.}, bibtype = {article}, author = {Rakhimov, Ruslan and Ardelean, Andrei Timotei and Lempitsky, Victor and Burnaev, Evgeny}, doi = {10.1109/CVPR52688.2022.01550}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Equivariant Point Cloud Analysis via Learning Orientations for Message Passing}, type = {article}, year = {2022}, keywords = {Pose estimation and tracking,Scene analysis and understanding}, pages = {18910-18919}, volume = {2022-June}, id = {3184c9cf-2c09-3926-9ebe-615b1bafdff0}, created = {2023-05-03T13:16:40.002Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-15T08:10:14.311Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Luo2022}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Equivariance has been a long-standing concern in various fields ranging from computer vision to physical modeling. Most previous methods struggle with generality, simplicity, and expressiveness - some are designed ad hoc for specific data types, some are too complex to be accessible, and some sacrifice flexible transformations. In this work, we propose a novel and simple framework to achieve equivariance for point cloud analysis based on the message passing (graph neural network) scheme. We find the equivariant property could be obtained by introducing an orientation for each point to decouple the relative position for each point from the global pose of the entire point cloud. Therefore, we extend current message passing networks with a module that learns orientations for each point. Before aggregating information from the neighbors of a point, the networks transforms the neighbors' coordinates based on the point's learned orientations. We provide formal proofs to show the equivariance of the proposed framework. Empirically, we demonstrate that our proposed method is competitive on both point cloud analysis and physical modeling tasks. Code is available at https://github.com/luost26/Equivariant-OrientedMP.}, bibtype = {article}, author = {Luo, Shitong and Li, Jiahan and Guan, Jiaqi and Su, Yufeng and Cheng, Chaoran and Peng, Jian and Ma, Jianzhu}, doi = {10.1109/CVPR52688.2022.01836}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@book{ title = {Differentiable Point-Based Radiance Fields for Efficient View Synthesis}, type = {book}, year = {2022}, source = {Proceedings - SIGGRAPH Asia 2022 Conference Papers}, keywords = {Image-based Rendering,Neural Rendering,Novel View Synthesis}, volume = {41}, issue = {4}, publisher = {Association for Computing Machinery}, id = {d2e9c6df-4f0d-38ee-8689-1f7ebe8e7cd1}, created = {2023-05-03T13:16:40.146Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:25.937Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Zhang2022}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {We propose a differentiable rendering algorithm for efficient novel view synthesis. By departing from volume-based representations in favor of a learned point representation, we improve on existing methods more than an order of magnitude in memory and runtime, both in training and inference. The method begins with a uniformly-sampled random point cloud and learns per-point position and view-dependent appearance, using a differentiable splat-based renderer to train the model to reproduce a set of input training images with the given pose. Our method is up to 300 × faster than NeRF in both training and inference, with only a marginal sacrifice in quality, while using less than 10 MB of memory for a static scene. For dynamic scenes, our method trains two orders of magnitude faster than STNeRF and renders at a near interactive rate, while maintaining high image quality and temporal coherence even without imposing any temporal-coherency regularizers.}, bibtype = {book}, author = {Zhang, Qiang and Baek, Seung Hwan and Rusinkiewicz, Szymon and Heide, Felix}, doi = {10.1145/3550469.3555413} }
@article{ title = {Fast Sequence-Matching Enhanced Viewpoint-Invariant Place Recognition}, type = {article}, year = {2022}, pages = {2127-2135}, volume = {69}, publisher = {IEEE}, id = {6f3ac59d-a5f3-3e31-b370-0fb651da4b00}, created = {2023-05-03T13:16:40.155Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:25.887Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Recognition2022}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, bibtype = {article}, author = {Yin, Peng and Wang, Fuying and Egorov, Anton and Hou, Jiafan and Jia, Zhenzhong}, number = {2} }
@article{ title = {Surface Eigenvalues with Lattice-Based Approximation In comparison with analytical solution}, type = {article}, year = {2022}, pages = {1-28}, websites = {http://arxiv.org/abs/2203.03603}, id = {682e2e10-3322-3077-b4a9-7d6d352cb4bc}, created = {2023-05-03T13:16:40.694Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:25.267Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Wu2022}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {In this paper, we propose a meshless method of computing eigenvalues and eigenfunctions of a given surface embedded in $\mathbb R^3$. We use point cloud data as input and generate the lattice approximation for some neighborhood of the surface. We compute the eigenvalues and eigenvectors of the cubic lattice graph as an approximation of the eigenvalues and eigenfunctions of the Laplace-Beltrami operator on the surface. We perform extensive numerical experiments on surfaces with various topology and compare our computed eigenvalues from point cloud surface with exact solutions and standard finite element methods using triangle mesh.}, bibtype = {article}, author = {Wu, Yingying and Wu, Tianqi and Yau, Shing-Tung} }
@article{ title = {e3nn: Euclidean Neural Networks}, type = {article}, year = {2022}, pages = {1-22}, websites = {http://arxiv.org/abs/2207.09453}, id = {8a3cb5fc-c9d3-31d3-82d6-8f0c4f0325fe}, created = {2023-05-03T13:16:41.030Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.339Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Geiger2022}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {We present e3nn, a generalized framework for creating E(3) equivariant trainable functions, also known as Euclidean neural networks. e3nn naturally operates on geometry and geometric tensors that describe systems in 3D and transform predictably under a change of coordinate system. The core of e3nn are equivariant operations such as the TensorProduct class or the spherical harmonics functions that can be composed to create more complex modules such as convolutions and attention mechanisms. These core operations of e3nn can be used to efficiently articulate Tensor Field Networks, 3D Steerable CNNs, Clebsch-Gordan Networks, SE(3) Transformers and other E(3) equivariant networks.}, bibtype = {article}, author = {Geiger, Mario and Smidt, Tess}, number = {3} }
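The e3nn entry above names the library's two core equivariant primitives: the spherical harmonics functions and the TensorProduct class. The following minimal Python sketch shows how they are typically combined, assuming e3nn's o3 module API (o3.spherical_harmonics, o3.FullyConnectedTensorProduct); the batch size, irreps strings, and attribute construction are illustrative choices, not taken from the paper.

import torch
from e3nn import o3

# Real spherical harmonics of a batch of 3D vectors for degrees l = 0, 1, 2.
x = torch.randn(10, 3)
sh = o3.spherical_harmonics([0, 1, 2], x, normalize=True, normalization="component")
print(sh.shape)  # (10, 9): 1 + 3 + 5 components

# An equivariant bilinear layer built from the tensor-product machinery:
# node features with 8 scalars (0e) and 8 vectors (1o) are mixed with a
# scalar-plus-vector attribute to produce new 8x0e + 8x1o features.
tp = o3.FullyConnectedTensorProduct("8x0e + 8x1o", "0e + 1o", "8x0e + 8x1o")
features = torch.randn(10, tp.irreps_in1.dim)           # (10, 32)
edge_attr = torch.cat([torch.ones(10, 1), x], dim=-1)   # (10, 4)
out = tp(features, edge_attr)
print(out.shape)                                         # (10, 32)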
@article{ title = {SCONE: Surface Coverage Optimization in Unknown Environments by Volumetric Integration}, type = {article}, year = {2022}, websites = {http://arxiv.org/abs/2208.10449}, id = {3b8f6a29-94eb-3448-9e97-8c5b3504728a}, created = {2023-06-22T10:06:22.405Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-06-22T10:06:29.191Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {cbcad584-0c50-48fe-a0d7-5b4c781cba83}, private_publication = {false}, abstract = {Next Best View computation (NBV) is a long-standing problem in robotics, and consists in identifying the next most informative sensor position(s) for reconstructing a 3D object or scene efficiently and accurately. Like most current methods, we consider NBV prediction from a depth sensor like Lidar systems. Learning-based methods relying on a volumetric representation of the scene are suitable for path planning, but have lower accuracy than methods using a surface-based representation. However, the latter do not scale well with the size of the scene and constrain the camera to a small number of poses. To obtain the advantages of both representations, we show that we can maximize surface metrics by Monte Carlo integration over a volumetric representation. In particular, we propose an approach, SCONE, that relies on two neural modules: The first module predicts occupancy probability in the entire volume of the scene. Given any new camera pose, the second module samples points in the scene based on their occupancy probability and leverages a self-attention mechanism to predict the visibility of the samples. Finally, we integrate the visibility to evaluate the gain in surface coverage for the new camera pose. NBV is selected as the pose that maximizes the gain in total surface coverage. Our method scales to large scenes and handles free camera motion: It takes as input an arbitrarily large point cloud gathered by a depth sensor as well as camera poses to predict NBV. We demonstrate our approach on a novel dataset made of large and complex 3D scenes.}, bibtype = {article}, author = {Guédon, Antoine and Monasse, Pascal and Lepetit, Vincent}, number = {NeurIPS} }
@inproceedings{ title = {A Survey on Convolutional Neural Network Accelerators: GPU, FPGA and ASIC}, type = {inproceedings}, year = {2022}, keywords = {ASIC,FPGA,GPU,convolutional neural network,deep learning accelerator}, pages = {100-107}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, id = {8a31163f-710d-3cb0-9be5-a9f4c4b9f9f6}, created = {2023-11-07T09:44:22.169Z}, file_attached = {true}, profile_id = {78e67dcc-28e6-3300-a4ed-85434b13f01f}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2024-02-06T09:03:05.021Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {1bffc8fa-4e6e-48c8-b694-323af7fbd0e8}, private_publication = {false}, abstract = {In recent years, artificial intelligence (AI) has been under rapid development, applied in various areas. Among a vast number of neural network (NN) models, the convolutional neural network (CNN) has a mainstream status in application such as image and sound recognition and machine decision. The convolution operation is the most complex and requires acceleration. A practical method is to optimize the architecture of the deep learning processor (DLP). The traditional CPU architecture lacks parallelism and memory bandwidth and is not suitable for CNN operations. Current researches are focused on graphic processing unit (GPU), field programmable gate array (FPGA) and application specific integrated circuit (ASIC). GPU is the maturest and the most widely applied, however it is not flexible and has high cost and energy consumption. Even though FPGA possesses high flexibility and low energy consumption, it is inferior in performance. ASIC, due to targeted design, is advanced in performance and energy consumption. However, it is highly inflexible. This article reviews the research outcomes of the three classic types of processors applied to CNN, and put forward the future research trend. In particular, this paper analyzes and compares the experimental performance of several processors of different types, and then summarizes the respective advantageous application fields. Hence, the novelty of this article is in the summary of practical DLPs, which is expected to provide helps for the AI researchers, and guide the selection of CNN-supporting hardware in industrial application.}, bibtype = {inproceedings}, author = {Hu, Yunxiang and Liu, Yuhao and Liu, Zhuoyuan}, doi = {10.1109/ICCRD54409.2022.9730377}, booktitle = {2022 IEEE 14th International Conference on Computer Research and Development, ICCRD 2022} }
@article{ title = {State of Art IoT and Edge Embedded Systems for Real-Time Machine Vision Applications}, type = {article}, year = {2022}, keywords = {ASIC,Edge machine vision systems,FPGA,GPU,IoT,Multicore CPU}, pages = {58287-58301}, volume = {10}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, id = {4b658f97-16cf-3e8f-a5ef-c5eaff9c42e0}, created = {2023-11-07T09:51:47.919Z}, file_attached = {true}, profile_id = {78e67dcc-28e6-3300-a4ed-85434b13f01f}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2024-01-09T14:27:10.836Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {1bffc8fa-4e6e-48c8-b694-323af7fbd0e8}, private_publication = {false}, abstract = {IoT and edge devices dedicated to run machine vision algorithms are usually few years lagging currently available state-of-the-art technologies for hardware accelerators. This is mainly due to the non-negligible time delay required to implement and assess related algorithms. Among possible hardware platforms which are potentially being explored to handle real-time machine vision tasks, multi-core CPU and Graphical Processing Unit (GPU) platforms remain the most widely used ones over Field Programmable Gate Array (FPGA) and Application Specific Integrated Circuit (ASIC)-based platforms. This is mainly due to the availability of powerful and user friendly software development tools, in addition to their lower cost, and obviously their high computation power with reasonable form factor and power consumption. Nevertheless, the trend now is towards a System-On-Chip (SOC) processors which combine ASIC/FPGA accelerators with GPU/multicore CPUs. This paper presents different state of the art IoT and edge machine vision technologies along with their performance and limitations. It can be a good reference for researchers involved in designing state of the art IoT embedded systems for machine vision applications.}, bibtype = {article}, author = {Meribout, Mahmoud and Baobaid, Asma and Khaoua, Mohammed Ould and Tiwari, Varun Kumar and Pena, Juan Pablo}, doi = {10.1109/ACCESS.2022.3175496}, journal = {IEEE Access} }
@article{ title = {Efficient Edge-AI Application Deployment for FPGAs}, type = {article}, year = {2022}, keywords = {CNN,DNN,FPGA,KV260,Kria,MPSoC,PYNQ,artificial intelligence,deep learning,edge-AI}, volume = {13}, month = {6}, publisher = {MDPI}, day = {1}, id = {db06f9ab-51f6-3403-b3e7-98fa30442f21}, created = {2023-11-16T11:41:18.291Z}, file_attached = {true}, profile_id = {78e67dcc-28e6-3300-a4ed-85434b13f01f}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-12-06T13:16:39.873Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, notes = {Presents multiple NN models run on Kria KV260 Xilinx FPGA<br/>15 citations - Greek University of Patras}, folder_uuids = {1bffc8fa-4e6e-48c8-b694-323af7fbd0e8}, private_publication = {false}, abstract = {Field Programmable Gate Array (FPGA) accelerators have been widely adopted for artificial intelligence (AI) applications on edge devices (Edge-AI) utilizing Deep Neural Networks (DNN) architectures. FPGAs have gained their reputation due to the greater energy efficiency and high parallelism than microcontrollers (MCU) and graphical processing units (GPU), while they are easier to develop and more reconfigurable than the Application Specific Integrated Circuit (ASIC). The development and building of AI applications on resource constraint devices such as FPGAs remains a challenge, however, due to the co-design approach, which requires a valuable expertise in low-level hardware design and in software development. This paper explores the efficacy and the dynamic deployment of hardware accelerated applications on the Kria KV260 development platform based on the Xilinx Kria K26 system-on-module (SoM), which includes a Zynq multiprocessor system-on-chip (MPSoC). The platform supports the Python-based PYNQ framework and maintains a high level of versatility with the support of custom bitstreams (overlays). The demonstration proved the reconfigurability and the overall ease of implementation with low-footprint machine learning (ML) algorithms.}, bibtype = {article}, author = {Kalapothas, Stavros and Flamis, Georgios and Kitsos, Paris}, doi = {10.3390/info13060279}, journal = {Information (Switzerland)}, number = {6} }
@inproceedings{ title = {Semantic Similarity Metrics for Evaluating Source Code Summarization}, type = {inproceedings}, year = {2022}, keywords = {automatic documentation generation,evaluation metrics,source code summarization}, pages = {36-47}, volume = {2022-March}, publisher = {IEEE Computer Society}, id = {5d6e4153-3de9-3a31-9789-4a23681d1701}, created = {2024-02-07T10:12:11.670Z}, file_attached = {true}, profile_id = {78e67dcc-28e6-3300-a4ed-85434b13f01f}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2024-02-07T10:56:03.321Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, notes = {CNN-FPGA tools:<br/><b>fpgaConvNet</b><br/><br/>Only tool with <b>open source</b> implementation:<br/>https://github.com/AlexMontgomerie/fpgaconvnet-model<br/>https://github.com/AlexMontgomerie/fpgaconvnet-tutorial<br/>https://github.com/Yu-Zhewen/fpgaconvnet-torch<br/><br/>Based on streaming arch, processing stages per layer.<br/>Maps Inception, residual and dense hardware blocks.<br/><br/>Synchronous Dataflow (SDF) model, where a static schedule is generated to drive the datapath.<br/><br/>fpgaConvNet utilises a custom global optimiser based on the Simulated Annealing algorithm.<br/><br/><br/><br/>}, folder_uuids = {1bffc8fa-4e6e-48c8-b694-323af7fbd0e8}, private_publication = {false}, abstract = {Source code summarization involves creating brief descriptions of source code in natural language. These descriptions are a key component of software documentation such as JavaDocs. Automatic code summarization is a prized target of software engineering research, due to the high value summaries have to programmers and the simultaneously high cost of writing and maintaining documentation by hand. Current work is almost all based on machine models trained via big data input. Large datasets of examples of code and summaries of that code are used to train an e.g. encoder-decoder neural model. Then the output predictions of the model are evaluated against a set of reference summaries. The input is code not seen by the model, and the prediction is compared to a reference. The means by which a prediction is compared to a reference is essentially word overlap, calculated via a metric such as BLEU or ROUGE. The problem with using word overlap is that not all words in a sentence have the same importance, and many words have synonyms. The result is that calculated similarity may not match the perceived similarity by human readers. In this paper, we conduct an experiment to measure the degree to which various word overlap metrics correlate to human-rated similarity of predicted and reference summaries. We evaluate alternatives based on current work in semantic similarity metrics and propose recommendations for evaluation of source code summarization.}, bibtype = {inproceedings}, author = {Haque, Sakib and Eberhart, Zachary and Bansal, Aakash and McMillan, Collin}, doi = {10.1145/nnnnnnn.nnnnnnn}, booktitle = {IEEE International Conference on Program Comprehension} }
@article{ title = {Real-time semantic segmentation on FPGAs for autonomous vehicles with hls4ml}, type = {article}, year = {2022}, keywords = {FPGA,autonomous vehicles,computer vision,deep learning,hls4ml,machine learning,semantic segmentation}, volume = {3}, month = {12}, publisher = {Institute of Physics}, day = {1}, id = {a5428705-4e91-358a-b683-ee5fb1ade2cc}, created = {2024-02-07T11:50:23.043Z}, file_attached = {true}, profile_id = {78e67dcc-28e6-3300-a4ed-85434b13f01f}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2024-02-07T11:56:06.183Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, notes = {Materials Methods<br/><br/><b>Xilinx ZCU102 </b>evaluation board<br/><b>Cityscapes</b> dataset for segmentation<br/>Efficient Neural Network (ENet) used<br/><br/><b>AutoQKeras</b> library<br/>QKeras-to-hls4ml interface for FPGA deployment<br/><br/><b>hls4ml</b> library has an implementation of convolutional layers}, folder_uuids = {1bffc8fa-4e6e-48c8-b694-323af7fbd0e8}, private_publication = {false}, abstract = {In this paper, we investigate how field programmable gate arrays can serve as hardware accelerators for real-time semantic segmentation tasks relevant for autonomous driving. Considering compressed versions of the ENet convolutional neural network architecture, we demonstrate a fully-on-chip deployment with a latency of 4.9 ms per image, using less than 30% of the available resources on a Xilinx ZCU102 evaluation board. The latency is reduced to 3 ms per image when increasing the batch size to ten, corresponding to the use case where the autonomous vehicle receives inputs from multiple cameras simultaneously. We show, through aggressive filter reduction and heterogeneous quantization-aware training, and an optimized implementation of convolutional layers, that the power consumption and resource utilization can be significantly reduced while maintaining accuracy on the Cityscapes dataset.}, bibtype = {article}, author = {Ghielmetti, Nicolò and Loncar, Vladimir and Pierini, Maurizio and Roed, Marcel and Summers, Sioni and Aarrestad, Thea and Petersson, Christoffer and Linander, Hampus and Ngadiuba, Jennifer and Lin, Kelvin and Harris, Philip}, doi = {10.1088/2632-2153/ac9cb5}, journal = {Machine Learning: Science and Technology}, number = {4} }
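The notes above summarize the toolchain used in this paper: a quantized (Q)Keras network converted with the hls4ml library and deployed on a Xilinx ZCU102. A rough sketch of that conversion path follows, assuming hls4ml's Keras converter API (hls4ml.utils.config_from_keras_model, hls4ml.converters.convert_from_keras_model); the toy model, output directory, and FPGA part string are illustrative stand-ins for the compressed ENet and board configuration described in the abstract, not the authors' setup.

import hls4ml
from tensorflow import keras

# Toy stand-in for the compressed ENet segmentation model used in the paper.
model = keras.Sequential([
    keras.Input(shape=(32, 32, 3)),
    keras.layers.Conv2D(8, 3, activation="relu"),
    keras.layers.Flatten(),
    keras.layers.Dense(10, activation="softmax"),
])

# Per-layer precision/reuse configuration, then conversion to an HLS project.
config = hls4ml.utils.config_from_keras_model(model, granularity="name")
hls_model = hls4ml.converters.convert_from_keras_model(
    model,
    hls_config=config,
    output_dir="toy_hls_prj",        # illustrative project directory
    part="xczu9eg-ffvb1156-2-e",     # ZCU102 MPSoC part string (assumed)
)
hls_model.compile()                  # C simulation; hls_model.build() would run HLS synthesis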
@article{ title = {Estimation of 2D Bounding Box Orientation with Convex-Hull Points - A Quantitative Evaluation on Accuracy and Efficiency}, type = {article}, year = {2021}, pages = {945-950}, id = {ae23b22a-c103-3616-800c-6dadaab5d98a}, created = {2021-01-28T12:21:22.357Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-29T13:17:54.524Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {471f331d-8560-4b9e-b910-e5f849b6fcfd}, private_publication = {false}, bibtype = {article}, author = {Liu, Yang and Liu, Bingbing and Zhang, Hongbo}, doi = {10.1109/iv47402.2020.9304788}, number = {Iv} }
@article{ title = {PV-RCNN++: Point-Voxel Feature Set Abstraction With Local Vector Representation for 3D Object Detection}, type = {article}, year = {2021}, pages = {1-17}, websites = {http://arxiv.org/abs/2102.00463}, id = {f8ad32e7-f771-39cc-a534-0e2ec29e08c4}, created = {2021-03-04T15:41:23.595Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-24T15:42:26.064Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Shi2021}, folder_uuids = {bc1835e2-32e3-4f2a-b03c-9540bbbd02e0}, private_publication = {false}, abstract = {3D object detection is receiving increasing attention from both industry and academia thanks to its wide applications in various fields. In this paper, we propose the Point-Voxel Region based Convolution Neural Networks (PV-RCNNs) for accurate 3D detection from point clouds. First, we propose a novel 3D object detector, PV-RCNN-v1, which employs the voxel-to-keypoint scene encoding and keypoint-to-grid RoI feature abstraction two novel steps. These two steps deeply incorporate both 3D voxel CNN and PointNet-based set abstraction for learning discriminative point-cloud features. Second, we propose a more advanced framework, PV-RCNN-v2, for more efficient and accurate 3D detection. It consists of two major improvements, where the first one is the sectorized proposal-centric strategy for efficiently producing more representative and uniformly distributed keypoints, and the second one is the VectorPool aggregation to replace set abstraction for better aggregating local point-cloud features with much less resource consumption. With these two major modifications, our PV-RCNN-v2 runs more than twice as fast as the v1 version while still achieving better performance on the large-scale Waymo Open Dataset with 150m * 150m detection range. Extensive experiments demonstrate that our proposed PV-RCNNs significantly outperform previous state-of-the-art 3D detection methods on both the Waymo Open Dataset and the highly-competitive KITTI benchmark.}, bibtype = {article}, author = {Shi, Shaoshuai and Jiang, Li and Deng, Jiajun and Wang, Zhe and Guo, Chaoxu and Shi, Jianping and Wang, Xiaogang and Li, Hongsheng} }
@article{ title = {FPS-Net: A Convolutional Fusion Network for Large-Scale LiDAR Point Cloud Segmentation}, type = {article}, year = {2021}, keywords = {autonomous driving,lidar,point cloud,scene,semantic segmentation,spherical projection}, websites = {http://arxiv.org/abs/2103.00738}, id = {18de3f0a-63c2-3512-a357-6acaf1f5ecb9}, created = {2021-03-04T15:41:23.623Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-03-06T08:50:44.516Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {bc1835e2-32e3-4f2a-b03c-9540bbbd02e0}, private_publication = {false}, abstract = {Scene understanding based on LiDAR point cloud is an essential task for autonomous cars to drive safely, which often employs spherical projection to map 3D point cloud into multi-channel 2D images for semantic segmentation. Most existing methods simply stack different point attributes/modalities (e.g. coordinates, intensity, depth, etc.) as image channels to increase information capacity, but ignore distinct characteristics of point attributes in different image channels. We design FPS-Net, a convolutional fusion network that exploits the uniqueness and discrepancy among the projected image channels for optimal point cloud segmentation. FPS-Net adopts an encoder-decoder structure. Instead of simply stacking multiple channel images as a single input, we group them into different modalities to first learn modality-specific features separately and then map the learned features into a common high-dimensional feature space for pixel-level fusion and learning. Specifically, we design a residual dense block with multiple receptive fields as a building block in the encoder which preserves detailed information in each modality and learns hierarchical modality-specific and fused features effectively. In the FPS-Net decoder, we use a recurrent convolution block likewise to hierarchically decode fused features into output space for pixel-level classification. Extensive experiments conducted on two widely adopted point cloud datasets show that FPS-Net achieves superior semantic segmentation as compared with state-of-the-art projection-based methods. In addition, the proposed modality fusion idea is compatible with typical projection-based methods and can be incorporated into them with consistent performance improvements.}, bibtype = {article}, author = {Xiao, Aoran and Yang, Xiaofei and Lu, Shijian and Guan, Dayan and Huang, Jiaxing} }
@article{ title = {Pruning and Quantization for Deep Neural Network Acceleration: A Survey}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2101.09671}, month = {1}, day = {24}, id = {72da8552-3a9f-3839-9b1d-4cdf5aff002f}, created = {2021-06-14T08:43:51.909Z}, accessed = {2021-06-14}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-06-14T08:44:38.180Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {c9e2a751-ce83-45dd-9c0e-bdac57df3cf4}, private_publication = {false}, abstract = {Deep neural networks have been applied in many applications exhibiting extraordinary abilities in the field of computer vision. However, complex network architectures challenge efficient real-time deployment and require significant computation resources and energy costs. These challenges can be overcome through optimizations such as network compression. Network compression can often be realized with little loss of accuracy. In some cases accuracy may even improve. This paper provides a survey on two types of network compression: pruning and quantization. Pruning can be categorized as static if it is performed offline or dynamic if it is performed at run-time. We compare pruning techniques and describe criteria used to remove redundant computations. We discuss trade-offs in element-wise, channel-wise, shape-wise, filter-wise, layer-wise and even network-wise pruning. Quantization reduces computations by reducing the precision of the datatype. Weights, biases, and activations may be quantized typically to 8-bit integers although lower bit width implementations are also discussed including binary neural networks. Both pruning and quantization can be used independently or combined. We compare current techniques, analyze their strengths and weaknesses, present compressed network accuracy results on a number of frameworks, and provide practical guidance for compressing networks.}, bibtype = {article}, author = {Liang, Tailin and Glossner, John and Wang, Lei and Shi, Shaobo} }
@article{ title = {Point Cloud Learning with Transformer}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2104.13636}, id = {efbfd798-a585-3431-aa1b-a8b5f4961381}, created = {2021-06-21T08:44:26.178Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T06:29:25.133Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {70eb910f-9399-46d8-a4d0-ade5435237b7,597192a3-7679-4832-a554-980990d8ac9b,d54ba66b-a8cf-41de-8e2d-c3256f322e07}, private_publication = {false}, abstract = {Remarkable performance from Transformer networks in Natural Language Processing promote the development of these models in dealing with computer vision tasks such as image recognition and segmentation. In this paper, we introduce a novel framework, called Multi-level Multi-scale Point Transformer (MLMSPT) that works directly on the irregular point clouds for representation learning. Specifically, a point pyramid transformer is investigated to model features with diverse resolutions or scales we defined, followed by a multi-level transformer module to aggregate contextual information from different levels of each scale and enhance their interactions. While a multi-scale transformer module is designed to capture the dependencies among representations across different scales. Extensive evaluation on public benchmark datasets demonstrate the effectiveness and the competitive performance of our methods on 3D shape classification, part segmentation and semantic segmentation tasks.}, bibtype = {article}, author = {Han, Xian-Feng and Kuang, Yu-Jia and Xiao, Guo-Qiang}, number = {3} }
@article{ title = {PCT: Point cloud transformer}, type = {article}, year = {2021}, keywords = {3D computer vision,Transformer,deep learning,point cloud processing}, pages = {187-199}, volume = {7}, id = {a5d79181-17da-3f52-bb47-220699339308}, created = {2021-06-21T08:44:26.372Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-15T11:24:42.580Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {70eb910f-9399-46d8-a4d0-ade5435237b7,597192a3-7679-4832-a554-980990d8ac9b,07e07de9-bcac-4934-a82b-d0aff540e56d,11276190-b8fe-4c3a-a42f-f604438ad4db,d54ba66b-a8cf-41de-8e2d-c3256f322e07}, private_publication = {false}, abstract = {The irregular domain and lack of ordering make it challenging to design deep neural networks for point cloud processing. This paper presents a novel framework named Point Cloud Transformer (PCT) for point cloud learning. PCT is based on Transformer, which achieves huge success in natural language processing and displays great potential in image processing. It is inherently permutation invariant for processing a sequence of points, making it well-suited for point cloud learning. To better capture local context within the point cloud, we enhance input embedding with the support of farthest point sampling and nearest neighbor search. Extensive experiments demonstrate that the PCT achieves the state-of-the-art performance on shape classification, part segmentation, semantic segmentation, and normal estimation tasks.}, bibtype = {article}, author = {Guo, Meng Hao and Cai, Jun Xiong and Liu, Zheng Ning and Mu, Tai Jiang and Martin, Ralph R. and Hu, Shi Min}, doi = {10.1007/s41095-021-0229-5}, journal = {Computational Visual Media}, number = {2} }
@article{ title = {Geometric Deep Learning: Grids, Groups, Graphs, Geodesics, and Gauges}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2104.13478}, id = {1e903e3b-d888-3d47-9a2a-2c6cfb04c4e9}, created = {2021-07-12T09:40:52.521Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-01T13:11:54.912Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {368c6572-df92-4840-8400-80e7c9ee2dd7,20ccb950-fef9-4ee1-800c-a60ba9f1df16}, private_publication = {false}, abstract = {The last decade has witnessed an experimental revolution in data science and machine learning, epitomised by deep learning methods. Indeed, many high-dimensional learning tasks previously thought to be beyond reach -- such as computer vision, playing Go, or protein folding -- are in fact feasible with appropriate computational scale. Remarkably, the essence of deep learning is built from two simple algorithmic principles: first, the notion of representation or feature learning, whereby adapted, often hierarchical, features capture the appropriate notion of regularity for each task, and second, learning by local gradient-descent type methods, typically implemented as backpropagation. While learning generic functions in high dimensions is a cursed estimation problem, most tasks of interest are not generic, and come with essential pre-defined regularities arising from the underlying low-dimensionality and structure of the physical world. This text is concerned with exposing these regularities through unified geometric principles that can be applied throughout a wide spectrum of applications. Such a 'geometric unification' endeavour, in the spirit of Felix Klein's Erlangen Program, serves a dual purpose: on one hand, it provides a common mathematical framework to study the most successful neural network architectures, such as CNNs, RNNs, GNNs, and Transformers. On the other hand, it gives a constructive procedure to incorporate prior physical knowledge into neural architectures and provide principled way to build future architectures yet to be invented.}, bibtype = {article}, author = {Bronstein, Michael M. and Bruna, Joan and Cohen, Taco and Veličković, Petar} }
@article{ title = {Random Features Strengthen Graph Neural Networks}, type = {article}, year = {2021}, pages = {333-341}, id = {80b59a54-c323-3d62-b01f-b43ca732e4ab}, created = {2021-07-12T10:19:36.603Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T10:19:56.925Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {20ccb950-fef9-4ee1-800c-a60ba9f1df16}, private_publication = {false}, abstract = {Graph neural networks (GNNs) are powerful machine learning models for various graph learning tasks. Recently, the limitations of the expressive power of various GNN models have been revealed. For example, GNNs cannot distinguish some non-isomorphic graphs and they cannot learn efficient graph algorithms. In this paper, we demonstrate that GNNs become powerful just by adding a random feature to each node. We prove that the random features enable GNNs to learn almost optimal polynomial-time approximation algorithms for the minimum dominating set problem and maximum matching problem in terms of approximation ratios. The main advantage of our method is that it can be combined with off-the-shelf GNN models with slight modifications. Through experiments, we show that the addition of random features enables GNNs to solve various problems that normal GNNs, including the graph convolutional networks (GCNs) and graph isomorphism networks (GINs), cannot solve.}, bibtype = {article}, author = {Sato, Ryoma and Yamada, Makoto and Kashima, Hisashi}, doi = {10.1137/1.9781611976700.38}, journal = {Proceedings of the 2021 SIAM International Conference on Data Mining (SDM)} }
@article{ title = {Spectral Normalisation for Deep Reinforcement Learning: an Optimisation Perspective}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2105.05246}, id = {01e75554-b7ad-3ce9-bd78-1b4aad8f581e}, created = {2021-07-12T14:15:35.205Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:17:09.133Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {Most of the recent deep reinforcement learning advances take an RL-centric perspective and focus on refinements of the training objective. We diverge from this view and show we can recover the performance of these developments not by changing the objective, but by regularising the value-function estimator. Constraining the Lipschitz constant of a single layer using spectral normalisation is sufficient to elevate the performance of a Categorical-DQN agent to that of a more elaborated \rainbow agent on the challenging Atari domain. We conduct ablation studies to disentangle the various effects normalisation has on the learning dynamics and show that is sufficient to modulate the parameter updates to recover most of the performance of spectral normalisation. These findings hint towards the need to also focus on the neural component and its learning dynamics to tackle the peculiarities of Deep Reinforcement Learning.}, bibtype = {article}, author = {Gogianu, Florin and Berariu, Tudor and Rosca, Mihaela and Clopath, Claudia and Busoniu, Lucian and Pascanu, Razvan} }
@article{ title = {Drawing Multiple Augmentation Samples Per Image During Training Efficiently Decreases Test Error}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2105.13343}, id = {f7215b77-2673-3979-9ac2-c114104553b5}, created = {2021-07-12T14:15:35.999Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:17:00.515Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {In computer vision, it is standard practice to draw a single sample from the data augmentation procedure for each unique image in the mini-batch, however it is not clear whether this choice is optimal for generalization. In this work, we provide a detailed empirical evaluation of how the number of augmentation samples per unique image influences performance on held out data. Remarkably, we find that drawing multiple samples per image consistently enhances the test accuracy achieved for both small and large batch training, despite reducing the number of unique training examples in each mini-batch. This benefit arises even when different augmentation multiplicities perform the same number of parameter updates and gradient evaluations. Our results suggest that, although the variance in the gradient estimate arising from subsampling the dataset has an implicit regularization benefit, the variance which arises from the data augmentation process harms test accuracy. By applying augmentation multiplicity to the recently proposed NFNet model family, we achieve a new ImageNet state of the art of 86.8$\%$ top-1 w/o extra data.}, bibtype = {article}, author = {Fort, Stanislav and Brock, Andrew and Pascanu, Razvan and De, Soham and Smith, Samuel L.} }
@article{ title = {Transformers for Computer Vision}, type = {article}, year = {2021}, id = {ecca39fc-9a23-3448-8923-8baec26a709e}, created = {2021-07-19T10:48:23.785Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-19T10:48:52.671Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {40817151-8323-4487-8c1e-fe067729f714}, private_publication = {false}, bibtype = {article}, author = {Dosovitskiy, Alexey} }
@article{ title = {Tutorial on Variational Autoencoders: Why are VAEs interesting?}, type = {article}, year = {2021}, id = {0fdbf08f-0e70-32a9-a4df-427e0bc1e959}, created = {2021-07-19T10:48:23.904Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-19T10:49:22.591Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {40817151-8323-4487-8c1e-fe067729f714}, private_publication = {false}, bibtype = {article}, author = {Nagy, David and Szepesvari, David} }
@article{ title = {Analysis of voxel-based 3D object detection methods efficiency for real-time embedded systems}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2105.10316}, id = {936cdc1f-7a9b-3c6d-b6d1-bffa2e3897fe}, created = {2021-07-20T12:45:29.362Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-08T10:20:27.897Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Oleksiienko2021}, folder_uuids = {cf9189f6-f354-4337-8aaf-a5f12cbf8660,5cd4d7ce-c2fb-4e91-ab80-35deeb123df5}, private_publication = {false}, abstract = {Real-time detection of objects in the 3D scene is one of the tasks an autonomous agent needs to perform for understanding its surroundings. While recent Deep Learning-based solutions achieve satisfactory performance, their high computational cost renders their application in real-life settings in which computations need to be performed on embedded platforms intractable. In this paper, we analyze the efficiency of two popular voxel-based 3D object detection methods providing a good compromise between high performance and speed based on two aspects, their ability to detect objects located at large distances from the agent and their ability to operate in real time on embedded platforms equipped with high-performance GPUs. Our experiments show that these methods mostly fail to detect distant small objects due to the sparsity of the input point clouds at large distances. Moreover, models trained on near objects achieve similar or better performance compared to those trained on all objects in the scene. This means that the models learn object appearance representations mostly from near objects. Our findings suggest that a considerable part of the computations of existing methods is focused on locations of the scene that do not contribute with successful detection. This means that the methods can achieve a speed-up of $40$-$60\%$ by restricting operation to near objects while not sacrificing much in performance.}, bibtype = {article}, author = {Oleksiienko, Illia and Iosifidis, Alexandros} }
@article{ title = {Simple Spectral Graph Convolution}, type = {article}, year = {2021}, pages = {1-15}, id = {77317523-de9e-3f5e-8a7f-d79368b3ba82}, created = {2021-08-04T09:51:19.985Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-04T13:05:08.471Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Zhu, Hao and Koniusz, Piotr} }
@article{ title = {A Comprehensive Survey on Graph Neural Networks}, type = {article}, year = {2021}, keywords = {Deep learning,graph autoencoder (GAE),graph convolutional networks (GCNs),graph neural networks (GNNs),graph representation learning,network embedding}, pages = {4-24}, volume = {32}, publisher = {IEEE}, id = {9bdff10a-f897-3446-8652-8a28b7f6b8c5}, created = {2021-08-17T08:06:02.968Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-30T15:38:16.106Z}, read = {true}, starred = {true}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Wu2021}, folder_uuids = {dbd9a6d6-88f6-4a62-9acd-402fb473145a}, private_publication = {false}, abstract = {Deep learning has revolutionized many machine learning tasks in recent years, ranging from image classification and video processing to speech recognition and natural language understanding. The data in these tasks are typically represented in the Euclidean space. However, there is an increasing number of applications, where data are generated from non-Euclidean domains and are represented as graphs with complex relationships and interdependency between objects. The complexity of graph data has imposed significant challenges on the existing machine learning algorithms. Recently, many studies on extending deep learning approaches for graph data have emerged. In this article, we provide a comprehensive overview of graph neural networks (GNNs) in data mining and machine learning fields. We propose a new taxonomy to divide the state-of-The-Art GNNs into four categories, namely, recurrent GNNs, convolutional GNNs, graph autoencoders, and spatial-Temporal GNNs. We further discuss the applications of GNNs across various domains and summarize the open-source codes, benchmark data sets, and model evaluation of GNNs. Finally, we propose potential research directions in this rapidly growing field.}, bibtype = {article}, author = {Wu, Zonghan and Pan, Shirui and Chen, Fengwen and Long, Guodong and Zhang, Chengqi and Yu, Philip S.}, doi = {10.1109/TNNLS.2020.2978386}, journal = {IEEE Transactions on Neural Networks and Learning Systems}, number = {1} }
@article{ title = {Dynamic Convolution for 3D Point Cloud Instance Segmentation}, type = {article}, year = {2021}, keywords = {Index Terms-Point cloud,dynamic convolution,instance segmentation}, websites = {https://arxiv.org/abs/2107.08392v1}, month = {7}, day = {18}, id = {cf16b23f-c3ca-3dcc-9cb9-abc6640af6b7}, created = {2021-08-24T10:26:44.691Z}, accessed = {2021-08-24}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-24T10:26:50.322Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {70eb910f-9399-46d8-a4d0-ade5435237b7}, private_publication = {false}, abstract = {We propose an approach to instance segmentation from 3D point clouds based on dynamic convolution. This enables it to adapt, at inference, to varying feature and object scales. Doing so avoids some pitfalls of bottom up approaches, including a dependence on hyper-parameter tuning and heuristic post-processing pipelines to compensate for the inevitable variability in object sizes, even within a single scene. The representation capability of the network is greatly improved by gathering homogeneous points that have identical semantic categories and close votes for the geometric centroids. Instances are then decoded via several simple convolution layers, where the parameters are generated conditioned on the input. The proposed approach is proposal-free, and instead exploits a convolution process that adapts to the spatial and semantic characteristics of each instance. A light-weight transformer, built on the bottleneck layer, allows the model to capture long-range dependencies, with limited computational overhead. The result is a simple, efficient, and robust approach that yields strong performance on various datasets: ScanNetV2, S3DIS, and PartNet. The consistent improvements on both voxel- and point-based architectures imply the effectiveness of the proposed method. Code is available at: https://git.io/DyCo3D}, bibtype = {article}, author = {He, Tong and Shen, Chunhua and Hengel, Anton van den} }
@article{ title = {Tutorial on Variational Autoencoders}, type = {article}, year = {2021}, keywords = {neural networks,prediction,structured,unsupervised learning,variational autoencoders}, pages = {1-23}, id = {dfe81fad-91fb-374c-80c1-e7f650b81f9a}, created = {2021-08-30T18:48:39.015Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.729Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Doersch2021}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, bibtype = {article}, author = {Doersch, Carl} }
@article{ title = {AutoFormer: Searching Transformers for Visual Recognition}, type = {article}, year = {2021}, websites = {https://arxiv.org/abs/2107.00651v1}, month = {7}, day = {1}, id = {7028c756-1ec0-3190-a4a1-ac290f93c62a}, created = {2021-08-31T10:52:44.069Z}, accessed = {2021-08-31}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-02T07:26:36.191Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {Recently, pure transformer-based models have shown great potentials for vision tasks such as image classification and detection. However, the design of transformer networks is challenging. It has been observed that the depth, embedding dimension, and number of heads can largely affect the performance of vision transformers. Previous models configure these dimensions based upon manual crafting. In this work, we propose a new one-shot architecture search framework, namely AutoFormer, dedicated to vision transformer search. AutoFormer entangles the weights of different blocks in the same layers during supernet training. Benefiting from the strategy, the trained supernet allows thousands of subnets to be very well-trained. Specifically, the performance of these subnets with weights inherited from the supernet is comparable to those retrained from scratch. Besides, the searched models, which we refer to AutoFormers, surpass the recent state-of-the-arts such as ViT and DeiT. In particular, AutoFormer-tiny/small/base achieve 74.7%/81.7%/82.4% top-1 accuracy on ImageNet with 5.7M/22.9M/53.7M parameters, respectively. Lastly, we verify the transferability of AutoFormer by providing the performance on downstream benchmarks and distillation experiments. Code and models are available at https://github.com/microsoft/AutoML.}, bibtype = {article}, author = {Chen, Minghao and Peng, Houwen and Fu, Jianlong and Ling, Haibin} }
@article{ title = {Improving 3D Object Detection with Channel-wise Transformer}, type = {article}, year = {2021}, websites = {https://arxiv.org/abs/2108.10723v1}, month = {8}, day = {23}, id = {f3e7bbf6-a64e-3d7e-b0ce-45a42c7e4f16}, created = {2021-08-31T11:11:24.759Z}, accessed = {2021-08-31}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:29.050Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Though 3D object detection from point clouds has achieved rapid progress in recent years, the lack of flexible and high-performance proposal refinement remains a great hurdle for existing state-of-the-art two-stage detectors. Previous works on refining 3D proposals have relied on human-designed components such as keypoints sampling, set abstraction and multi-scale feature fusion to produce powerful 3D object representations. Such methods, however, have limited ability to capture rich contextual dependencies among points. In this paper, we leverage the high-quality region proposal network and a Channel-wise Transformer architecture to constitute our two-stage 3D object detection framework (CT3D) with minimal hand-crafted design. The proposed CT3D simultaneously performs proposal-aware embedding and channel-wise context aggregation for the point features within each proposal. Specifically, CT3D uses proposal's keypoints for spatial contextual modelling and learns attention propagation in the encoding module, mapping the proposal to point embeddings. Next, a new channel-wise decoding module enriches the query-key interaction via channel-wise re-weighting to effectively merge multi-level contexts, which contributes to more accurate object predictions. Extensive experiments demonstrate that our CT3D method has superior performance and excellent scalability. Remarkably, CT3D achieves the AP of 81.77% in the moderate car category on the KITTI test 3D detection benchmark, outperforms state-of-the-art 3D detectors.}, bibtype = {article}, author = {Sheng, Hualian and Cai, Sijia and Liu, Yuan and Deng, Bing and Huang, Jianqiang and Hua, Xian-Sheng and Zhao, Min-Jian} }
@article{ title = {Bottleneck Transformers for Visual Recognition}, type = {article}, year = {2021}, websites = {https://arxiv.org/abs/2101.11605v2}, month = {1}, day = {27}, id = {95eb8f27-2c19-34e3-8a5b-9462be413192}, created = {2021-09-01T07:34:53.797Z}, accessed = {2021-09-01}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-01T07:34:56.861Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {We present BoTNet, a conceptually simple yet powerful backbone architecture that incorporates self-attention for multiple computer vision tasks including image classification, object detection and instance segmentation. By just replacing the spatial convolutions with global self-attention in the final three bottleneck blocks of a ResNet and no other changes, our approach improves upon the baselines significantly on instance segmentation and object detection while also reducing the parameters, with minimal overhead in latency. Through the design of BoTNet, we also point out how ResNet bottleneck blocks with self-attention can be viewed as Transformer blocks. Without any bells and whistles, BoTNet achieves 44.4% Mask AP and 49.7% Box AP on the COCO Instance Segmentation benchmark using the Mask R-CNN framework; surpassing the previous best published single model and single scale results of ResNeSt evaluated on the COCO validation set. Finally, we present a simple adaptation of the BoTNet design for image classification, resulting in models that achieve a strong performance of 84.7% top-1 accuracy on the ImageNet benchmark while being up to 1.64x faster in compute time than the popular EfficientNet models on TPU-v3 hardware. We hope our simple and effective approach will serve as a strong baseline for future research in self-attention models for vision}, bibtype = {article}, author = {Srinivas, Aravind and Lin, Tsung-Yi and Parmar, Niki and Shlens, Jonathon and Abbeel, Pieter and Vaswani, Ashish} }
@article{ title = {An Attention Free Transformer}, type = {article}, year = {2021}, websites = {https://arxiv.org/abs/2105.14103v1}, month = {5}, day = {28}, id = {a915cdb5-6e66-39e4-9ad0-0daeb2ad58d7}, created = {2021-09-01T08:03:58.653Z}, accessed = {2021-09-01}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-03T07:05:08.596Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {We introduce Attention Free Transformer (AFT), an efficient variant of Transformers that eliminates the need for dot product self attention. In an AFT layer, the key and value are first combined with a set of learned position biases, the result of which is multiplied with the query in an element-wise fashion. This new operation has a memory complexity linear w.r.t. both the context size and the dimension of features, making it compatible to both large input and model sizes. We also introduce AFT-local and AFT-conv, two model variants that take advantage of the idea of locality and spatial weight sharing while maintaining global connectivity. We conduct extensive experiments on two autoregressive modeling tasks (CIFAR10 and Enwik8) as well as an image recognition task (ImageNet-1K classification). We show that AFT demonstrates competitive performance on all the benchmarks, while providing excellent efficiency at the same time.}, bibtype = {article}, author = {Zhai, Shuangfei and Talbott, Walter and Srivastava, Nitish and Huang, Chen and Goh, Hanlin and Zhang, Ruixiang and Susskind, Josh} }
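The attention-free operation described in the abstract above (keys combined with learned position biases, exponentiated and used to average the values, then gated element-wise by a sigmoid of the query) can be sketched in a few lines of PyTorch. This is a minimal illustration of the AFT-full update for one sequence, with assumed tensor names and shapes rather than the authors' reference implementation.

import torch

def aft_full(Q, K, V, w):
    # Q, K, V: (T, d) query/key/value tensors for a single sequence.
    # w: (T, T) learned pairwise position biases.
    # For each target position t, sigmoid(Q_t) gates a weighted average of the
    # values, with weights exp(K_t' + w[t, t']) -- no dot-product attention.
    weights = torch.exp(w.unsqueeze(-1) + K.unsqueeze(0))   # (T, T, d)
    num = (weights * V.unsqueeze(0)).sum(dim=1)             # (T, d)
    den = weights.sum(dim=1)                                # (T, d)
    return torch.sigmoid(Q) * num / den

T, d = 8, 16
Q, K, V = torch.randn(T, d), torch.randn(T, d), torch.randn(T, d)
w = torch.zeros(T, T)                 # learned biases, zero-initialized here
print(aft_full(Q, K, V, w).shape)     # torch.Size([8, 16])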
@article{ title = {Tokens-to-Token ViT: Training Vision Transformers from Scratch on ImageNet}, type = {article}, year = {2021}, websites = {https://arxiv.org/abs/2101.11986v1}, month = {1}, day = {28}, id = {b1b7328f-2ca1-3670-97ce-8142b0a5d0a8}, created = {2021-09-01T08:06:14.613Z}, accessed = {2021-09-01}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-01T08:06:23.156Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {Transformers, which are popular for language modeling, have been explored for solving vision tasks recently, e.g., the Vision Transformers (ViT) for image classification. The ViT model splits each image into a sequence of tokens with fixed length and then applies multiple Transformer layers to model their global relation for classification. However, ViT achieves inferior performance compared with CNNs when trained from scratch on a midsize dataset (e.g., ImageNet). We find it is because: 1) the simple tokenization of input images fails to model the important local structure (e.g., edges, lines) among neighboring pixels, leading to its low training sample efficiency; 2) the redundant attention backbone design of ViT leads to limited feature richness in fixed computation budgets and limited training samples. To overcome such limitations, we propose a new Tokens-To-Token Vision Transformers (T2T-ViT), which introduces 1) a layer-wise Tokens-to-Token (T2T) transformation to progressively structurize the image to tokens by recursively aggregating neighboring Tokens into one Token (Tokens-to-Token), such that local structure presented by surrounding tokens can be modeled and tokens length can be reduced; 2) an efficient backbone with a deep-narrow structure for vision transformers motivated by CNN architecture design after extensive study. Notably, T2T-ViT reduces the parameter counts and MACs of vanilla ViT by 200\%, while achieving more than 2.5\% improvement when trained from scratch on ImageNet. It also outperforms ResNets and achieves comparable performance with MobileNets when directly training on ImageNet. For example, T2T-ViT with ResNet50 comparable size can achieve 80.7\% top-1 accuracy on ImageNet. (Code: https://github.com/yitu-opensource/T2T-ViT)}, bibtype = {article}, author = {Yuan, Li and Chen, Yunpeng and Wang, Tao and Yu, Weihao and Shi, Yujun and Tay, Francis EH and Feng, Jiashi and Yan, Shuicheng} }
@article{ title = {Recent Advances in Variational Autoencoders with Representation Learning for Biomedical Informatics: A Survey}, type = {article}, year = {2021}, keywords = {Deep learning,biomedical informatics,data representation,generative models,latent space,representation learning,unsupervised learning,variational autoencoders (VAEs)}, pages = {4939-4956}, volume = {9}, id = {3d37dbea-cc6c-3b76-9071-c8fcf066f56c}, created = {2021-09-02T05:25:53.249Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.833Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Wei2021}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, abstract = {Variational autoencoders (VAEs) are deep latent space generative models that have been immensely successful in multiple exciting applications in biomedical informatics such as molecular design, protein design, medical image classification and segmentation, integrated multi-omics data analyses, and large-scale biological sequence analyses, among others. The fundamental idea in VAEs is to learn the distribution of data in such a way that new meaningful data with more intra-class variations can be generated from the encoded distribution. The ability of VAEs to synthesize new data with more representation variance at state-of-art levels provides hope that the chronic scarcity of labeled data in the biomedical field can be resolved. Furthermore, VAEs have made nonlinear latent variable models tractable for modeling complex distributions. This has allowed for efficient extraction of relevant biomedical information from learned features for biological data sets, referred to as unsupervised feature representation learning. In this article, we review the various recent advancements in the development and application of VAEs for biomedical informatics. We discuss challenges and future opportunities for biomedical research with respect to VAEs.}, bibtype = {article}, author = {Wei, Ruoqi and Mahmood, Ausif}, doi = {10.1109/ACCESS.2020.3048309}, journal = {IEEE Access} }
@article{ title = {A comprehensive study of autoencoders' applications related to images}, type = {article}, year = {2021}, keywords = {Autoencoders,CNN,DNN,Generative models,VAE}, pages = {43-54}, volume = {2845}, id = {e5e7890f-095c-3d0f-b21c-4f32a51e3474}, created = {2021-09-02T05:25:53.358Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.860Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kovenko2021}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,8ab28090-4ad8-4e96-920f-bd4344219976}, private_publication = {false}, abstract = {This article incorporates a comprehensive study of autoencoders' applications related to images. First of all, a vanilla autoencoder is described along with details of its architecture and training procedure. Secondly, main methods for regularization of it are exposed, such as dropout and additive gaussian noise. The applications of autoencoders such as image morphing, reconstruction and search are shown. Then, the VAE (variational autoencoder) is highlighted. Main applications of it such as outliers detection and image generation are described. Finally, it's shown that using warm-up for VAE with respect to KL loss gives much more plausible results in terms of image generation.}, bibtype = {article}, author = {Kovenko, Volodymyr and Bogach, Ilona}, journal = {CEUR Workshop Proceedings} }
@article{ title = {A Survey on Variational Autoencoders from a Green AI Perspective}, type = {article}, year = {2021}, keywords = {generative modeling,greenai,variational autoencoders}, volume = {2}, id = {4a000622-f6e4-3741-a3a0-14761a21e3e9}, created = {2021-09-02T05:25:53.376Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-17T07:10:38.620Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Asperti2021}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,8ab28090-4ad8-4e96-920f-bd4344219976}, private_publication = {false}, abstract = {Variational Autoencoders (VAEs) are powerful generative models that merge elements from statistics and information theory with the flexibility offered by deep neural networks to efficiently solve the generation problem for high-dimensional data. The key insight of VAEs is to learn the latent distribution of data in such a way that new meaningful samples can be generated from it. This approach led to tremendous research and variations in the architectural design of VAEs, nourishing the recent field of research known as unsupervised representation learning. In this article, we provide a comparative evaluation of some of the most successful, recent variations of VAEs. We particularly focus the analysis on the energetic efficiency of the different models, in the spirit of the so-called Green AI, aiming both to reduce the carbon footprint and the financial cost of generative techniques. For each architecture, we provide its mathematical formulation, the ideas underlying its design, a detailed model description, a running implementation and quantitative results.}, bibtype = {article}, author = {Asperti, Andrea and Evangelista, Davide and Loli Piccolomini, Elena}, doi = {10.1007/s42979-021-00702-9}, journal = {SN Computer Science}, number = {4} }
@article{ title = {Random Feature Attention}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2103.02143}, month = {3}, day = {3}, id = {982c487a-79c0-3f2e-8c9b-03e280cf46e6}, created = {2021-09-03T07:05:48.453Z}, accessed = {2021-09-03}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-03T07:05:51.494Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {Transformers are state-of-the-art models for a variety of sequence modeling tasks. At their core is an attention function which models pairwise interactions between the inputs at every timestep. While attention is powerful, it does not scale efficiently to long sequences due to its quadratic time and space complexity in the sequence length. We propose RFA, a linear time and space attention that uses random feature methods to approximate the softmax function, and explore its application in transformers. RFA can be used as a drop-in replacement for conventional softmax attention and offers a straightforward way of learning with recency bias through an optional gating mechanism. Experiments on language modeling and machine translation demonstrate that RFA achieves similar or better performance compared to strong transformer baselines. In the machine translation experiment, RFA decodes twice as fast as a vanilla transformer. Compared to existing efficient transformer variants, RFA is competitive in terms of both accuracy and efficiency on three long text classification datasets. Our analysis shows that RFA's efficiency gains are especially notable on long sequences, suggesting that RFA will be particularly useful in tasks that require working with large inputs, fast decoding speed, or low memory footprints.}, bibtype = {article}, author = {Peng, Hao and Pappas, Nikolaos and Yogatama, Dani and Schwartz, Roy and Smith, Noah A. and Kong, Lingpeng} }
@article{ title = {LambdaNetworks: Modeling Long-Range Interactions Without Attention}, type = {article}, year = {2021}, websites = {https://arxiv.org/abs/2102.08602v1}, month = {2}, day = {17}, id = {19fb7926-8c5e-3c03-8917-a2ed605c820a}, created = {2021-09-03T07:06:05.730Z}, accessed = {2021-09-03}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-03T07:06:08.788Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {We present lambda layers -- an alternative framework to self-attention -- for capturing long-range interactions between an input and structured contextual information (e.g. a pixel surrounded by other pixels). Lambda layers capture such interactions by transforming available contexts into linear functions, termed lambdas, and applying these linear functions to each input separately. Similar to linear attention, lambda layers bypass expensive attention maps, but in contrast, they model both content and position-based interactions which enables their application to large structured inputs such as images. The resulting neural network architectures, LambdaNetworks, significantly outperform their convolutional and attentional counterparts on ImageNet classification, COCO object detection and COCO instance segmentation, while being more computationally efficient. Additionally, we design LambdaResNets, a family of hybrid architectures across different scales, that considerably improves the speed-accuracy tradeoff of image classification models. LambdaResNets reach excellent accuracies on ImageNet while being 3.2 - 4.4x faster than the popular EfficientNets on modern machine learning accelerators. When training with an additional 130M pseudo-labeled images, LambdaResNets achieve up to a 9.5x speed-up over the corresponding EfficientNet checkpoints.}, bibtype = {article}, author = {Bello, Irwan} }
@article{ title = {Cross-Modal Center Loss for 3D Cross-Modal Retrieval}, type = {article}, year = {2021}, pages = {3142-3151}, id = {4f31dd5a-26eb-32eb-aa9c-9696d494d166}, created = {2021-09-07T08:57:34.490Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-08T09:21:50.970Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a6fefa10-ad39-4ee5-850c-dcbd4fed6307}, private_publication = {false}, abstract = {Cross-modal retrieval aims to learn discriminative and modal-invariant features for data from different modalities. Unlike the existing methods which usually learn from the features extracted by offline networks, in this paper, we propose an approach to jointly train the components of cross-modal retrieval framework with metadata, and enable the network to find optimal features. The proposed end-to-end framework is updated with three loss functions: 1) a novel cross-modal center loss to eliminate cross-modal discrepancy, 2) cross-entropy loss to maximize inter-class variations, and 3) mean-square-error loss to reduce modality variations. In particular, our proposed cross-modal center loss minimizes the distances of features from objects belonging to the same class across all modalities. Extensive experiments have been conducted on the retrieval tasks across multi-modalities including 2D image, 3D point cloud and mesh data. The proposed framework significantly outperforms the state-of-the-art methods for both cross-modal and in-domain retrieval for 3D objects on the ModelNet10 and ModelNet40 datasets.}, bibtype = {article}, author = {Jing, Longlong and Vahdani, Elahe and Tan, Jiaxing and Tian, Yingli} }
@article{ title = {Toward Unsupervised 3D Point Cloud Anomaly Detection Using Variational Autoencoder}, type = {article}, year = {2021}, pages = {3118-3122}, publisher = {IEEE}, id = {b5a0b82b-b3d7-3fc6-80ff-dc98f72e0e00}, created = {2021-09-09T14:35:21.257Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.861Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Masuda2021}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, bibtype = {article}, author = {Masuda, Mana and Hachiuma, Ryo and Fujii, Ryo and Saito, Hideo and Sekikawa, Yusuke}, doi = {10.1109/icip42928.2021.9506795} }
@article{ title = {Advances in agriculture robotics: A state-of-the-art review and challenges ahead}, type = {article}, year = {2021}, keywords = {Agricultural robots,Agriculture 4.0,Precision agriculture}, pages = {1-31}, volume = {10}, id = {bc1196de-20a2-3dfb-8f7f-3e6d4c3e4636}, created = {2021-09-14T08:01:18.690Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-14T08:42:57.265Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Oliveira2021}, private_publication = {false}, abstract = {The constant advances in agricultural robotics aim to overcome the challenges imposed by population growth, accelerated urbanization, high competitiveness of high-quality products, environmental preservation and a lack of qualified labor. In this sense, this review paper surveys the main existing applications of agricultural robotic systems for the execution of land preparation before planting, sowing, planting, plant treatment, harvesting, yield estimation and phenotyping. In general, all robots were evaluated according to the following criteria: its locomotion system, what is the final application, if it has sensors, robotic arm and/or computer vision algorithm, what is its development stage and which country and continent they belong. After evaluating all similar characteristics, to expose the research trends, common pitfalls and the characteristics that hinder commercial development, and discover which countries are investing into Research and Development (R&D) in these technologies for the future, four major areas that need future research work for enhancing the state of the art in smart agriculture were highlighted: locomotion systems, sensors, computer vision algorithms and communication technologies. The results of this research suggest that the investment in agricultural robotic systems allows to achieve short—harvest monitoring—and long-term objectives—yield estimation.}, bibtype = {article}, author = {Oliveira, Luiz F.P. and Moreira, António P. and Silva, Manuel F.}, doi = {10.3390/robotics10020052}, journal = {Robotics}, number = {2} }
@article{ title = {Transformers in Vision: A Survey}, type = {article}, year = {2021}, pages = {1-28}, websites = {http://arxiv.org/abs/2101.01169}, id = {8ea81367-abc1-3f3f-804e-3676eeacc7a5}, created = {2021-09-24T06:12:48.936Z}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-24T06:12:55.387Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d54ba66b-a8cf-41de-8e2d-c3256f322e07}, private_publication = {false}, abstract = {Astounding results from Transformer models on natural language tasks have intrigued the vision community to study their application to computer vision problems. Among their salient benefits, Transformers enable modeling long dependencies between input sequence elements and support parallel processing of sequence as compared to recurrent networks e.g., Long short-term memory (LSTM). Different from convolutional networks, Transformers require minimal inductive biases for their design and are naturally suited as set-functions. Furthermore, the straightforward design of Transformers allows processing multiple modalities (e.g., images, videos, text and speech) using similar processing blocks and demonstrates excellent scalability to very large capacity networks and huge datasets. These strengths have led to exciting progress on a number of vision tasks using Transformer networks. This survey aims to provide a comprehensive overview of the Transformer models in the computer vision discipline. We start with an introduction to fundamental concepts behind the success of Transformers i.e., self-attention, large-scale pre-training, and bidirectional encoding. We then cover extensive applications of transformers in vision including popular recognition tasks (e.g., image classification, object detection, action recognition, and segmentation), generative modeling, multi-modal tasks (e.g., visual-question answering, visual reasoning, and visual grounding), video processing (e.g., activity recognition, video forecasting), low-level vision (e.g., image super-resolution, image enhancement, and colorization) and 3D analysis (e.g., point cloud classification and segmentation). We compare the respective advantages and limitations of popular techniques both in terms of architectural design and their experimental value. Finally, we provide an analysis on open research directions and possible future works.}, bibtype = {article}, author = {Khan, Salman and Naseer, Muzammal and Hayat, Munawar and Zamir, Syed Waqas and Khan, Fahad Shahbaz and Shah, Mubarak} }
@article{ title = {Attention Models for Point Clouds in Deep Learning: A Survey}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2102.10788}, id = {74d676bb-c068-3a48-8a5a-925bc078831d}, created = {2021-09-27T07:36:18.508Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-17T16:08:03.344Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {c3a38ded-ec49-4494-8518-35cbd444f0c8,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Recently, the advancement of 3D point clouds in deep learning has attracted intensive research in different application domains such as computer vision and robotics. However, creating robust and discriminative feature representations from unordered and irregular point clouds is challenging. In this paper, our ultimate goal is to provide a comprehensive overview of point cloud feature representations that use attention models. More than 75 key contributions from the recent three years are summarized in this survey, covering 3D object detection, 3D semantic segmentation, 3D pose estimation, point cloud completion, etc. We provide a detailed characterization of (1) the role of attention mechanisms, (2) the usability of attention models in different tasks, and (3) the development trend of key technology.}, bibtype = {article}, author = {Wang, Xu and Jin, Yi and Cen, Yigang and Wang, Tao and Li, Yidong} }
@article{ title = {GCN-Denoiser: Mesh Denoising with Graph Convolutional Networks}, type = {article}, year = {2021}, keywords = {Mesh denoising, graph convolutional networks, casc}, volume = {40}, websites = {http://arxiv.org/abs/2108.05128}, publisher = {Association for Computing Machinery}, id = {23402ab5-4c00-3c47-a4ab-233db1b098da}, created = {2021-09-27T07:36:18.511Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-17T16:08:03.611Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {c3a38ded-ec49-4494-8518-35cbd444f0c8,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {In this paper, we present GCN-Denoiser, a novel feature-preserving mesh denoising method based on graph convolutional networks (GCNs). Unlike previous learning-based mesh denoising methods that exploit hand-crafted or voxel-based representations for feature learning, our method explores the structure of a triangular mesh itself and introduces a graph representation followed by graph convolution operations in the dual space of triangles. We show such a graph representation naturally captures the geometry features while being lightweight for both training and inference. To facilitate effective feature learning, our network exploits both static and dynamic edge convolutions, which allow us to learn information from both the explicit mesh structure and potential implicit relations among unconnected neighbors. To better approximate an unknown noise function, we introduce a cascaded optimization paradigm to progressively regress the noise-free facet normals with multiple GCNs. GCN-Denoiser achieves the new state-of-the-art results in multiple noise datasets, including CAD models often containing sharp features and raw scan models with real noise captured from different devices. We also create a new dataset called PrintData containing 20 real scans with their corresponding ground-truth meshes for the research community. Our code and data are available in https://github.com/Jhonve/GCN-Denoiser.}, bibtype = {article}, author = {Shen, Yuefan and Fu, Hongbo and Du, Zhongshuo and Chen, Xiang and Burnaev, Evgeny and Zorin, Denis and Zhou, Kun and Zheng, Youyi}, doi = {10.1145/3480168}, journal = {ACM Transactions on Graphics}, number = {4} }
@article{ title = {GAPointNet: Graph attention based point neural network for exploiting local feature of point cloud}, type = {article}, year = {2021}, keywords = {Attention pooling,Graph attention,Multiple heads mechanism,Point cloud,Semantic segmentation,Shape classification}, pages = {122-132}, volume = {438}, id = {7059fc2a-f6cf-37e5-8757-9b9cc7694e23}, created = {2021-09-27T07:36:18.512Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-30T06:29:44.030Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {c3a38ded-ec49-4494-8518-35cbd444f0c8}, private_publication = {false}, abstract = {Exploiting fine-grained semantic features on point cloud data is still challenging because of its irregular and sparse structure in a non-Euclidean space. In order to represent the local feature for each central point that is helpful towards better contextual learning, a max pooling operation is often used to highlight the most important feature in the local region. However, all other geometric local correlations between each central point and corresponding neighbourhood are ignored during the max pooling operation. To this end, the attention mechanism is promising in capturing node representation on graph-based data by attending over all the neighbouring nodes. In this paper, we propose a novel neural network for point cloud analysis, GAPointNet, which is able to learn local geometric representations by embedding graph attention mechanism within stacked Multi-Layer-Perceptron (MLP) layers. Specifically, we highlight different attention weights on the neighbourhood of each center point to efficiently exploit local features. We also combine attention features with local signature features generated by our attention pooling to fully extract local geometric structures and enhance the network robustness. The proposed GAPointNet architecture is tested on various benchmark datasets (i.e. ModelNet40, ShapeNet part, S3DIS, KITTI) and achieves state-of-the-art performance in both the shape classification and segmentation tasks.}, bibtype = {article}, author = {Chen, Can and Fragonara, Luca Zanotti and Tsourdos, Antonios}, doi = {10.1016/j.neucom.2021.01.095}, journal = {Neurocomputing} }
@article{ title = {Graph Attention Networks for Point Cloud Processing}, type = {article}, year = {2021}, id = {340c2f37-2d07-3d71-9794-ce2988a56200}, created = {2021-09-27T07:36:18.554Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-17T16:08:04.598Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {c3a38ded-ec49-4494-8518-35cbd444f0c8,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Thakur, Sumesh} }
@article{ title = {View-Aware Geometry-Structure Joint Learning for Single-View 3D Shape Reconstruction}, type = {article}, year = {2021}, keywords = {Geometry,Image reconstruction,Multimodal Learning,Periodic structures,Representation Learning,Shape,Single-View 3D Reconstruction,Solid modeling,Structure-Aware Reconstruction,Three-dimensional displays,Topology}, pages = {1-16}, volume = {8828}, publisher = {IEEE}, id = {e02172c6-db18-3f85-9b16-5f49378e0656}, created = {2021-09-29T10:16:08.708Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.052Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Zhang2021}, folder_uuids = {a6db5ca6-7f95-48a4-bc40-9e41eea78434,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {Reconstructing a 3D shape from a single-view image using deep learning is becoming increasingly popular recently. Most existing methods only focus on reconstructing the 3D shape geometry based on the image constraint. The lack of explicit modeling of structure relations among shape parts yields low-quality reconstruction results for structure-rich man-made shapes. In addition, conventional 2D-3D joint embedding architecture for image-based 3D shape reconstruction often omits the specific view information from the given image, which may lead to degraded geometry and structure reconstruction. We address these problems by introducing VGSNet, an encoder-decoder architecture for view-aware joint geometry and structure learning. The key idea is to jointly learn a multimodal feature representation of 2D image, 3D shape geometry and structure so that both geometry and structure details can be reconstructed from a single-view image. Therefore, we explicitly represent 3D shape structures as part relations and employ image supervision to guide the geometry and structure reconstruction. Trained with pairs of view-aligned images and 3D shapes, the VGSNet implicitly encodes the view-aware shape information in the latent feature space. Qualitative and quantitative comparisons with state-of-the-art baseline methods as well as ablation studies demonstrate the effectiveness of the VGSNet for structure-aware single-view 3D shape reconstruction.}, bibtype = {article}, author = {Zhang, Xuancheng and Ma, Rui and Zou, Changqing and Zhang, Minghao and Zhao, Xibin and Gao, Yue}, doi = {10.1109/TPAMI.2021.3090917}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, number = {c} }
@article{ title = {Graph Spectral Point Cloud Processing}, type = {article}, year = {2021}, id = {b89b300c-f749-319a-b946-02db72e1a9f1}, created = {2021-09-30T06:29:37.911Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-17T16:08:04.883Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {c3a38ded-ec49-4494-8518-35cbd444f0c8,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {{Mitsubishi Electric}} }
@article{ title = {Walk in the Cloud: Learning Curves for Point Clouds Shape Analysis}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2105.01288}, id = {3ef18a7b-1c3a-36d7-8b2f-bfec79f9a1e1}, created = {2021-10-13T14:40:10.857Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:28:05.142Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352,be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Discrete point cloud objects lack sufficient shape descriptors of 3D geometries. In this paper, we present a novel method for aggregating hypothetical curves in point clouds. Sequences of connected points (curves) are initially grouped by taking guided walks in the point clouds, and then subsequently aggregated back to augment their point-wise features. We provide an effective implementation of the proposed aggregation strategy including a novel curve grouping operator followed by a curve aggregation operator. Our method was benchmarked on several point cloud analysis tasks where we achieved the state-of-the-art classification accuracy of 94.2% on the ModelNet40 classification task, instance IoU of 86.8 on the ShapeNetPart segmentation task, and cosine error of 0.11 on the ModelNet40 normal estimation task.}, bibtype = {article}, author = {Xiang, Tiange and Zhang, Chaoyi and Song, Yang and Yu, Jianhui and Cai, Weidong} }
@article{ title = {Residual Attention: A Simple but Effective Method for Multi-Label Recognition}, type = {article}, year = {2021}, pages = {184-193}, websites = {http://arxiv.org/abs/2108.02456}, id = {5dbaf040-be45-3356-9305-5ff5fe77f9fd}, created = {2021-10-13T14:40:10.899Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:31.471Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Multi-label image recognition is a challenging computer vision task of practical use. Progresses in this area, however, are often characterized by complicated methods, heavy computations, and lack of intuitive explanations. To effectively capture different spatial regions occupied by objects from different categories, we propose an embarrassingly simple module, named class-specific residual attention (CSRA). CSRA generates class-specific features for every category by proposing a simple spatial attention score, and then combines it with the class-agnostic average pooling feature. CSRA achieves state-of-the-art results on multilabel recognition, and at the same time is much simpler than them. Furthermore, with only 4 lines of code, CSRA also leads to consistent improvement across many diverse pretrained models and datasets without any extra training. CSRA is both easy to implement and light in computations, which also enjoys intuitive explanations and visualizations.}, bibtype = {article}, author = {Zhu, Ke and Wu, Jianxin} }
@article{ title = {CvT: Introducing Convolutions to Vision Transformers}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2103.15808}, id = {1b0123e2-a5a9-3a9b-ad10-99f2ad23582b}, created = {2021-10-13T14:40:10.925Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:44:02.890Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0}, private_publication = {false}, abstract = {We present in this paper a new architecture, named Convolutional vision Transformer (CvT), that improves Vision Transformer (ViT) in performance and efficiency by introducing convolutions into ViT to yield the best of both designs. This is accomplished through two primary modifications: a hierarchy of Transformers containing a new convolutional token embedding, and a convolutional Transformer block leveraging a convolutional projection. These changes introduce desirable properties of convolutional neural networks (CNNs) to the ViT architecture (\ie shift, scale, and distortion invariance) while maintaining the merits of Transformers (\ie dynamic attention, global context, and better generalization). We validate CvT by conducting extensive experiments, showing that this approach achieves state-of-the-art performance over other Vision Transformers and ResNets on ImageNet-1k, with fewer parameters and lower FLOPs. In addition, performance gains are maintained when pretrained on larger datasets (\eg ImageNet-22k) and fine-tuned to downstream tasks. Pre-trained on ImageNet-22k, our CvT-W24 obtains a top-1 accuracy of 87.7\% on the ImageNet-1k val set. Finally, our results show that the positional encoding, a crucial component in existing Vision Transformers, can be safely removed in our model, simplifying the design for higher resolution vision tasks. Code will be released at \urlhttps://github.com/leoxiaobin/CvT.}, bibtype = {article}, author = {Wu, Haiping and Xiao, Bin and Codella, Noel and Liu, Mengchen and Dai, Xiyang and Yuan, Lu and Zhang, Lei} }
@article{ title = {Unsupervised Learning of Fine Structure Generation for 3D Point Clouds by 2D Projection Matching}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2108.03746}, id = {d89c3431-6e58-3d4f-bd59-3679b6e37486}, created = {2021-10-13T14:40:10.931Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:28:07.228Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352,be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Learning to generate 3D point clouds without 3D supervision is an important but challenging problem. Current solutions leverage various differentiable renderers to project the generated 3D point clouds onto a 2D image plane, and train deep neural networks using the per-pixel difference with 2D ground truth images. However, these solutions are still struggling to fully recover fine structures of 3D shapes, such as thin tubes or planes. To resolve this issue, we propose an unsupervised approach for 3D point cloud generation with fine structures. Specifically, we cast 3D point cloud learning as a 2D projection matching problem. Rather than using entire 2D silhouette images as a regular pixel supervision, we introduce structure adaptive sampling to randomly sample 2D points within the silhouettes as an irregular point supervision, which alleviates the consistency issue of sampling from different view angles. Our method pushes the neural network to generate a 3D point cloud whose 2D projections match the irregular point supervision from different view angles. Our 2D projection matching approach enables the neural network to learn more accurate structure information than using the per-pixel difference, especially for fine and thin 3D structures. Our method can recover fine 3D structures from 2D silhouette images at different resolutions, and is robust to different sampling methods and point number in irregular point supervision. Our method outperforms others under widely used benchmarks. Our code, data and models are available at https://github.com/chenchao15/2D\_projection\_matching.}, bibtype = {article}, author = {Chao, Chen and Han, Zhizhong and Liu, Yu-Shen and Zwicker, Matthias}, number = {62072268} }
@article{ title = {Going deeper with Image Transformers}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2103.17239}, id = {cba87732-dece-395f-9797-82b349c9d5c7}, created = {2021-10-13T14:40:10.932Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:31.805Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {865cda55-bfc4-4a99-88d6-7092e1cbba3b,be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Transformers have been recently adapted for large scale image classification, achieving high scores shaking up the long supremacy of convolutional neural networks. However the optimization of image transformers has been little studied so far. In this work, we build and optimize deeper transformer networks for image classification. In particular, we investigate the interplay of architecture and optimization of such dedicated transformers. We make two transformers architecture changes that significantly improve the accuracy of deep transformers. This leads us to produce models whose performance does not saturate early with more depth, for instance we obtain 86.5% top-1 accuracy on Imagenet when training with no external data, we thus attain the current SOTA with less FLOPs and parameters. Moreover, our best model establishes the new state of the art on Imagenet with Reassessed labels and Imagenet-V2 / match frequency, in the setting with no additional training data. We share our code and models.}, bibtype = {article}, author = {Touvron, Hugo and Cord, Matthieu and Sablayrolles, Alexandre and Synnaeve, Gabriel and Jégou, Hervé} }
@article{ title = {Spatial-Temporal Transformer for Dynamic Scene Graph Generation}, type = {article}, year = {2021}, pages = {16372-16382}, websites = {http://arxiv.org/abs/2107.12309}, id = {0d9d3c66-da3e-341c-9b83-386d384ea416}, created = {2021-10-13T14:40:11.023Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:18.086Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Dynamic scene graph generation aims at generating a scene graph of the given video. Compared to the task of scene graph generation from images, it is more challenging because of the dynamic relationships between objects and the temporal dependencies between frames allowing for a richer semantic interpretation. In this paper, we propose Spatial-temporal Transformer (STTran), a neural network that consists of two core modules: (1) a spatial encoder that takes an input frame to extract spatial context and reason about the visual relationships within a frame, and (2) a temporal decoder which takes the output of the spatial encoder as input in order to capture the temporal dependencies between frames and infer the dynamic relationships. Furthermore, STTran is flexible to take varying lengths of videos as input without clipping, which is especially important for long videos. Our method is validated on the benchmark dataset Action Genome (AG). The experimental results demonstrate the superior performance of our method in terms of dynamic scene graphs. Moreover, a set of ablative studies is conducted and the effect of each proposed module is justified. Code available at: https://github.com/yrcong/STTran.}, bibtype = {article}, author = {Cong, Yuren and Liao, Wentong and Ackermann, Hanno and Rosenhahn, Bodo and Yang, Michael Ying} }
@article{ title = {PointBA: Towards Backdoor Attacks in 3D Point Cloud}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2103.16074}, id = {e3a16f07-52d0-3aa8-990c-277a1a2a4232}, created = {2021-10-13T14:40:11.033Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:17.933Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {3D deep learning has been increasingly more popular for a variety of tasks including many safety-critical applications. However, recently several works raise the security issues of 3D deep models. Although most of them consider adversarial attacks, we identify that backdoor attack is indeed a more serious threat to 3D deep learning systems but remains unexplored. We present the backdoor attacks in 3D point cloud with a unified framework that exploits the unique properties of 3D data and networks. In particular, we design two attack approaches on point cloud: the poison-label backdoor attack (PointPBA) and the clean-label backdoor attack (PointCBA). The first one is straightforward and effective in practice, while the latter is more sophisticated assuming there are certain data inspections. The attack algorithms are mainly motivated and developed by 1) the recent discovery of 3D adversarial samples suggesting the vulnerability of deep models under spatial transformation; 2) the proposed feature disentanglement technique that manipulates the feature of the data through optimization methods and its potential to embed a new task. Extensive experiments show the efficacy of the PointPBA with over 95% success rate across various 3D datasets and models, and the more stealthy PointCBA with around 50% success rate. Our proposed backdoor attack in 3D point cloud is expected to perform as a baseline for improving the robustness of 3D deep models.}, bibtype = {article}, author = {Li, Xinke and Chen, Zhirui and Zhao, Yue and Tong, Zekun and Zhao, Yabang and Lim, Andrew and Zhou, Joey Tianyi} }
@article{ title = {Graph-to-3D: End-to-End Generation and Manipulation of 3D Scenes Using Scene Graphs}, type = {article}, year = {2021}, pages = {16352-16361}, websites = {http://arxiv.org/abs/2108.08841}, id = {ff9ef537-b273-35d3-9fd4-e04296b06190}, created = {2021-10-13T14:40:11.044Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:18.398Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Controllable scene synthesis consists of generating 3D information that satisfy underlying specifications. Thereby, these specifications should be abstract, i.e. allowing easy user interaction, whilst providing enough interface for detailed control. Scene graphs are representations of a scene, composed of objects (nodes) and inter-object relationships (edges), proven to be particularly suited for this task, as they allow for semantic control on the generated content. Previous works tackling this task often rely on synthetic data, and retrieve object meshes, which naturally limits the generation capabilities. To circumvent this issue, we instead propose the first work that directly generates shapes from a scene graph in an end-to-end manner. In addition, we show that the same model supports scene modification, using the respective scene graph as interface. Leveraging Graph Convolutional Networks (GCN) we train a variational Auto-Encoder on top of the object and edge categories, as well as 3D shapes and scene layouts, allowing latter sampling of new scenes and shapes.}, bibtype = {article}, author = {Dhamo, Helisa and Manhardt, Fabian and Navab, Nassir and Tombari, Federico} }
@article{ title = {Segmentation-grounded Scene Graph Generation}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2104.14207}, id = {e7d07e54-b11c-3cbc-877e-ef9cca391182}, created = {2021-10-13T14:40:11.062Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:18.735Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Scene graph generation has emerged as an important problem in computer vision. While scene graphs provide a grounded representation of objects, their locations and relations in an image, they do so only at the granularity of proposal bounding boxes. In this work, we propose the first, to our knowledge, framework for pixel-level segmentation-grounded scene graph generation. Our framework is agnostic to the underlying scene graph generation method and address the lack of segmentation annotations in target scene graph datasets (e.g., Visual Genome) through transfer and multi-task learning from, and with, an auxiliary dataset (e.g., MS COCO). Specifically, each target object being detected is endowed with a segmentation mask, which is expressed as a lingual-similarity weighted linear combination over categories that have annotations present in an auxiliary dataset. These inferred masks, along with a novel Gaussian attention mechanism which grounds the relations at a pixel-level within the image, allow for improved relation prediction. The entire framework is end-to-end trainable and is learned in a multi-task manner with both target and auxiliary datasets.}, bibtype = {article}, author = {Khandelwal, Siddhesh and Suhail, Mohammed and Sigal, Leonid} }
@article{ title = {Unconditional Scene Graph Generation}, type = {article}, year = {2021}, pages = {16362-16371}, websites = {http://arxiv.org/abs/2108.05884}, id = {cffaf4de-fb45-3d89-b446-6f8e73114eca}, created = {2021-10-13T14:40:11.062Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:18.241Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Despite recent advancements in single-domain or single-object image generation, it is still challenging to generate complex scenes containing diverse, multiple objects and their interactions. Scene graphs, composed of nodes as objects and directed-edges as relationships among objects, offer an alternative representation of a scene that is more semantically grounded than images. We hypothesize that a generative model for scene graphs might be able to learn the underlying semantic structure of real-world scenes more effectively than images, and hence, generate realistic novel scenes in the form of scene graphs. In this work, we explore a new task for the unconditional generation of semantic scene graphs. We develop a deep auto-regressive model called SceneGraphGen which can directly learn the probability distribution over labelled and directed graphs using a hierarchical recurrent architecture. The model takes a seed object as input and generates a scene graph in a sequence of steps, each step generating an object node, followed by a sequence of relationship edges connecting to the previous nodes. We show that the scene graphs generated by SceneGraphGen are diverse and follow the semantic patterns of real-world scenes. Additionally, we demonstrate the application of the generated graphs in image synthesis, anomaly detection and scene graph completion.}, bibtype = {article}, author = {Garg, Sarthak and Dhamo, Helisa and Farshad, Azade and Musatian, Sabrina and Navab, Nassir and Tombari, Federico} }
@article{ title = {GANcraft: Unsupervised 3D Neural Rendering of Minecraft Worlds}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2104.07659}, id = {3efaa1d8-6031-38b9-a32a-4cc55e8fdfb4}, created = {2021-10-13T14:40:11.172Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:18.886Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {We present GANcraft, an unsupervised neural rendering framework for generating photorealistic images of large 3D block worlds such as those created in Minecraft. Our method takes a semantic block world as input, where each block is assigned a semantic label such as dirt, grass, or water. We represent the world as a continuous volumetric function and train our model to render view-consistent photorealistic images for a user-controlled camera. In the absence of paired ground truth real images for the block world, we devise a training technique based on pseudo-ground truth and adversarial training. This stands in contrast to prior work on neural rendering for view synthesis, which requires ground truth images to estimate scene geometry and view-dependent appearance. In addition to camera trajectory, GANcraft allows user control over both scene semantics and output style. Experimental results with comparison to strong baselines show the effectiveness of GANcraft on this novel task of photorealistic 3D block world synthesis. The project website is available at https://nvlabs.github.io/GANcraft/ .}, bibtype = {article}, author = {Hao, Zekun and Mallya, Arun and Belongie, Serge and Liu, Ming-Yu} }
@article{ title = {Planar Surface Reconstruction from Sparse Views}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2103.14644}, id = {2dd57164-5f56-39eb-bb4b-c0f5f1606df2}, created = {2021-10-13T14:40:11.177Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:19.435Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {The paper studies planar surface reconstruction of indoor scenes from two views with unknown camera poses. While prior approaches have successfully created object-centric reconstructions of many scenes, they fail to exploit other structures, such as planes, which are typically the dominant components of indoor scenes. In this paper, we reconstruct planar surfaces from multiple views, while jointly estimating camera pose. Our experiments demonstrate that our method is able to advance the state of the art of reconstruction from sparse views, on challenging scenes from Matterport3D. Project site: https://jinlinyi.github.io/SparsePlanes/}, bibtype = {article}, author = {Jin, Linyi and Qian, Shengyi and Owens, Andrew and Fouhey, David F.} }
@article{ title = {Sketch Your Own GAN}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2108.02774}, id = {1be78aff-1d64-398b-ab73-ee0656c5f4d6}, created = {2021-10-13T14:40:11.178Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:19.197Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Can a user create a deep generative model by sketching a single example? Traditionally, creating a GAN model has required the collection of a large-scale dataset of exemplars and specialized knowledge in deep learning. In contrast, sketching is possibly the most universally accessible way to convey a visual concept. In this work, we present a method, GAN Sketching, for rewriting GANs with one or more sketches, to make GANs training easier for novice users. In particular, we change the weights of an original GAN model according to user sketches. We encourage the model's output to match the user sketches through a cross-domain adversarial loss. Furthermore, we explore different regularization methods to preserve the original model's diversity and image quality. Experiments have shown that our method can mold GANs to match shapes and poses specified by sketches while maintaining realism and diversity. Finally, we demonstrate a few applications of the resulting GAN, including latent space interpolation and image editing.}, bibtype = {article}, author = {Wang, Sheng-Yu and Bau, David and Zhu, Jun-Yan} }
@article{ title = {Adaptive Surface Normal Constraint for Depth Estimation}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2103.15483}, id = {e2ab6aa5-ef77-3a1b-8348-6c27a3fead03}, created = {2021-10-13T14:40:11.201Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:19.832Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {We present a novel method for single image depth estimation using surface normal constraints. Existing depth estimation methods either suffer from the lack of geometric constraints, or are limited to the difficulty of reliably capturing geometric context, which leads to a bottleneck of depth estimation quality. We therefore introduce a simple yet effective method, named Adaptive Surface Normal (ASN) constraint, to effectively correlate the depth estimation with geometric consistency. Our key idea is to adaptively determine the reliable local geometry from a set of randomly sampled candidates to derive surface normal constraint, for which we measure the consistency of the geometric contextual features. As a result, our method can faithfully reconstruct the 3D geometry and is robust to local shape variations, such as boundaries, sharp corners and noises. We conduct extensive evaluations and comparisons using public datasets. The experimental results demonstrate our method outperforms the state-of-the-art methods and has superior efficiency and robustness.}, bibtype = {article}, author = {Long, Xiaoxiao and Lin, Cheng and Liu, Lingjie and Li, Wei and Theobalt, Christian and Yang, Ruigang and Wang, Wenping} }
@article{ title = {Mesh Graphormer}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2104.00272}, id = {3d903cc1-e9e1-35de-b6f6-e39d37d92fad}, created = {2021-10-13T14:40:11.246Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:19.679Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {We present a graph-convolution-reinforced transformer, named Mesh Graphormer, for 3D human pose and mesh reconstruction from a single image. Recently both transformers and graph convolutional neural networks (GCNNs) have shown promising progress in human mesh reconstruction. Transformer-based approaches are effective in modeling non-local interactions among 3D mesh vertices and body joints, whereas GCNNs are good at exploiting neighborhood vertex interactions based on a pre-specified mesh topology. In this paper, we study how to combine graph convolutions and self-attentions in a transformer to model both local and global interactions. Experimental results show that our proposed method, Mesh Graphormer, significantly outperforms the previous state-of-the-art methods on multiple benchmarks, including Human3.6M, 3DPW, and FreiHAND datasets. Code and pre-trained models are available at https://github.com/microsoft/MeshGraphormer}, bibtype = {article}, author = {Lin, Kevin and Wang, Lijuan and Liu, Zicheng} }
@article{ title = {PoinTr: Diverse Point Cloud Completion with Geometry-Aware Transformers}, type = {article}, year = {2021}, pages = {12498-12507}, websites = {http://arxiv.org/abs/2108.08839}, id = {f19a6da0-7d56-38d8-afc2-c563317f608c}, created = {2021-10-13T14:40:11.330Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:19.980Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Point clouds captured in real-world applications are often incomplete due to the limited sensor resolution, single viewpoint, and occlusion. Therefore, recovering the complete point clouds from partial ones becomes an indispensable task in many practical applications. In this paper, we present a new method that reformulates point cloud completion as a set-to-set translation problem and design a new model, called PoinTr that adopts a transformer encoder-decoder architecture for point cloud completion. By representing the point cloud as a set of unordered groups of points with position embeddings, we convert the point cloud to a sequence of point proxies and employ the transformers for point cloud generation. To facilitate transformers to better leverage the inductive bias about 3D geometric structures of point clouds, we further devise a geometry-aware block that models the local geometric relationships explicitly. The migration of transformers enables our model to better learn structural knowledge and preserve detailed information for point cloud completion. Furthermore, we propose two more challenging benchmarks with more diverse incomplete point clouds that can better reflect the real-world scenarios to promote future research. Experimental results show that our method outperforms state-of-the-art methods by a large margin on both the new benchmarks and the existing ones. Code is available at https://github.com/yuxumin/PoinTr}, bibtype = {article}, author = {Yu, Xumin and Rao, Yongming and Wang, Ziyi and Liu, Zuyan and Lu, Jiwen and Zhou, Jie} }
@article{ title = {3DStyleNet: Creating 3D Shapes with Geometric and Texture Style Variations}, type = {article}, year = {2021}, pages = {12456-12465}, websites = {http://arxiv.org/abs/2108.12958}, id = {3d1cac07-297a-3da9-ad88-abb4ac1e3c4a}, created = {2021-10-13T14:40:11.337Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:20.152Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {We propose a method to create plausible geometric and texture style variations of 3D objects in the quest to democratize 3D content creation. Given a pair of textured source and target objects, our method predicts a part-aware affine transformation field that naturally warps the source shape to imitate the overall geometric style of the target. In addition, the texture style of the target is transferred to the warped source object with the help of a multi-view differentiable renderer. Our model, 3DStyleNet, is composed of two sub-networks trained in two stages. First, the geometric style network is trained on a large set of untextured 3D shapes. Second, we jointly optimize our geometric style network and a pre-trained image style transfer network with losses defined over both the geometry and the rendering of the result. Given a small set of high-quality textured objects, our method can create many novel stylized shapes, resulting in effortless 3D content creation and style-ware data augmentation. We showcase our approach qualitatively on 3D content stylization, and provide user studies to validate the quality of our results. In addition, our method can serve as a valuable tool to create 3D data augmentations for computer vision tasks. Extensive quantitative analysis shows that 3DStyleNet outperforms alternative data augmentation techniques for the downstream task of single-image 3D reconstruction.}, bibtype = {article}, author = {Yin, Kangxue and Gao, Jun and Shugrina, Maria and Khamis, Sameh and Fidler, Sanja} }
@article{ title = {3DIAS: 3D Shape Reconstruction with Implicit Algebraic Surfaces}, type = {article}, year = {2021}, pages = {12446-12455}, websites = {http://arxiv.org/abs/2108.08653}, id = {1aeb822c-b86e-33d3-a592-8d8b50af9a33}, created = {2021-10-13T14:40:11.343Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:20.301Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {3D Shape representation has substantial effects on 3D shape reconstruction. Primitive-based representations approximate a 3D shape mainly by a set of simple implicit primitives, but the low geometrical complexity of the primitives limits the shape resolution. Moreover, setting a sufficient number of primitives for an arbitrary shape is challenging. To overcome these issues, we propose a constrained implicit algebraic surface as the primitive with few learnable coefficients and higher geometrical complexities and a deep neural network to produce these primitives. Our experiments demonstrate the superiorities of our method in terms of representation power compared to the state-of-the-art methods in single RGB image 3D shape reconstruction. Furthermore, we show that our method can semantically learn segments of 3D shapes in an unsupervised manner. The code is publicly available from https://myavartanoo.github.io/3dias/ .}, bibtype = {article}, author = {Yavartanoo, Mohsen and Chung, JaeYoung and Neshatavar, Reyhaneh and Lee, Kyoung Mu} }
@article{ title = {Unsupervised Point Cloud Object Co-segmentation by Co-contrastive Learning and Mutual Attention Sampling}, type = {article}, year = {2021}, pages = {1-10}, id = {d61b01e7-789c-3b21-84fc-e0f31111b26d}, created = {2021-10-13T14:40:11.366Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:28:07.487Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352,be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Cvpr, Anonymous and Id, Paper} }
@article{ title = {Manifold Matching via Deep Metric Learning for Generative Modeling}, type = {article}, year = {2021}, pages = {6587-6597}, websites = {http://arxiv.org/abs/2106.10777}, id = {8f08bcac-d408-351b-bfff-82a27458514e}, created = {2021-10-13T14:40:11.402Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:23.464Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {We propose a manifold matching approach to generative models which includes a distribution generator (or data generator) and a metric generator. In our framework, we view the real data set as some manifold embedded in a high-dimensional Euclidean space. The distribution generator aims at generating samples that follow some distribution condensed around the real data manifold. It is achieved by matching two sets of points using their geometric shape descriptors, such as centroid and $p$-diameter, with learned distance metric; the metric generator utilizes both real data and generated samples to learn a distance metric which is close to some intrinsic geodesic distance on the real data manifold. The produced distance metric is further used for manifold matching. The two networks are learned simultaneously during the training process. We apply the approach on both unsupervised and supervised learning tasks: in unconditional image generation task, the proposed method obtains competitive results compared with existing generative models; in super-resolution task, we incorporate the framework in perception-based models and improve visual qualities by producing samples with more natural textures. Both theoretical analysis and real data experiments demonstrate the feasibility and effectiveness of the proposed framework.}, bibtype = {article}, author = {Dai, Mengyu and Hang, Haibin} }
@article{ title = {When do GANs replicate? On the choice of dataset size}, type = {article}, year = {2021}, pages = {6701-6710}, id = {c3650cca-3fd2-380f-b8a8-585f84e855ba}, created = {2021-10-13T14:40:11.465Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-19T07:34:03.926Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,a6a80a30-e9a2-486d-8032-eac3fd981996,be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Feng, Qianli and Guo, Chenqi and Benitez-Quiroz, Fabian} }
@article{ title = {Spatio-temporal Self-Supervised Representation Learning for 3D Point Clouds}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2109.00179}, id = {61246011-bcfb-3c36-8207-8aba7ed96b0e}, created = {2021-10-13T14:40:11.479Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:23.618Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {To date, various 3D scene understanding tasks still lack practical and generalizable pre-trained models, primarily due to the intricate nature of 3D scene understanding tasks and their immense variations introduced by camera views, lighting, occlusions, etc. In this paper, we tackle this challenge by introducing a spatio-temporal representation learning (STRL) framework, capable of learning from unlabeled 3D point clouds in a self-supervised fashion. Inspired by how infants learn from visual data in the wild, we explore the rich spatio-temporal cues derived from the 3D data. Specifically, STRL takes two temporally-correlated frames from a 3D point cloud sequence as the input, transforms it with the spatial data augmentation, and learns the invariant representation self-supervisedly. To corroborate the efficacy of STRL, we conduct extensive experiments on three types (synthetic, indoor, and outdoor) of datasets. Experimental results demonstrate that, compared with supervised learning methods, the learned self-supervised representation facilitates various models to attain comparable or even better performances while capable of generalizing pre-trained models to downstream tasks, including 3D shape classification, 3D object detection, and 3D semantic segmentation. Moreover, the spatio-temporal contextual cues embedded in 3D point clouds significantly improve the learned representations.}, bibtype = {article}, author = {Huang, Siyuan and Xie, Yichen and Zhu, Song-Chun and Zhu, Yixin} }
@article{ title = {Vis2Mesh: Efficient Mesh Reconstruction from Unstructured Point Clouds of Large Scenes with Learned Virtual View Visibility}, type = {article}, year = {2021}, pages = {6514-6524}, websites = {http://arxiv.org/abs/2108.08378}, id = {f6da9524-1a84-3683-a414-cd8db7be64ef}, created = {2021-10-13T14:40:11.499Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:53:54.845Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352,be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {We present a novel framework for mesh reconstruction from unstructured point clouds by taking advantage of the learned visibility of the 3D points in the virtual views and traditional graph-cut based mesh generation. Specifically, we first propose a three-step network that explicitly employs depth completion for visibility prediction. Then the visibility information of multiple views is aggregated to generate a 3D mesh model by solving an optimization problem considering visibility in which a novel adaptive visibility weighting in surface determination is also introduced to suppress line of sight with a large incident angle. Compared to other learning-based approaches, our pipeline only exercises the learning on a 2D binary classification task, i.e., points visible or not in a view, which is much more generalizable, practically more efficient, and capable of dealing with a large number of points. Experiments demonstrate that our method has favorable transferability and robustness, achieves competitive performance w.r.t. state-of-the-art learning-based approaches on small complex objects, and outperforms them on large indoor and outdoor scenes. Code is available at https://github.com/GDAOSU/vis2mesh.}, bibtype = {article}, author = {Song, Shuang and Cui, Zhaopeng and Qin, Rongjun} }
@article{ title = {Learning Signed Distance Field for Multi-view Surface Reconstruction}, type = {article}, year = {2021}, pages = {6525-6534}, websites = {http://arxiv.org/abs/2108.09964}, id = {ac722251-2f3d-3c01-b4be-5c16ae6a67fe}, created = {2021-10-13T14:40:11.504Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:23.759Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Recent works on implicit neural representations have shown promising results for multi-view surface reconstruction. However, most approaches are limited to relatively simple geometries and usually require clean object masks for reconstructing complex and concave objects. In this work, we introduce a novel neural surface reconstruction framework that leverages the knowledge of stereo matching and feature consistency to optimize the implicit surface representation. More specifically, we apply a signed distance field (SDF) and a surface light field to represent the scene geometry and appearance respectively. The SDF is directly supervised by geometry from stereo matching, and is refined by optimizing the multi-view feature consistency and the fidelity of rendered images. Our method is able to improve the robustness of geometry estimation and support reconstruction of complex scene topologies. Extensive experiments have been conducted on DTU, EPFL and Tanks and Temples datasets. Compared to previous state-of-the-art methods, our method achieves better mesh reconstruction in wide open scenes without masks as input.}, bibtype = {article}, author = {Zhang, Jingyang and Yao, Yao and Quan, Long} }
@article{ title = {Multi-view 3D Reconstruction with Transformer}, type = {article}, year = {2021}, pages = {1-14}, websites = {http://arxiv.org/abs/2103.12957}, id = {6b0820aa-5571-3399-9548-13843ce867f0}, created = {2021-10-13T14:40:11.593Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.165Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Wang2021}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b,cd02f564-0123-4236-a320-b339927f085a}, private_publication = {false}, abstract = {Deep CNN-based methods have so far achieved the state of the art results in multi-view 3D object reconstruction. Despite the considerable progress, the two core modules of these methods - multi-view feature extraction and fusion, are usually investigated separately, and the object relations in different views are rarely explored. In this paper, inspired by the recent great success in self-attention-based Transformer models, we reformulate the multi-view 3D reconstruction as a sequence-to-sequence prediction problem and propose a new framework named 3D Volume Transformer (VolT) for such a task. Unlike previous CNN-based methods using a separate design, we unify the feature extraction and view fusion in a single Transformer network. A natural advantage of our design lies in the exploration of view-to-view relationships using self-attention among multiple unordered inputs. On ShapeNet - a large-scale 3D reconstruction benchmark dataset, our method achieves a new state-of-the-art accuracy in multi-view reconstruction with fewer parameters (70% less) than other CNN-based methods. Experimental results also suggest the strong scaling capability of our method. Our code will be made publicly available.}, bibtype = {article}, author = {Wang, Dan and Cui, Xinrui and Chen, Xun and Zou, Zhengxia and Shi, Tianyang and Salcudean, Septimiu and Wang, Z. Jane and Ward, Rabab} }
@article{ title = {Adaptive Graph Convolution for Point Cloud Analysis}, type = {article}, year = {2021}, pages = {4965-4974}, websites = {http://arxiv.org/abs/2108.08035}, id = {9888bd0b-81e0-3872-9e2c-e7245e07e20f}, created = {2021-10-13T14:40:11.612Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:28:06.311Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352,be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Convolution on 3D point clouds that generalized from 2D grid-like domains is widely researched yet far from perfect. The standard convolution characterises feature correspondences indistinguishably among 3D points, presenting an intrinsic limitation of poor distinctive feature learning. In this paper, we propose Adaptive Graph Convolution (AdaptConv) which generates adaptive kernels for points according to their dynamically learned features. Compared with using a fixed/isotropic kernel, AdaptConv improves the flexibility of point cloud convolutions, effectively and precisely capturing the diverse relations between points from different semantic parts. Unlike popular attentional weight schemes, the proposed AdaptConv implements the adaptiveness inside the convolution operation instead of simply assigning different weights to the neighboring points. Extensive qualitative and quantitative evaluations show that our method outperforms state-of-the-art point cloud classification and segmentation approaches on several benchmark datasets. Our code is available at https://github.com/hrzhou2/AdaptConv-master.}, bibtype = {article}, author = {Zhou, Haoran and Feng, Yidan and Fang, Mingsheng and Wei, Mingqiang and Qin, Jing and Lu, Tong} }
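Note: as an aside on the idea sketched in the AdaptConv abstract above, the following Python snippet contrasts a fixed-kernel k-NN graph convolution with a per-edge kernel generated from feature differences. The layer shapes, the tanh kernel generator, and the weight names (W_gen, W_out) are illustrative assumptions, not the paper's architecture.
# Minimal NumPy sketch of the *idea* behind adaptive graph convolution on a k-NN graph:
# a per-edge kernel is generated from the neighbor's feature difference instead of
# applying one fixed kernel to every edge. Shapes and names are assumptions.
import numpy as np

def knn_indices(x, k):
    """Indices of the k nearest neighbors of each point in x (N, 3)."""
    d2 = np.sum((x[:, None, :] - x[None, :, :]) ** 2, axis=-1)
    return np.argsort(d2, axis=1)[:, 1:k + 1]                 # drop self at index 0

def adaptive_graph_conv(x, feats, W_gen, W_out, k=16):
    """feats: (N, C_in) input features; returns (N, C_out) aggregated features."""
    idx = knn_indices(x, k)                                    # (N, k)
    neigh = feats[idx]                                         # (N, k, C_in) neighbor features
    diff = neigh - feats[:, None, :]                           # edge-wise feature differences
    kernels = np.tanh(diff @ W_gen)                            # (N, k, C_in) adaptive per-edge kernel
    messages = (kernels * neigh) @ W_out                       # modulate neighbors, then project
    return messages.max(axis=1)                                # max-pool over each neighborhood

rng = np.random.default_rng(0)
pts = rng.random((256, 3))
feats = rng.random((256, 32))
W_gen = rng.normal(scale=0.1, size=(32, 32))
W_out = rng.normal(scale=0.1, size=(32, 64))
print(adaptive_graph_conv(pts, feats, W_gen, W_out).shape)     # (256, 64)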
@article{ title = {Deep Structured Instance Graph for Distilling Object Detectors}, type = {article}, year = {2021}, pages = {4359-4368}, websites = {http://arxiv.org/abs/2109.12862}, id = {cc3215d0-97c8-3b74-a43c-a9626dd0556d}, created = {2021-10-13T14:40:11.622Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:24.738Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Effectively structuring deep knowledge plays a pivotal role in transfer from teacher to student, especially in semantic vision tasks. In this paper, we present a simple knowledge structure to exploit and encode information inside the detection system to facilitate detector knowledge distillation. Specifically, aiming at solving the feature imbalance problem while further excavating the missing relation inside semantic instances, we design a graph whose nodes correspond to instance proposal-level features and edges represent the relation between nodes. To further refine this graph, we design an adaptive background loss weight to reduce node noise and background samples mining to prune trivial edges. We transfer the entire graph as encoded knowledge representation from teacher to student, capturing local and global information simultaneously. We achieve new state-of-the-art results on the challenging COCO object detection task with diverse student-teacher pairs on both one- and two-stage detectors. We also experiment with instance segmentation to demonstrate robustness of our method. It is notable that distilled Faster R-CNN with ResNet18-FPN and ResNet50-FPN yields 38.68 and 41.82 Box AP respectively on the COCO benchmark, Faster R-CNN with ResNet101-FPN significantly achieves 43.38 AP, which outperforms ResNet152-FPN teacher about 0.7 AP. Code: https://github.com/dvlab-research/Dsig.}, bibtype = {article}, author = {Chen, Yixin and Chen, Pengguang and Liu, Shu and Wang, Liwei and Jia, Jiaya} }
@article{ title = {Dynamic Attentive Graph Learning for Image Restoration}, type = {article}, year = {2021}, pages = {4328-4337}, websites = {http://arxiv.org/abs/2109.06620}, id = {bd1667fa-4ecf-301a-8442-f19394f0d11f}, created = {2021-10-13T14:40:11.629Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:28:06.052Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352,be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Non-local self-similarity in natural images has been verified to be an effective prior for image restoration. However, most existing deep non-local methods assign a fixed number of neighbors for each query item, neglecting the dynamics of non-local correlations. Moreover, the non-local correlations are usually based on pixels, prone to be biased due to image degradation. To rectify these weaknesses, in this paper, we propose a dynamic attentive graph learning model (DAGL) to explore the dynamic non-local property on patch level for image restoration. Specifically, we propose an improved graph model to perform patch-wise graph convolution with a dynamic and adaptive number of neighbors for each node. In this way, image content can adaptively balance over-smooth and over-sharp artifacts through the number of its connected neighbors, and the patch-wise non-local correlations can enhance the message passing process. Experimental results on various image restoration tasks: synthetic image denoising, real image denoising, image demosaicing, and compression artifact reduction show that our DAGL can produce state-of-the-art results with superior accuracy and visual quality. The source code is available at https://github.com/jianzhangcs/DAGL.}, bibtype = {article}, author = {Mou, Chong and Zhang, Jian and Wu, Zhuoyuan} }
@article{ title = {RGB-D Saliency Detection via Cascaded Mutual Information Minimization}, type = {article}, year = {2021}, pages = {4338-4347}, websites = {http://arxiv.org/abs/2109.07246}, id = {3d2fc1af-57f4-35d7-92c8-f76854e5157e}, created = {2021-10-13T14:40:11.643Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:53:55.447Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Existing RGB-D saliency detection models do not explicitly encourage RGB and depth to achieve effective multi-modal learning. In this paper, we introduce a novel multi-stage cascaded learning framework via mutual information minimization to "explicitly" model the multi-modal information between RGB image and depth data. Specifically, we first map the feature of each mode to a lower dimensional feature vector, and adopt mutual information minimization as a regularizer to reduce the redundancy between appearance features from RGB and geometric features from depth. We then perform multi-stage cascaded learning to impose the mutual information minimization constraint at every stage of the network. Extensive experiments on benchmark RGB-D saliency datasets illustrate the effectiveness of our framework. Further, to prosper the development of this field, we contribute the largest (7x larger than NJU2K) dataset, which contains 15,625 image pairs with high quality polygon-/scribble-/object-/instance-/rank-level annotations. Based on these rich labels, we additionally construct four new benchmarks with strong baselines and observe some interesting phenomena, which can motivate future model design. Source code and dataset are available at "https://github.com/JingZhang617/cascaded_rgbd_sod".}, bibtype = {article}, author = {Zhang, Jing and Fan, Deng-Ping and Dai, Yuchao and Yu, Xin and Zhong, Yiran and Barnes, Nick and Shao, Ling} }
@article{ title = {Topologically Consistent Multi-View Face Inference Using Volumetric Sampling}, type = {article}, year = {2021}, pages = {3824-3834}, id = {301f8b0c-cb4f-3dfd-acaf-67a294aa8bc2}, created = {2021-10-13T14:40:11.731Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:25.216Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {High-fidelity face digitization solutions often combine multi-view stereo (MVS) techniques for 3D reconstruction and a non-rigid registration step to establish dense correspondence across identities and expressions. A common problem is the need for manual clean-up after the MVS step, as 3D scans are typically affected by noise and outliers and contain hairy surface regions that need to be cleaned up by artists. Furthermore, mesh registration tends to fail for extreme facial expressions. Most learning-based methods use an underlying 3D morphable model (3DMM) to ensure robustness, but this limits the output accuracy for extreme facial expressions. In addition, the global bottleneck of regression architectures cannot produce meshes that tightly fit the ground truth surfaces. We propose ToFu, Topologically consistent Face from multi-view, a geometry inference framework that can produce topologically consistent meshes across facial identities and expressions using a volumetric representation instead of an explicit underlying 3DMM. Our novel progressive mesh generation network embeds the topological structure of the face in a feature volume, sampled from geometry-aware local features. A coarse-to-fine architecture facilitates dense and accurate facial mesh predictions in a consistent mesh topology. ToFu further captures displacement maps for pore-level geometric details and facilitates high-quality rendering in the form of albedo and specular reflectance maps. These high-quality assets are readily usable by production studios for avatar creation, animation and physically-based skin rendering. We demonstrate state-of-the-art geometric and correspondence accuracy, while only taking 0.385 seconds to compute a mesh with 10K vertices, which is three orders of magnitude faster than traditional techniques. The code and the model are available for research purposes at https://tianyeli.github.io/tofu.}, bibtype = {article}, author = {Iccv, Anonymous and Id, Paper} }
@article{ title = {Point-set Distances for Learning Representations of 3D Point Clouds}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2102.04014}, id = {bbd9f1a0-06c0-3aa2-a353-493a45c07efb}, created = {2021-10-13T14:40:11.744Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:28:06.953Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352,be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Learning an effective representation of 3D point clouds requires a good metric to measure the discrepancy between two 3D point sets, which is non-trivial due to their irregularity. Most of the previous works resort to using the Chamfer discrepancy or Earth Mover's distance, but those metrics are either ineffective in measuring the differences between point clouds or computationally expensive. In this paper, we conduct a systematic study with extensive experiments on distance metrics for 3D point clouds. From this study, we propose to use a variant of the Wasserstein distance, named the sliced Wasserstein distance, for learning representations of 3D point clouds. Experiments show that the sliced Wasserstein distance allows the neural network to learn a more efficient representation compared to the Chamfer discrepancy. We demonstrate the efficiency of the sliced Wasserstein metric on several tasks in 3D computer vision including training a point cloud autoencoder, generative modeling, transfer learning, and point cloud registration.}, bibtype = {article}, author = {Nguyen, Trung and Pham, Quang-Hieu and Le, Tam and Pham, Tung and Ho, Nhat and Hua, Binh-Son}, number = {Section 4} }
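Note: for reference alongside the entry above, a minimal NumPy sketch (not the authors' code) of the two point-set metrics it compares: the Chamfer discrepancy and a sliced Wasserstein distance estimated with random 1D projections.
# Illustrative NumPy sketch: Chamfer discrepancy and a Monte-Carlo estimate of the
# sliced 2-Wasserstein distance between two 3D point sets.
import numpy as np

def chamfer(p, q):
    """Symmetric Chamfer discrepancy between point sets p (N, 3) and q (M, 3)."""
    d2 = np.sum((p[:, None, :] - q[None, :, :]) ** 2, axis=-1)   # (N, M) squared distances
    return d2.min(axis=1).mean() + d2.min(axis=0).mean()

def sliced_wasserstein(p, q, n_projections=128, seed=0):
    """Average 1D Wasserstein-2 distance over random unit directions.
    Assumes p and q contain the same number of points."""
    rng = np.random.default_rng(seed)
    dirs = rng.normal(size=(n_projections, p.shape[1]))
    dirs /= np.linalg.norm(dirs, axis=1, keepdims=True)          # unit projection directions
    proj_p = np.sort(p @ dirs.T, axis=0)                         # sorted 1D projections (order statistics)
    proj_q = np.sort(q @ dirs.T, axis=0)
    return np.sqrt(np.mean((proj_p - proj_q) ** 2))

p = np.random.rand(1024, 3)
q = np.random.rand(1024, 3)
print(chamfer(p, q), sliced_wasserstein(p, q))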
@article{ title = {Self-Supervised Pretraining of 3D Features on any Point-Cloud}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2101.02691}, id = {f0c3c8a4-22f8-3ae0-be06-ca2c470a5b0b}, created = {2021-10-13T14:40:11.759Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:20.924Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Pretraining on large labeled datasets is a prerequisite to achieve good performance in many computer vision tasks like 2D object recognition, video classification etc. However, pretraining is not widely used for 3D recognition tasks where state-of-the-art methods train models from scratch. A primary reason is the lack of large annotated datasets because 3D data is both difficult to acquire and time consuming to label. We present a simple self-supervised pretraining method that can work with any 3D data - single or multiview, indoor or outdoor, acquired by varied sensors, without 3D registration. We pretrain standard point cloud and voxel based model architectures, and show that joint pretraining further improves performance. We evaluate our models on 9 benchmarks for object detection, semantic segmentation, and object classification, where they achieve state-of-the-art results and can outperform supervised pretraining. We set a new state-of-the-art for object detection on ScanNet (69.0% mAP) and SUNRGBD (63.5% mAP). Our pretrained models are label efficient and improve performance for classes with few examples.}, bibtype = {article}, author = {Zhang, Zaiwei and Girdhar, Rohit and Joulin, Armand and Misra, Ishan} }
@article{ title = {Active Learning for Deep Object Detection via Probabilistic Modeling}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2103.16130}, id = {f675e0ad-60ea-3e42-840e-1d41266ca4f6}, created = {2021-10-13T14:40:11.762Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:20.766Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Active learning aims to reduce labeling costs by selecting only the most informative samples on a dataset. Few existing works have addressed active learning for object detection. Most of these methods are based on multiple models or are straightforward extensions of classification methods, hence estimate an image's informativeness using only the classification head. In this paper, we propose a novel deep active learning approach for object detection. Our approach relies on mixture density networks that estimate a probabilistic distribution for each localization and classification head's output. We explicitly estimate the aleatoric and epistemic uncertainty in a single forward pass of a single model. Our method uses a scoring function that aggregates these two types of uncertainties for both heads to obtain every image's informativeness score. We demonstrate the efficacy of our approach in PASCAL VOC and MS-COCO datasets. Our approach outperforms single-model based methods and performs on par with multi-model based methods at a fraction of the computing cost.}, bibtype = {article}, author = {Choi, Jiwoong and Elezi, Ismail and Lee, Hyuk-Jae and Farabet, Clement and Alvarez, Jose M.} }
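Note: to make the uncertainty aggregation described above concrete, here is a hedged Python sketch that decomposes a mixture-density output into aleatoric and epistemic terms via the law of total variance and sums them into an informativeness score; the aggregation and all names are illustrative assumptions, not necessarily the paper's exact scoring function.
# Hedged sketch: score an image by summing aleatoric (expected component variance) and
# epistemic (variance of component means) uncertainty over all mixture-density outputs.
import numpy as np

def mdn_uncertainties(pi, mu, var):
    """pi, mu, var: (K,) mixture weights, means, and variances of one regression output."""
    mean = np.sum(pi * mu)
    aleatoric = np.sum(pi * var)                       # E[Var]
    epistemic = np.sum(pi * (mu - mean) ** 2)          # Var[E]
    return aleatoric, epistemic

def image_score(per_output_params):
    """Sum both uncertainty terms over all localization/classification outputs of one image."""
    return sum(sum(mdn_uncertainties(pi, mu, var)) for pi, mu, var in per_output_params)

pi = np.array([0.5, 0.3, 0.2])
mu = np.array([0.1, 0.4, -0.2])
var = np.array([0.05, 0.10, 0.02])
print(image_score([(pi, mu, var)]))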
@article{ title = {Common Objects in 3D: Large-Scale Learning and Evaluation of Real-life 3D Category Reconstruction}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2109.00512}, id = {9ac6fc54-d9a0-35cd-8d4f-efef3e6919ca}, created = {2021-10-13T14:40:11.765Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:53:54.645Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Traditional approaches for learning 3D object categories have been predominantly trained and evaluated on synthetic datasets due to the unavailability of real 3D-annotated category-centric data. Our main goal is to facilitate advances in this field by collecting real-world data in a magnitude similar to the existing synthetic counterparts. The principal contribution of this work is thus a large-scale dataset, called Common Objects in 3D, with real multi-view images of object categories annotated with camera poses and ground truth 3D point clouds. The dataset contains a total of 1.5 million frames from nearly 19,000 videos capturing objects from 50 MS-COCO categories and, as such, it is significantly larger than alternatives both in terms of the number of categories and objects. We exploit this new dataset to conduct one of the first large-scale "in-the-wild" evaluations of several new-view-synthesis and category-centric 3D reconstruction methods. Finally, we contribute NerFormer - a novel neural rendering method that leverages the powerful Transformer to reconstruct an object given a small number of its views. The CO3D dataset is available at https://github.com/facebookresearch/co3d .}, bibtype = {article}, author = {Reizenstein, Jeremy and Shapovalov, Roman and Henzler, Philipp and Sbordone, Luca and Labatut, Patrick and Novotny, David} }
@article{ title = {Exploiting a Joint Embedding Space for Generalized Zero-Shot Semantic Segmentation}, type = {article}, year = {2021}, pages = {9536-9545}, websites = {http://arxiv.org/abs/2108.06536}, id = {e8d65bfb-9709-374c-9143-3290046630f9}, created = {2021-10-13T14:40:11.833Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:21.237Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {We address the problem of generalized zero-shot semantic segmentation (GZS3) predicting pixel-wise semantic labels for seen and unseen classes. Most GZS3 methods adopt a generative approach that synthesizes visual features of unseen classes from corresponding semantic ones (e.g., word2vec) to train novel classifiers for both seen and unseen classes. Although generative methods show decent performance, they have two limitations: (1) the visual features are biased towards seen classes; (2) the classifier should be retrained whenever novel unseen classes appear. We propose a discriminative approach to address these limitations in a unified framework. To this end, we leverage visual and semantic encoders to learn a joint embedding space, where the semantic encoder transforms semantic features to semantic prototypes that act as centers for visual features of corresponding classes. Specifically, we introduce boundary-aware regression (BAR) and semantic consistency (SC) losses to learn discriminative features. Our approach to exploiting the joint embedding space, together with BAR and SC terms, alleviates the seen bias problem. At test time, we avoid the retraining process by exploiting semantic prototypes as a nearest-neighbor (NN) classifier. To further alleviate the bias problem, we also propose an inference technique, dubbed Apollonius calibration (AC), that modulates the decision boundary of the NN classifier to the Apollonius circle adaptively. Experimental results demonstrate the effectiveness of our framework, achieving a new state of the art on standard benchmarks.}, bibtype = {article}, author = {Baek, Donghyeon and Oh, Youngmin and Ham, Bumsub} }
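Note: as a small illustration of the inference step described in the entry above (classifying pixels by their nearest semantic prototype in the joint embedding space), the sketch below uses a generic nearest-neighbor assignment; prototype construction and the Apollonius calibration are omitted, and all shapes are assumptions.
# Sketch: assign each pixel embedding to the class of its nearest semantic prototype.
import numpy as np

def nearest_prototype_labels(pixel_feats, prototypes):
    """pixel_feats: (P, D) visual features; prototypes: (C, D) class prototypes."""
    d2 = np.sum((pixel_feats[:, None, :] - prototypes[None, :, :]) ** 2, axis=-1)  # (P, C)
    return np.argmin(d2, axis=1)                                                   # class index per pixel

feats = np.random.rand(4096, 128)        # flattened H*W pixel embeddings (assumed shape)
protos = np.random.rand(21, 128)         # seen + unseen class prototypes from the semantic encoder
labels = nearest_prototype_labels(feats, protos)
print(labels.shape)                      # (4096,)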
@article{ title = {Adaptive Adversarial Network for Source-free Domain Adaptation}, type = {article}, year = {2021}, pages = {9010-9019}, id = {8dbe0c8b-629f-3d1c-bc7c-a408661ad59d}, created = {2021-10-13T14:40:11.882Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:21.423Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Iccv, Anonymous and Id, Paper} }
@article{ title = {Simpler is Better: Few-shot Semantic Segmentation with Classifier Weight Transformer}, type = {article}, year = {2021}, pages = {8741-8750}, websites = {http://arxiv.org/abs/2108.03032}, id = {2e54dbd3-7d01-33e9-a1fd-03be62a55fc5}, created = {2021-10-13T14:40:11.900Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:21.761Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {A few-shot semantic segmentation model is typically composed of a CNN encoder, a CNN decoder and a simple classifier (separating foreground and background pixels). Most existing methods meta-learn all three model components for fast adaptation to a new class. However, given that as few as a single support set image is available, effective model adaptation of all three components to the new class is extremely challenging. In this work we propose to simplify the meta-learning task by focusing solely on the simplest component, the classifier, whilst leaving the encoder and decoder to pre-training. We hypothesize that if we pre-train an off-the-shelf segmentation model over a set of diverse training classes with sufficient annotations, the encoder and decoder can capture rich discriminative features applicable for any unseen classes, rendering the subsequent meta-learning stage unnecessary. For the classifier meta-learning, we introduce a Classifier Weight Transformer (CWT) designed to dynamically adapt the support-set trained classifier's weights to each query image in an inductive way. Extensive experiments on two standard benchmarks show that despite its simplicity, our method outperforms the state-of-the-art alternatives, often by a large margin. Code is available on https://github.com/zhiheLu/CWT-for-FSS.}, bibtype = {article}, author = {Lu, Zhihe and He, Sen and Zhu, Xiatian and Zhang, Li and Song, Yi-Zhe and Xiang, Tao} }
@article{ title = {DeFRCN: Decoupled Faster R-CNN for Few-Shot Object Detection}, type = {article}, year = {2021}, pages = {8681-8690}, websites = {http://arxiv.org/abs/2108.09017}, id = {bcce9b6a-8b07-3de4-91d8-5ed66baee5af}, created = {2021-10-13T14:40:11.995Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:22.078Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Few-shot object detection, which aims at detecting novel objects rapidly from extremely few annotated examples of previously unseen classes, has attracted significant research interest in the community. Most existing approaches employ the Faster R-CNN as basic detection framework, yet, due to the lack of tailored considerations for data-scarce scenario, their performance is often not satisfactory. In this paper, we look closely into the conventional Faster R-CNN and analyze its contradictions from two orthogonal perspectives, namely multi-stage (RPN vs. RCNN) and multi-task (classification vs. localization). To resolve these issues, we propose a simple yet effective architecture, named Decoupled Faster R-CNN (DeFRCN). To be concrete, we extend Faster R-CNN by introducing Gradient Decoupled Layer for multi-stage decoupling and Prototypical Calibration Block for multi-task decoupling. The former is a novel deep layer with redefining the feature-forward operation and gradient-backward operation for decoupling its subsequent layer and preceding layer, and the latter is an offline prototype-based classification model with taking the proposals from detector as input and boosting the original classification scores with additional pairwise scores for calibration. Extensive experiments on multiple benchmarks show our framework is remarkably superior to other existing approaches and establishes a new state-of-the-art in few-shot literature.}, bibtype = {article}, author = {Qiao, Limeng and Zhao, Yuxuan and Li, Zhiyuan and Qiu, Xi and Wu, Jianan and Zhang, Chi} }
@article{ title = {Fooling LiDAR Perception via Adversarial Trajectory Perturbation}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2103.15326}, id = {bde982d3-c47c-3f7d-80ed-e3e77a738e95}, created = {2021-10-13T14:40:12.005Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:22.396Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {LiDAR point clouds collected from a moving vehicle are functions of its trajectories, because the sensor motion needs to be compensated to avoid distortions. When autonomous vehicles are sending LiDAR point clouds to deep networks for perception and planning, could the motion compensation consequently become a wide-open backdoor in those networks, due to both the adversarial vulnerability of deep learning and GPS-based vehicle trajectory estimation that is susceptible to wireless spoofing? We demonstrate such possibilities for the first time: instead of directly attacking point cloud coordinates which requires tampering with the raw LiDAR readings, only adversarial spoofing of a self-driving car's trajectory with small perturbations is enough to make safety-critical objects undetectable or detected with incorrect positions. Moreover, polynomial trajectory perturbation is developed to achieve a temporally-smooth and highly-imperceptible attack. Extensive experiments on 3D object detection have shown that such attacks not only lower the performance of the state-of-the-art detectors effectively, but also transfer to other detectors, raising a red flag for the community. The code is available on https://ai4ce.github.io/FLAT/.}, bibtype = {article}, author = {Li, Yiming and Wen, Congcong and Juefei-Xu, Felix and Feng, Chen} }
@article{ title = {Differentiable Convolution Search for Point Cloud Processing}, type = {article}, year = {2021}, pages = {7437-7446}, websites = {http://arxiv.org/abs/2108.12856}, id = {93444432-2744-30a2-aa10-b6f2000db176}, created = {2021-10-13T14:40:12.027Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-21T12:52:20.641Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Exploiting convolutional neural networks for point cloud processing is quite challenging, due to the inherent irregular distribution and discrete shape representation of point clouds. To address these problems, many handcrafted convolution variants have sprung up in recent years. Though with elaborate design, these variants could be far from optimal in sufficiently capturing diverse shapes formed by discrete points. In this paper, we propose PointSeaConv, i.e., a novel differential convolution search paradigm on point clouds. It can work in a purely data-driven manner and thus is capable of auto-creating a group of suitable convolutions for geometric shape modeling. We also propose a joint optimization framework for simultaneous search of internal convolution and external architecture, and introduce epsilon-greedy algorithm to alleviate the effect of discretization error. As a result, PointSeaNet, a deep network that is sufficient to capture geometric shapes at both convolution level and architecture level, can be searched out for point cloud processing. Extensive experiments strongly evidence that our proposed PointSeaNet surpasses current handcrafted deep models on challenging benchmarks across multiple tasks with remarkable margins.}, bibtype = {article}, author = {Nie, Xing and Liu, Yongcheng and Chen, Shaohong and Chang, Jianlong and Huo, Chunlei and Meng, Gaofeng and Tian, Qi and Hu, Weiming and Pan, Chunhong} }
@article{ title = {PR-GCN: A Deep Graph Convolutional Network with Point Refinement for 6D Pose Estimation}, type = {article}, year = {2021}, pages = {2793-2802}, websites = {http://arxiv.org/abs/2108.09916}, id = {a9715c52-ae0a-3a9e-9962-8976d6b1a64e}, created = {2021-10-13T14:40:12.081Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:28:05.454Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352,be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {RGB-D based 6D pose estimation has recently achieved remarkable progress, but still suffers from two major limitations: (1) ineffective representation of depth data and (2) insufficient integration of different modalities. This paper proposes a novel deep learning approach, namely Graph Convolutional Network with Point Refinement (PR-GCN), to simultaneously address the issues above in a unified way. It first introduces the Point Refinement Network (PRN) to polish 3D point clouds, recovering missing parts with noise removed. Subsequently, the Multi-Modal Fusion Graph Convolutional Network (MMF-GCN) is presented to strengthen RGB-D combination, which captures geometry-aware inter-modality correlation through local information propagation in the graph convolutional network. Extensive experiments are conducted on three widely used benchmarks, and state-of-the-art performance is reached. Besides, it is also shown that the proposed PRN and MMF-GCN modules are well generalized to other frameworks.}, bibtype = {article}, author = {Zhou, Guangyuan and Wang, Huiqun and Chen, Jiaxin and Huang, Di} }
@article{ title = {Weakly Supervised 3D Semantic Segmentation Using Cross-Image Consensus and Inter-Voxel Affinity Relations}, type = {article}, year = {2021}, pages = {2834-2844}, id = {3eaf10ef-e4f7-31b7-a88c-7e15c1fe7e14}, created = {2021-10-13T14:40:12.132Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:28.126Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Cvpr, Anonymous and Id, Paper} }
@article{ title = {SGPA: Structure-Guided Prior Adaptation for Category-Level 6D Object Pose Estimation}, type = {article}, year = {2021}, pages = {2773-2782}, id = {229a2413-e909-3abd-bdf0-4f70debed722}, created = {2021-10-13T14:40:12.150Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:28.585Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Iccv, Iccv} }
@article{ title = {HPNet: Deep Primitive Segmentation Using Hybrid Representations}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2105.10620}, id = {9ba1f30f-09a1-3231-867f-fecdd4f639cd}, created = {2021-10-13T14:40:12.169Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-21T12:52:20.633Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {This paper introduces HPNet, a novel deep-learning approach for segmenting a 3D shape represented as a point cloud into primitive patches. The key to deep primitive segmentation is learning a feature representation that can separate points of different primitives. Unlike utilizing a single feature representation, HPNet leverages hybrid representations that combine one learned semantic descriptor, two spectral descriptors derived from predicted geometric parameters, as well as an adjacency matrix that encodes sharp edges. Moreover, instead of merely concatenating the descriptors, HPNet optimally combines hybrid representations by learning combination weights. This weighting module builds on the entropy of input features. The output primitive segmentation is obtained from a mean-shift clustering module. Experimental results on benchmark datasets ANSI and ABCParts show that HPNet leads to significant performance gains from baseline approaches.}, bibtype = {article}, author = {Yan, Siming and Yang, Zhenpei and Ma, Chongyang and Huang, Haibin and Vouga, Etienne and Huang, Qixing} }
@article{ title = {Instance Segmentation in 3D Scenes using Semantic Superpoint Tree Networks}, type = {article}, year = {2021}, pages = {2783-2792}, websites = {http://arxiv.org/abs/2108.07478}, id = {c86ca77f-04c3-3098-9aa5-8b08bd73bd51}, created = {2021-10-13T14:40:12.187Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:28.430Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Instance segmentation in 3D scenes is fundamental in many applications of scene understanding. It is yet challenging due to the compound factors of data irregularity and uncertainty in the numbers of instances. State-of-the-art methods largely rely on a general pipeline that first learns point-wise features discriminative at semantic and instance levels, followed by a separate step of point grouping for proposing object instances. While promising, they have the shortcomings that (1) the second step is not supervised by the main objective of instance segmentation, and (2) their point-wise feature learning and grouping are less effective to deal with data irregularities, possibly resulting in fragmented segmentations. To address these issues, we propose in this work an end-to-end solution of Semantic Superpoint Tree Network (SSTNet) for proposing object instances from scene points. Key in SSTNet is an intermediate, semantic superpoint tree (SST), which is constructed based on the learned semantic features of superpoints, and which will be traversed and split at intermediate tree nodes for proposals of object instances. We also design in SSTNet a refinement module, termed CliqueNet, to prune superpoints that may be wrongly grouped into instance proposals. Experiments on the benchmarks of ScanNet and S3DIS show the efficacy of our proposed method. At the time of submission, SSTNet ranks top on the ScanNet (V2) leaderboard, with 2% higher of mAP than the second best method. The source code in PyTorch is available at https://github.com/Gorilla-Lab-SCUT/SSTNet.}, bibtype = {article}, author = {Liang, Zhihao and Li, Zhihao and Xu, Songcen and Tan, Mingkui and Jia, Kui} }
@article{ title = {Learning Multi-Scene Absolute Pose Regression with Transformers}, type = {article}, year = {2021}, pages = {4-7}, websites = {http://arxiv.org/abs/2103.11468}, id = {e6bf5004-3733-3d39-87c2-3ca0984a2cb1}, created = {2021-10-13T14:40:12.228Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:29.190Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Absolute camera pose regressors estimate the position and orientation of a camera from the captured image alone. Typically, a convolutional backbone with a multi-layer perceptron head is trained with images and pose labels to embed a single reference scene at a time. Recently, this scheme was extended for learning multiple scenes by replacing the MLP head with a set of fully connected layers. In this work, we propose to learn multi-scene absolute camera pose regression with Transformers, where encoders are used to aggregate activation maps with self-attention and decoders transform latent features and scenes encoding into candidate pose predictions. This mechanism allows our model to focus on general features that are informative for localization while embedding multiple scenes in parallel. We evaluate our method on commonly benchmarked indoor and outdoor datasets and show that it surpasses both multi-scene and state-of-the-art single-scene absolute pose regressors. We make our code publicly available from https://github.com/yolish/multi-scene-pose-transformer.}, bibtype = {article}, author = {Shavit, Yoli and Ferens, Ron and Keller, Yosi} }
@article{ title = {Improving 3D Object Detection with Channel-wise Transformer}, type = {article}, year = {2021}, pages = {2743-2752}, websites = {http://arxiv.org/abs/2108.10723}, id = {61263b20-597e-3e81-a929-90df13b64c30}, created = {2021-10-13T14:40:12.254Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:43:23.125Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0}, private_publication = {false}, abstract = {Though 3D object detection from point clouds has achieved rapid progress in recent years, the lack of flexible and high-performance proposal refinement remains a great hurdle for existing state-of-the-art two-stage detectors. Previous works on refining 3D proposals have relied on human-designed components such as keypoints sampling, set abstraction and multi-scale feature fusion to produce powerful 3D object representations. Such methods, however, have limited ability to capture rich contextual dependencies among points. In this paper, we leverage the high-quality region proposal network and a Channel-wise Transformer architecture to constitute our two-stage 3D object detection framework (CT3D) with minimal hand-crafted design. The proposed CT3D simultaneously performs proposal-aware embedding and channel-wise context aggregation for the point features within each proposal. Specifically, CT3D uses the proposal's keypoints for spatial contextual modelling and learns attention propagation in the encoding module, mapping the proposal to point embeddings. Next, a new channel-wise decoding module enriches the query-key interaction via channel-wise re-weighting to effectively merge multi-level contexts, which contributes to more accurate object predictions. Extensive experiments demonstrate that our CT3D method has superior performance and excellent scalability. Remarkably, CT3D achieves an AP of 81.77% in the moderate car category on the KITTI test 3D detection benchmark, outperforming state-of-the-art 3D detectors.}, bibtype = {article}, author = {Sheng, Hualian and Cai, Sijia and Liu, Yuan and Deng, Bing and Huang, Jianqiang and Hua, Xian-Sheng and Zhao, Min-Jian} }
@article{ title = {SAT: 2D Semantics Assisted Training for 3D Visual Grounding}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2105.11450}, id = {96f6b47b-b381-334f-b899-cbf0bdc563f9}, created = {2021-10-13T14:40:12.268Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:29.593Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {3D visual grounding aims at grounding a natural language description about a 3D scene, usually represented in the form of 3D point clouds, to the targeted object region. Point clouds are sparse, noisy, and contain limited semantic information compared with 2D images. These inherent limitations make the 3D visual grounding problem more challenging. In this study, we propose 2D Semantics Assisted Training (SAT) that utilizes 2D image semantics in the training stage to ease point-cloud-language joint representation learning and assist 3D visual grounding. The main idea is to learn auxiliary alignments between rich, clean 2D object representations and the corresponding objects or mentioned entities in 3D scenes. SAT takes 2D object semantics, i.e., object label, image feature, and 2D geometric feature, as the extra input in training but does not require such inputs during inference. By effectively utilizing 2D semantics in training, our approach boosts the accuracy on the Nr3D dataset from 37.7% to 49.2%, which significantly surpasses the non-SAT baseline with the identical network architecture and inference input. Our approach outperforms the state of the art by large margins on multiple 3D visual grounding datasets, i.e., +10.4% absolute accuracy on Nr3D, +9.9% on Sr3D, and +5.6% on ScanRef.}, bibtype = {article}, author = {Yang, Zhengyuan and Zhang, Songyang and Wang, Liwei and Luo, Jiebo} }
@article{ title = {GraphFPN: Graph Feature Pyramid Network for Object Detection}, type = {article}, year = {2021}, pages = {2763-2772}, websites = {http://arxiv.org/abs/2108.00580}, id = {02650f40-5e6a-3fe8-95ed-c4036be55dc0}, created = {2021-10-13T14:40:12.300Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-21T12:52:20.634Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Feature pyramids have been proven powerful in image understanding tasks that require multi-scale features. State-of-the-art methods for multi-scale feature learning focus on performing feature interactions across space and scales using neural networks with a fixed topology. In this paper, we propose graph feature pyramid networks that are capable of adapting their topological structures to varying intrinsic image structures and supporting simultaneous feature interactions across all scales. We first define an image-specific superpixel hierarchy for each input image to represent its intrinsic image structures. The graph feature pyramid network inherits its structure from this superpixel hierarchy. Contextual and hierarchical layers are designed to achieve feature interactions within the same scale and across different scales. To make these layers more powerful, we introduce two types of local channel attention for graph neural networks by generalizing global channel attention for convolutional neural networks. The proposed graph feature pyramid network can enhance the multiscale features from a convolutional feature pyramid network. We evaluate our graph feature pyramid network in the object detection task by integrating it into the Faster R-CNN algorithm. The modified algorithm outperforms not only previous state-of-the-art feature pyramid-based methods with a clear margin but also other popular detection methods on both MS-COCO 2017 validation and test datasets.}, bibtype = {article}, author = {Zhao, Gangming and Ge, Weifeng and Yu, Yizhou} }
@article{ title = {Pyramid R-CNN: Towards Better Performance and Adaptability for 3D Object Detection}, type = {article}, year = {2021}, pages = {2723-2732}, websites = {http://arxiv.org/abs/2109.02499}, id = {78be5e7d-939c-3e1d-b52f-8dcc1101b692}, created = {2021-10-13T14:40:12.310Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:29.362Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {We present a flexible and high-performance framework, named Pyramid R-CNN, for two-stage 3D object detection from point clouds. Current approaches generally rely on the points or voxels of interest for RoI feature extraction on the second stage, but cannot effectively handle the sparsity and non-uniform distribution of those points, and this may result in failures in detecting objects that are far away. To resolve the problems, we propose a novel second-stage module, named pyramid RoI head, to adaptively learn the features from the sparse points of interest. The pyramid RoI head consists of three key components. Firstly, we propose the RoI-grid Pyramid, which mitigates the sparsity problem by extensively collecting points of interest for each RoI in a pyramid manner. Secondly, we propose RoI-grid Attention, a new operation that can encode richer information from sparse points by incorporating conventional attention-based and graph-based point operators into a unified formulation. Thirdly, we propose the Density-Aware Radius Prediction (DARP) module, which can adapt to different point density levels by dynamically adjusting the focusing range of RoIs. Combining the three components, our pyramid RoI head is robust to the sparse and imbalanced circumstances, and can be applied upon various 3D backbones to consistently boost the detection performance. Extensive experiments show that Pyramid R-CNN outperforms the state-of-the-art 3D detection models by a large margin on both the KITTI dataset and the Waymo Open dataset.}, bibtype = {article}, author = {Mao, Jiageng and Niu, Minzhe and Bai, Haoyue and Liang, Xiaodan and Xu, Hang and Xu, Chunjing} }
@article{ title = {Graph Constrained Data Representation Learning for Human Motion Segmentation}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2107.13362}, id = {370c78a5-e88b-340b-af27-c92fe7dbe51d}, created = {2021-10-13T14:40:12.326Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:29.937Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Recently, transfer subspace learning based approaches have shown to be a valid alternative to unsupervised subspace clustering and temporal data clustering for human motion segmentation (HMS). These approaches leverage prior knowledge from a source domain to improve clustering performance on a target domain, and currently they represent the state of the art in HMS. Bucking this trend, in this paper, we propose a novel unsupervised model that learns a representation of the data and digs clustering information from the data itself. Our model is reminiscent of temporal subspace clustering, but presents two critical differences. First, we learn an auxiliary data matrix that can deviate from the initial data, hence confer more degrees of freedom to the coding matrix. Second, we introduce a regularization term for this auxiliary data matrix that preserves the local geometrical structure present in the high-dimensional space. The proposed model is efficiently optimized by using an original Alternating Direction Method of Multipliers (ADMM) formulation allowing to learn jointly the auxiliary data representation, a nonnegative dictionary and a coding matrix. Experimental results on four benchmark datasets for HMS demonstrate that our approach achieves significantly better clustering performance then state-of-the-art methods, including both unsupervised and more recent semi-supervised transfer learning approaches.}, bibtype = {article}, author = {Dimiccoli, Mariella and Garrido, Lluís and Rodriguez-Corominas, Guillem and Wendt, Herwig} }
@article{ title = {Deep 3D Mask Volume for View Synthesis of Dynamic Scenes}, type = {article}, year = {2021}, pages = {1749-1758}, websites = {http://cseweb.ucsd.edu/}, id = {d2183210-bb06-3850-ac1e-4d65fcc8e9a2}, created = {2021-10-13T14:40:12.354Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:29.765Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Image view synthesis has seen great success in reconstructing photorealistic visuals, thanks to deep learning and various novel representations. The next key step in immersive virtual experiences is view synthesis of dynamic scenes. However, several challenges exist due to the lack of high-quality training datasets, and the additional time dimension for videos of dynamic scenes. To address this issue, we introduce a multi-view video dataset, captured with a custom 10-camera rig in 120FPS. The dataset contains 96 high-quality scenes showing various visual effects and human interactions in outdoor scenes. We develop a new algorithm, Deep 3D Mask Volume, which enables temporally-stable view extrapolation from binocular videos of dynamic scenes, captured by static cameras. Our algorithm addresses the temporal inconsistency of disocclusions by identifying the error-prone areas with a 3D mask volume, and replaces them with static background observed throughout the video. Our method enables manipulation in 3D space as opposed to simple 2D masks. We demonstrate better temporal stability than frame-by-frame static view synthesis methods, or those that use 2D masks. The resulting view synthesis videos show minimal flickering artifacts and allow for larger translational movements.}, bibtype = {article}, author = {Lin, Kai-En and Xiao, Lei and Liu, Feng and Yang, Guowei and Ramamoorthi, Ravi}, journal = {ICCV} }
@article{ title = {Learning Canonical 3D Object Representation for Fine-Grained Recognition}, type = {article}, year = {2021}, pages = {1035-1045}, websites = {http://arxiv.org/abs/2108.04628}, id = {393d4a0b-b7d8-3856-b0c7-a110ccbe53c2}, created = {2021-10-13T14:40:12.460Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:30.413Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {We propose a novel framework for fine-grained object recognition that learns to recover object variation in 3D space from a single image, trained on an image collection without using any ground-truth 3D annotation. We accomplish this by representing an object as a composition of 3D shape and its appearance, while eliminating the effect of camera viewpoint, in a canonical configuration. Unlike conventional methods modeling spatial variation in 2D images only, our method is capable of reconfiguring the appearance feature in a canonical 3D space, thus enabling the subsequent object classifier to be invariant under 3D geometric variation. Our representation also allows us to go beyond existing methods, by incorporating 3D shape variation as an additional cue for object recognition. To learn the model without ground-truth 3D annotation, we deploy a differentiable renderer in an analysis-by-synthesis framework. By incorporating 3D shape and appearance jointly in a deep representation, our method learns the discriminative representation of the object and achieves competitive performance on fine-grained image recognition and vehicle re-identification. We also demonstrate that the performance of 3D shape reconstruction is improved by learning fine-grained shape deformation in a boosting manner.}, bibtype = {article}, author = {Joung, Sunghun and Kim, Seungryong and Kim, Minsu and Kim, Ig-Jae and Sohn, Kwanghoon}, number = {c} }
@article{ title = {PICCOLO: Point Cloud-Centric Omnidirectional Localization}, type = {article}, year = {2021}, pages = {3313-3323}, websites = {http://arxiv.org/abs/2108.06545}, id = {12c4d492-a378-35f2-992f-129a41704319}, created = {2021-10-13T14:40:12.556Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:26.341Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {We present PICCOLO, a simple and efficient algorithm for omnidirectional localization. Given a colored point cloud and a 360 panorama image of a scene, our objective is to recover the camera pose at which the panorama image is taken. Our pipeline works in an off-the-shelf manner with a single image given as a query and does not require any training of neural networks or collecting ground-truth poses of images. Instead, we match each point cloud color to the holistic view of the panorama image with gradient-descent optimization to find the camera pose. Our loss function, called sampling loss, is point cloud-centric, evaluated at the projected location of every point in the point cloud. In contrast, conventional photometric loss is image-centric, comparing colors at each pixel location. With a simple change in the compared entities, sampling loss effectively overcomes the severe visual distortion of omnidirectional images, and enjoys the global context of the 360 view to handle challenging scenarios for visual localization. PICCOLO outperforms existing omnidirectional localization algorithms in both accuracy and stability when evaluated in various environments.}, bibtype = {article}, author = {Kim, Junho and Choi, Changwoon and Jang, Hojun and Kim, Young Min} }
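A minimal sketch of the point-cloud-centric "sampling loss" described in the abstract above: every colored point is projected into the equirectangular panorama under a candidate pose, the image is sampled at those locations, and the color difference is penalised. The equirectangular convention, function names, and the outer gradient-descent loop over the pose are assumptions for illustration, not the authors' code.

import math
import torch
import torch.nn.functional as F

def sampling_loss(points, colors, pano, R, t):
    """points: (N, 3) world coordinates; colors: (N, 3) point colors in [0, 1];
    pano: (3, H, W) equirectangular panorama; R: (3, 3), t: (3,) candidate pose.
    Returns the mean color error evaluated at the projection of every point."""
    p_cam = (R @ points.T).T + t                                  # world -> camera
    x, y, z = p_cam[:, 0], p_cam[:, 1], p_cam[:, 2]
    lon = torch.atan2(x, z)                                       # [-pi, pi]
    lat = torch.asin(y / p_cam.norm(dim=1).clamp(min=1e-8))       # [-pi/2, pi/2]
    # normalise to [-1, 1] for grid_sample (u along width, v along height);
    # the exact panorama convention is an assumption here
    grid = torch.stack([lon / math.pi, lat / (math.pi / 2)], dim=-1).view(1, 1, -1, 2)
    sampled = F.grid_sample(pano.unsqueeze(0), grid, align_corners=True)  # (1, 3, 1, N)
    sampled = sampled.squeeze(0).squeeze(1).T                     # (N, 3)
    return F.mse_loss(sampled, colors)

# A pose search of the kind the abstract describes would wrap this loss,
# e.g. optimising a rotation parameterisation and translation with torch.optim.Adam.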
@article{ title = {Exploring Geometry-aware Contrast and Clustering Harmonization for Self-supervised 3D Object Detection}, type = {article}, year = {2021}, pages = {3293-3302}, id = {bd253c61-4863-3a96-a097-ef8ca287a263}, created = {2021-10-13T14:40:12.588Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:26.674Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Current 3D object detection paradigms highly rely on extensive annotation efforts, which makes them not practical in many real-world industrial applications. Inspired by that a human driver can keep accumulating experiences from self-exploring the roads without any tutor's guidance, we first step forwards to explore a simple yet effective self-supervised learning framework tailored for LiDAR-based 3D object detection. Although the self-supervised pipeline has achieved great success in 2D domain, the characteristic challenges (e.g., complex geometry structure and various 3D object views) encountered in the 3D domain hinder the direct adoption of existing techniques that often contrast the 2D augmented data or cluster single-view features. Here we present a novel self-supervised 3D Object detection framework that seamlessly integrates the geometry-aware contrast and clustering harmonization to lift the unsupervised 3D representation learning, named GCC-3D. First, GCC-3D introduces a Geometric-Aware Contrastive objective to learn spatial-sensitive local structure representation. This objective enforces the spatially close voxels to have high feature similarity. Second, a Pseudo-Instance Clustering harmonization mechanism is proposed to encourage that different views of pseudo-instances should have consistent similarities to clustering prototype centers. This module endows our model semantic discriminative capacity. Extensive experiments demonstrate our GCC-3D achieves significant performance improvement on data-efficient 3D object detection benchmarks (nuScenes and Waymo). Moreover, our GCC-3D framework can achieve state-of-the-art performance on all popular 3D object detection benchmarks.}, bibtype = {article}, author = {Anonymous} }
@article{ title = {Long-Term Temporally Consistent Unpaired Video Translation from Simulated Surgical 3D Data}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2103.17204}, id = {cc736110-7878-36e1-9c8e-ecfa98d2aa38}, created = {2021-10-13T14:40:12.590Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:26.164Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Research in unpaired video translation has mainly focused on short-term temporal consistency by conditioning on neighboring frames. However for transfer from simulated to photorealistic sequences, available information on the underlying geometry offers potential for achieving global consistency across views. We propose a novel approach which combines unpaired image translation with neural rendering to transfer simulated to photorealistic surgical abdominal scenes. By introducing global learnable textures and a lighting-invariant view-consistency loss, our method produces consistent translations of arbitrary views and thus enables long-term consistent video synthesis. We design and test our model to generate video sequences from minimally-invasive surgical abdominal scenes. Because labeled data is often limited in this domain, photorealistic data where ground truth information from the simulated domain is preserved is especially relevant. By extending existing image-based methods to view-consistent videos, we aim to impact the applicability of simulated training and evaluation environments for surgical applications. Code and data will be made publicly available soon.}, bibtype = {article}, author = {Rivoir, Dominik and Pfeiffer, Micha and Docea, Reuben and Kolbinger, Fiona and Riediger, Carina and Weitz, Jürgen and Speidel, Stefanie} }
@article{ title = {RePOSE: Fast 6D Object Pose Refinement via Deep Texture Rendering}, type = {article}, year = {2021}, pages = {3303-3312}, websites = {http://arxiv.org/abs/2104.00633}, id = {7b0a1c13-c317-3807-9f21-a1c1585c625a}, created = {2021-10-13T14:40:12.643Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:26.517Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {We present RePOSE, a fast iterative refinement method for 6D object pose estimation. Prior methods perform refinement by feeding zoomed-in input and rendered RGB images into a CNN and directly regressing an update of a refined pose. Their runtime is slow due to the computational cost of CNN, which is especially prominent in multiple-object pose refinement. To overcome this problem, RePOSE leverages image rendering for fast feature extraction using a 3D model with a learnable texture. We call this deep texture rendering, which uses a shallow multi-layer perceptron to directly regress a view-invariant image representation of an object. Furthermore, we utilize differentiable Levenberg-Marquardt (LM) optimization to refine a pose fast and accurately by minimizing the feature-metric error between the input and rendered image representations without the need of zooming in. These image representations are trained such that differentiable LM optimization converges within few iterations. Consequently, RePOSE runs at 92 FPS and achieves state-of-the-art accuracy of 51.6% on the Occlusion LineMOD dataset - a 4.1% absolute improvement over the prior art, and comparable result on the YCB-Video dataset with a much faster runtime. The code is available at https://github.com/sh8/repose.}, bibtype = {article}, author = {Iwase, Shun and Liu, Xingyu and Khirodkar, Rawal and Yokota, Rio and Kitani, Kris M.} }
@article{ title = {Voxel Transformer for 3D Object Detection}, type = {article}, year = {2021}, pages = {3164-3173}, websites = {http://arxiv.org/abs/2109.02497}, id = {c78ba4a1-2aed-3161-8a95-563af37e1f4f}, created = {2021-10-13T14:40:12.667Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:26.982Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {We present Voxel Transformer (VoTr), a novel and effective voxel-based Transformer backbone for 3D object detection from point clouds. Conventional 3D convolutional backbones in voxel-based 3D detectors cannot efficiently capture large context information, which is crucial for object recognition and localization, owing to the limited receptive fields. In this paper, we resolve the problem by introducing a Transformer-based architecture that enables long-range relationships between voxels by self-attention. Given the fact that non-empty voxels are naturally sparse but numerous, directly applying standard Transformer on voxels is non-trivial. To this end, we propose the sparse voxel module and the submanifold voxel module, which can operate on the empty and non-empty voxel positions effectively. To further enlarge the attention range while maintaining comparable computational overhead to the convolutional counterparts, we propose two attention mechanisms for multi-head attention in those two modules: Local Attention and Dilated Attention, and we further propose Fast Voxel Query to accelerate the querying process in multi-head attention. VoTr contains a series of sparse and submanifold voxel modules and can be applied in most voxel-based detectors. Our proposed VoTr shows consistent improvement over the convolutional baselines while maintaining computational efficiency on the KITTI dataset and the Waymo Open dataset.}, bibtype = {article}, author = {Mao, Jiageng and Xue, Yujing and Niu, Minzhe and Bai, Haoyue and Feng, Jiashi and Liang, Xiaodan and Xu, Hang and Xu, Chunjing} }
@article{ title = {Geometry Uncertainty Projection Network for Monocular 3D Object Detection}, type = {article}, year = {2021}, pages = {3111-3121}, websites = {http://arxiv.org/abs/2107.13774}, id = {b540c9cc-0c72-317d-ac8f-92bd8583ece1}, created = {2021-10-13T14:40:12.707Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:28:05.732Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352,be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Geometry Projection is a powerful depth estimation method in monocular 3D object detection. It estimates depth dependent on heights, which introduces mathematical priors into the deep model. But the projection process also introduces the error amplification problem, in which the error of the estimated height will be amplified and reflected greatly at the output depth. This property leads to uncontrollable depth inferences and also damages the training efficiency. In this paper, we propose a Geometry Uncertainty Projection Network (GUP Net) to tackle the error amplification problem at both inference and training stages. Specifically, a GUP module is proposed to obtain the geometry-guided uncertainty of the inferred depth, which not only provides highly reliable confidence for each depth but also benefits depth learning. Furthermore, at the training stage, we propose a Hierarchical Task Learning strategy to reduce the instability caused by error amplification. This learning algorithm monitors the learning situation of each task by a proposed indicator and adaptively assigns the proper loss weights for different tasks according to their pre-tasks situation. Based on that, each task starts learning only when its pre-tasks are learned well, which can significantly improve the stability and efficiency of the training process. Extensive experiments demonstrate the effectiveness of the proposed method. The overall model can infer more reliable object depth than existing methods and outperforms the state-of-the-art image-based monocular 3D detectors by 3.74% and 4.7% AP40 of the car and pedestrian categories on the KITTI benchmark.}, bibtype = {article}, author = {Lu, Yan and Ma, Xinzhu and Yang, Lei and Zhang, Tianzhu and Liu, Yating and Chu, Qi and Yan, Junjie and Ouyang, Wanli} }
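The error-amplification problem this abstract refers to follows from the standard pinhole projection between an object's physical height and its image height; as a rough sketch with generic symbols (not the paper's notation), where f is the focal length, H the 3D height, and h the projected image height:

\[
  z \;=\; \frac{f\,H}{h},
  \qquad
  \frac{\partial z}{\partial H} \;=\; \frac{f}{h},
  \qquad
  \sigma_z \;\approx\; \frac{f}{h}\,\sigma_H \;=\; z\,\frac{\sigma_H}{H},
\]

so a small error in the estimated height H is scaled by f/h, which is large for distant (small-in-image) objects; capturing that depth uncertainty is what the GUP module is described as doing.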
@article{ title = {Group-Free 3D Object Detection via Transformers}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2104.00678}, id = {07c4dd4d-e090-3886-884c-dcb94cfbae63}, created = {2021-10-13T14:40:12.714Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:27.687Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {865cda55-bfc4-4a99-88d6-7092e1cbba3b,be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Recently, directly detecting 3D objects from 3D point clouds has received increasing attention. To extract object representation from an irregular point cloud, existing methods usually take a point grouping step to assign the points to an object candidate so that a PointNet-like network could be used to derive object features from the grouped points. However, the inaccurate point assignments caused by the hand-crafted grouping scheme decrease the performance of 3D object detection. In this paper, we present a simple yet effective method for directly detecting 3D objects from the 3D point cloud. Instead of grouping local points to each object candidate, our method computes the feature of an object from all the points in the point cloud with the help of an attention mechanism in the Transformers (Vaswani et al., 2017), where the contribution of each point is automatically learned in the network training. With an improved attention stacking scheme, our method fuses object features in different stages and generates more accurate object detection results. With few bells and whistles, the proposed method achieves state-of-the-art 3D object detection performance on two widely used benchmarks, ScanNet V2 and SUN RGB-D. The code and models are publicly available at https://github.com/zeliu98/Group-Free-3D}, bibtype = {article}, author = {Liu, Ze and Zhang, Zheng and Cao, Yue and Hu, Han and Tong, Xin} }
@article{ title = {End-to-End Semi-Supervised Object Detection with Soft Teacher}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2106.09018}, id = {1d72ff15-5bc1-3f73-b62d-22a1137fc0c9}, created = {2021-10-13T14:40:12.773Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:27.476Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {This paper presents an end-to-end semi-supervised object detection approach, in contrast to previous more complex multi-stage methods. The end-to-end training gradually improves pseudo label qualities during the curriculum, and the more and more accurate pseudo labels in turn benefit object detection training. We also propose two simple yet effective techniques within this framework: a soft teacher mechanism where the classification loss of each unlabeled bounding box is weighed by the classification score produced by the teacher network; a box jittering approach to select reliable pseudo boxes for the learning of box regression. On the COCO benchmark, the proposed approach outperforms previous methods by a large margin under various labeling ratios, i.e. 1%, 5% and 10%. Moreover, our approach proves to perform also well when the amount of labeled data is relatively large. For example, it can improve a 40.9 mAP baseline detector trained using the full COCO training set by +3.6 mAP, reaching 44.5 mAP, by leveraging the 123K unlabeled images of COCO. On the state-of-the-art Swin Transformer based object detector (58.9 mAP on test-dev), it can still significantly improve the detection accuracy by +1.5 mAP, reaching 60.4 mAP, and improve the instance segmentation accuracy by +1.2 mAP, reaching 52.4 mAP. Further incorporating with the Object365 pre-trained model, the detection accuracy reaches 61.3 mAP and the instance segmentation accuracy reaches 53.0 mAP, pushing the new state-of-the-art.}, bibtype = {article}, author = {Xu, Mengde and Zhang, Zheng and Hu, Han and Wang, Jianfeng and Wang, Lijuan and Wei, Fangyun and Bai, Xiang and Liu, Zicheng} }
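A minimal sketch of the "soft teacher" weighting mentioned in the abstract above: the per-box classification loss on unlabeled data is weighted by a score produced by the teacher network. The function name, the use of hard pseudo labels, and the normalisation are illustrative assumptions, not the paper's exact formulation.

import torch
import torch.nn.functional as F

def soft_teacher_cls_loss(student_logits, pseudo_labels, teacher_scores):
    """student_logits: (N, C) box classification logits from the student;
    pseudo_labels: (N,) hard pseudo labels derived from the teacher;
    teacher_scores: (N,) teacher confidence used as a per-box weight."""
    per_box = F.cross_entropy(student_logits, pseudo_labels, reduction="none")  # (N,)
    weights = teacher_scores.detach()                  # teacher only supplies weights
    return (weights * per_box).sum() / weights.sum().clamp(min=1e-6)

loss = soft_teacher_cls_loss(torch.randn(16, 81),
                             torch.randint(0, 81, (16,)),
                             torch.rand(16))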
@article{ title = {An End-to-End Transformer Model for 3D Object Detection}, type = {article}, year = {2021}, pages = {2906-2917}, websites = {http://arxiv.org/abs/2109.08141}, id = {f4a99d77-13b6-38a7-988d-7ff0628a36d4}, created = {2021-10-13T14:40:12.784Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:27.985Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {We propose 3DETR, an end-to-end Transformer based object detection model for 3D point clouds. Compared to existing detection methods that employ a number of 3D-specific inductive biases, 3DETR requires minimal modifications to the vanilla Transformer block. Specifically, we find that a standard Transformer with non-parametric queries and Fourier positional embeddings is competitive with specialized architectures that employ libraries of 3D-specific operators with hand-tuned hyperparameters. Nevertheless, 3DETR is conceptually simple and easy to implement, enabling further improvements by incorporating 3D domain knowledge. Through extensive experiments, we show 3DETR outperforms the well-established and highly optimized VoteNet baselines on the challenging ScanNetV2 dataset by 9.5%. Furthermore, we show 3DETR is applicable to 3D tasks beyond detection, and can serve as a building block for future research.}, bibtype = {article}, author = {Misra, Ishan and Girdhar, Rohit and Joulin, Armand} }
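A minimal sketch of the Fourier positional embedding of 3D coordinates that the 3DETR abstract refers to: coordinates are projected by a fixed random frequency matrix and mapped through sin/cos. The dimensions, scale sigma, and class name are illustrative assumptions, not the released 3DETR code.

import math
import torch
import torch.nn as nn

class FourierPositionalEmbedding(nn.Module):
    """Map 3D coordinates to sin/cos features of fixed random projections."""
    def __init__(self, d_model: int = 256, sigma: float = 1.0):
        super().__init__()
        assert d_model % 2 == 0
        # fixed (non-learned) frequency matrix, registered so it moves with the module
        self.register_buffer("B", torch.randn(3, d_model // 2) * sigma)

    def forward(self, xyz: torch.Tensor) -> torch.Tensor:
        # xyz: (B, N, 3) query coordinates, e.g. sampled seed points
        proj = 2 * math.pi * xyz @ self.B                       # (B, N, d_model/2)
        return torch.cat([proj.sin(), proj.cos()], dim=-1)      # (B, N, d_model)

emb = FourierPositionalEmbedding(256)
print(emb(torch.rand(2, 128, 3)).shape)  # torch.Size([2, 128, 256])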
@article{ title = {MLVSNet: Multi-level Voting Siamese Network for 3D Visual Tracking}, type = {article}, year = {2021}, id = {445b4dd8-97db-341c-a89e-a9da0522cf06}, created = {2021-10-13T14:40:12.810Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:27.308Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Area, Search} }
@article{ title = {CvT: Introducing Convolutions to Vision Transformers}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2103.15808}, month = {3}, day = {29}, id = {2c082ca1-733a-303b-bc39-2e9f30ce0f0c}, created = {2021-10-14T06:53:42.895Z}, accessed = {2021-10-14}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:31.966Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {865cda55-bfc4-4a99-88d6-7092e1cbba3b,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {We present in this paper a new architecture, named Convolutional vision Transformer (CvT), that improves Vision Transformer (ViT) in performance and efficiency by introducing convolutions into ViT to yield the best of both designs. This is accomplished through two primary modifications: a hierarchy of Transformers containing a new convolutional token embedding, and a convolutional Transformer block leveraging a convolutional projection. These changes introduce desirable properties of convolutional neural networks (CNNs) to the ViT architecture (i.e., shift, scale, and distortion invariance) while maintaining the merits of Transformers (i.e., dynamic attention, global context, and better generalization). We validate CvT by conducting extensive experiments, showing that this approach achieves state-of-the-art performance over other Vision Transformers and ResNets on ImageNet-1k, with fewer parameters and lower FLOPs. In addition, performance gains are maintained when pretrained on larger datasets (e.g., ImageNet-22k) and fine-tuned to downstream tasks. Pre-trained on ImageNet-22k, our CvT-W24 obtains a top-1 accuracy of 87.7% on the ImageNet-1k val set. Finally, our results show that the positional encoding, a crucial component in existing Vision Transformers, can be safely removed in our model, simplifying the design for higher resolution vision tasks. Code will be released at https://github.com/leoxiaobin/CvT.}, bibtype = {article}, author = {Wu, Haiping and Xiao, Bin and Codella, Noel and Liu, Mengchen and Dai, Xiyang and Yuan, Lu and Zhang, Lei} }
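A minimal sketch of the "convolutional projection" idea from the CvT abstract: tokens are treated as a 2D map and depthwise convolutions (optionally strided for keys/values) produce the Q/K/V sequences fed to attention. Layer names, kernel sizes, and the stride are illustrative assumptions, not the official CvT code.

import torch
import torch.nn as nn

class ConvProjection(nn.Module):
    """Depthwise-conv Q/K/V projection over a token map (sketch only)."""
    def __init__(self, dim: int, kv_stride: int = 2):
        super().__init__()
        def dw(stride):  # depthwise 3x3 conv followed by batch norm
            return nn.Sequential(
                nn.Conv2d(dim, dim, 3, stride=stride, padding=1, groups=dim),
                nn.BatchNorm2d(dim),
            )
        self.q_proj, self.k_proj, self.v_proj = dw(1), dw(kv_stride), dw(kv_stride)

    def forward(self, x: torch.Tensor):
        # x: (B, dim, H, W) token map; returns flattened Q, K, V token sequences
        q = self.q_proj(x).flatten(2).transpose(1, 2)   # (B, H*W, dim)
        k = self.k_proj(x).flatten(2).transpose(1, 2)   # (B, H*W/stride^2, dim)
        v = self.v_proj(x).flatten(2).transpose(1, 2)
        return q, k, v

q, k, v = ConvProjection(64)(torch.randn(2, 64, 14, 14))
print(q.shape, k.shape)  # torch.Size([2, 196, 64]) torch.Size([2, 49, 64])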
@article{ title = {CrossViT: Cross-Attention Multi-Scale Vision Transformer for Image Classification}, type = {article}, year = {2021}, websites = {https://github.com/IBM/CrossViT.}, id = {85185768-acea-33f0-b3ae-f71eb62d4cf7}, created = {2021-10-14T06:57:45.679Z}, accessed = {2021-10-14}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-14T07:17:05.839Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {The recently developed vision transformer (ViT) has achieved promising results on image classification compared to convolutional neural networks. Inspired by this, in this paper, we study how to learn multi-scale feature representations in transformer models for image classification. To this end, we propose a dual-branch transformer to combine image patches (i.e., tokens in a transformer) of different sizes to produce stronger image features. Our approach processes small-patch and large-patch tokens with two separate branches of different computational complexity and these tokens are then fused purely by attention multiple times to complement each other. Furthermore, to reduce computation, we develop a simple yet effective token fusion module based on cross attention, which uses a single token for each branch as a query to exchange information with other branches. Our proposed cross-attention only requires linear time for both computational and memory complexity instead of quadratic time otherwise. Extensive experiments demonstrate that our approach performs better than or on par with several concurrent works on vision transformer, in addition to efficient CNN models. For example, on the ImageNet1K dataset, with some architectural changes, our approach outperforms the recent DeiT by a large margin of 2% with a small to moderate increase in FLOPs and model parameters. Our source codes and models are available at https://github.com/IBM/CrossViT.}, bibtype = {article}, author = {Chen, Chun-Fu and Fan, Quanfu and Panda, Rameswar} }
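A minimal sketch of the linear-time cross-attention fusion described in the CrossViT abstract: the CLS token of one branch is the only query attending over the other branch's patch tokens, so the cost grows linearly with the token count. For brevity this assumes both branches share one embedding dimension; names and dimensions are illustrative, not the released code.

import torch
import torch.nn as nn

class CrossAttentionFusion(nn.Module):
    """One branch's CLS token attends over the other branch's patch tokens."""
    def __init__(self, dim: int = 192, heads: int = 3):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, cls_a: torch.Tensor, tokens_b: torch.Tensor) -> torch.Tensor:
        # cls_a: (B, 1, dim) CLS of branch A; tokens_b: (B, N, dim) patches of branch B
        fused, _ = self.attn(query=cls_a, key=tokens_b, value=tokens_b)
        return cls_a + fused                      # residual update of the CLS token

fusion = CrossAttentionFusion()
out = fusion(torch.randn(2, 1, 192), torch.randn(2, 196, 192))
print(out.shape)  # torch.Size([2, 1, 192])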
@article{ title = {Learning Spatio-Temporal Transformer for Visual Tracking}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2103.17154}, id = {78ccd856-4cfa-3151-9e01-3d3f5c2c2ca3}, created = {2021-10-14T07:07:44.819Z}, file_attached = {false}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-14T07:09:54.771Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {In this paper, we present a new tracking architecture with an encoder-decoder transformer as the key component. The encoder models the global spatio-temporal feature dependencies between target objects and search regions, while the decoder learns a query embedding to predict the spatial positions of the target objects. Our method casts object tracking as a direct bounding box prediction problem, without using any proposals or predefined anchors. With the encoder-decoder transformer, the prediction of objects just uses a simple fully-convolutional network, which estimates the corners of objects directly. The whole method is end-to-end, does not need any postprocessing steps such as cosine window and bounding box smoothing, thus largely simplifying existing tracking pipelines. The proposed tracker achieves state-of-the-art performance on five challenging short-term and long-term benchmarks, while running at real-time speed, being 6x faster than Siam R-CNN. Code and models are open-sourced at https://github.com/researchmm/Stark.}, bibtype = {article}, author = {Yan, Bin and Peng, Houwen and Fu, Jianlong and Wang, Dong and Lu, Huchuan} }
@article{ title = {An Empirical Study of Training Self-Supervised Vision Transformers}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2104.02057}, id = {27a1225c-1187-38aa-832d-48bd5680777b}, created = {2021-10-14T07:07:44.822Z}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-14T07:07:53.205Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {865cda55-bfc4-4a99-88d6-7092e1cbba3b}, private_publication = {false}, abstract = {This paper does not describe a novel method. Instead, it studies a straightforward, incremental, yet must-know baseline given the recent progress in computer vision: self-supervised learning for Vision Transformers (ViT). While the training recipes for standard convolutional networks have been highly mature and robust, the recipes for ViT are yet to be built, especially in the self-supervised scenarios where training becomes more challenging. In this work, we go back to basics and investigate the effects of several fundamental components for training self-supervised ViT. We observe that instability is a major issue that degrades accuracy, and it can be hidden by apparently good results. We reveal that these results are indeed partial failure, and they can be improved when training is made more stable. We benchmark ViT results in MoCo v3 and several other self-supervised frameworks, with ablations in various aspects. We discuss the currently positive evidence as well as challenges and open questions. We hope that this work will provide useful data points and experience for future research.}, bibtype = {article}, author = {Chen, Xinlei and Xie, Saining and He, Kaiming} }
@article{ title = {Understanding Robustness of Transformers for Image Classification}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2103.14586}, id = {012e9920-a639-374e-975e-a94c60368931}, created = {2021-10-14T07:07:44.823Z}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-14T07:07:57.970Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {865cda55-bfc4-4a99-88d6-7092e1cbba3b}, private_publication = {false}, abstract = {Deep Convolutional Neural Networks (CNNs) have long been the architecture of choice for computer vision tasks. Recently, Transformer-based architectures like Vision Transformer (ViT) have matched or even surpassed ResNets for image classification. However, details of the Transformer architecture -- such as the use of non-overlapping patches -- lead one to wonder whether these networks are as robust. In this paper, we perform an extensive study of a variety of different measures of robustness of ViT models and compare the findings to ResNet baselines. We investigate robustness to input perturbations as well as robustness to model perturbations. We find that when pre-trained with a sufficient amount of data, ViT models are at least as robust as the ResNet counterparts on a broad range of perturbations. We also find that Transformers are robust to the removal of almost any single layer, and that while activations from later layers are highly correlated with each other, they nevertheless play an important role in classification.}, bibtype = {article}, author = {Bhojanapalli, Srinadh and Chakrabarti, Ayan and Glasner, Daniel and Li, Daliang and Unterthiner, Thomas and Veit, Andreas} }
@article{ title = {Vision Transformers for Dense Prediction}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2103.13413}, id = {d54bb30c-fbc2-371a-9f60-eb9dd23a08e4}, created = {2021-10-14T07:15:14.737Z}, accessed = {2021-10-14}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-14T07:28:51.438Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {865cda55-bfc4-4a99-88d6-7092e1cbba3b}, private_publication = {false}, abstract = {We introduce dense vision transformers, an architecture that leverages vision transformers in place of convolutional networks as a backbone for dense prediction tasks. We assemble tokens from various stages of the vision transformer into image-like representations at various resolutions and progressively combine them into full-resolution predictions using a convolutional decoder. The transformer backbone processes representations at a constant and relatively high resolution and has a global receptive field at every stage. These properties allow the dense vision transformer to provide finer-grained and more globally coherent predictions when compared to fully-convolutional networks. Our experiments show that this architecture yields substantial improvements on dense prediction tasks, especially when a large amount of training data is available. For monocular depth estimation, we observe an improvement of up to 28% in relative performance when compared to a state-of-the-art fully-convolutional network. When applied to semantic segmentation, dense vision transformers set a new state of the art on ADE20K with 49.02% mIoU. We further show that the architecture can be fine-tuned on smaller datasets such as NYUv2, KITTI, and Pascal Context where it also sets the new state of the art. Our models are available at https://github.com/intel-isl/DPT.}, bibtype = {article}, author = {Ranftl, René and Bochkovskiy, Alexey and Koltun, Vladlen} }
@article{ title = {Point cloud classification with deep normalized Reeb graph convolution}, type = {article}, year = {2021}, keywords = {Graph normalization,Point cloud,Reeb graph}, pages = {104092}, volume = {106}, websites = {https://doi.org/10.1016/j.imavis.2020.104092}, publisher = {Elsevier B.V.}, id = {167e685d-2a73-3e72-b248-fae628137d49}, created = {2021-10-22T08:00:15.994Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-11T19:44:51.857Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Wang2021}, private_publication = {false}, abstract = {Recently, plenty of deep learning methods have been proposed to handle point clouds. Almost all of them input the entire point cloud and ignore the information redundancy lying in point clouds. This paper addresses this problem by extracting the Reeb graph from point clouds, which is a much more informative and compact representation of point clouds, and then filter the graph with deep graph convolution. To be able to classify or segment point clouds well, we propose (1) Graph Normalization to transform various graphs into a canonical graph space; (2) Normalized Similarity Distance to better identify the graph structure;(3) Reeb Graph Guided Node Pooling in order to aggregate high-level features from kNN graphs. Besides, our method naturally fits into the problem of classifying point clouds with unknown orientations. In the results, we show that our method gives a competitive performance to the state-of-the-art methods and outperforms previous methods by a large margin on handling point clouds with unknown orientations.}, bibtype = {article}, author = {Wang, Weiming and You, Yang and Liu, Wenhai and Lu, Cewu}, doi = {10.1016/j.imavis.2020.104092}, journal = {Image and Vision Computing} }
@article{ title = {Geometry-Aware Self-Training for Unsupervised Domain Adaptation on Object Point Clouds}, type = {article}, year = {2021}, pages = {6403-6412}, websites = {http://arxiv.org/abs/2108.09169}, id = {d66c3327-140e-3367-9a55-9f89d6bd18f2}, created = {2021-10-30T07:28:03.703Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:28:18.034Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352}, private_publication = {false}, abstract = {The point cloud representation of an object can have a large geometric variation in view of inconsistent data acquisition procedure, which thus leads to domain discrepancy due to diverse and uncontrollable shape representation cross datasets. To improve discrimination on unseen distribution of point-based geometries in a practical and feasible perspective, this paper proposes a new method of geometry-aware self-training (GAST) for unsupervised domain adaptation of object point cloud classification. Specifically, this paper aims to learn a domain-shared representation of semantic categories, via two novel self-supervised geometric learning tasks as feature regularization. On one hand, the representation learning is empowered by a linear mixup of point cloud samples with their self-generated rotation labels, to capture a global topological configuration of local geometries. On the other hand, a diverse point distribution across datasets can be normalized with a novel curvature-aware distortion localization. Experiments on the PointDA-10 dataset show that our GAST method can significantly outperform the state-of-the-art methods.}, bibtype = {article}, author = {Zou, Longkun and Tang, Hui and Chen, Ke and Jia, Kui} }
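A minimal sketch of the kind of rotation pretext task the abstract mentions (point cloud samples paired with self-generated rotation labels). The discrete set of z-axis angle bins is an illustrative assumption, and the paper's mixup step and curvature-aware task are not shown.

import torch

def make_rotation_pretext(points: torch.Tensor, num_bins: int = 4):
    """points: (B, N, 3). Rotate each cloud about the up (z) axis by one of
    `num_bins` angles; return the rotated clouds and their angle-bin labels."""
    B = points.size(0)
    labels = torch.randint(0, num_bins, (B,))
    angles = labels.float() * (2 * torch.pi / num_bins)
    cos, sin = angles.cos(), angles.sin()
    R = torch.zeros(B, 3, 3)
    R[:, 0, 0], R[:, 0, 1] = cos, -sin
    R[:, 1, 0], R[:, 1, 1] = sin, cos
    R[:, 2, 2] = 1.0
    # row-vector points: p' = p @ R^T
    return torch.bmm(points, R.transpose(1, 2)), labels

rotated, labels = make_rotation_pretext(torch.rand(8, 1024, 3))
print(rotated.shape, labels.shape)  # torch.Size([8, 1024, 3]) torch.Size([8])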
@article{ title = {Syncretic Modality Collaborative Learning for Visible Infrared Person Re-Identification}, type = {article}, year = {2021}, pages = {225-234}, id = {b8216c5d-4ea5-349e-9383-beb40dc336c8}, created = {2021-10-30T07:28:03.751Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:28:41.394Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352}, private_publication = {false}, bibtype = {article}, author = {Anonymous} }
@article{ title = {Geometry-based Distance Decomposition for Monocular 3D Object Detection}, type = {article}, year = {2021}, pages = {15172-15181}, websites = {http://arxiv.org/abs/2104.03775}, id = {656c9330-b54d-37c8-b6c9-fb3e5bded415}, created = {2021-10-30T07:28:03.858Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:28:52.683Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352}, private_publication = {false}, abstract = {Monocular 3D object detection is of great significance for autonomous driving but remains challenging. The core challenge is to predict the distance of objects in the absence of explicit depth information. Unlike regressing the distance as a single variable in most existing methods, we propose a novel geometry-based distance decomposition to recover the distance by its factors. The decomposition factors the distance of objects into the most representative and stable variables, i.e. the physical height and the projected visual height in the image plane. Moreover, the decomposition maintains the self-consistency between the two heights, leading to robust distance prediction when both predicted heights are inaccurate. The decomposition also enables us to trace the causes of the distance uncertainty for different scenarios. Such decomposition makes the distance prediction interpretable, accurate, and robust. Our method directly predicts 3D bounding boxes from RGB images with a compact architecture, making the training and inference simple and efficient. The experimental results show that our method achieves the state-of-the-art performance on the monocular 3D Object Detection and Birds Eye View tasks of the KITTI dataset, and can generalize to images with different camera intrinsics.}, bibtype = {article}, author = {Shi, Xuepeng and Ye, Qi and Chen, Xiaozhi and Chen, Chuangrong and Chen, Zhixiang and Kim, Tae-Kyun} }
@article{ title = {Learning with Noisy Labels for Robust Point Cloud Segmentation}, type = {article}, year = {2021}, pages = {6443-6452}, websites = {http://arxiv.org/abs/2107.14230}, id = {6358ae46-6d17-39e6-9fe5-3cfcf5d5f5f4}, created = {2021-10-30T07:28:03.918Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:28:26.275Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352}, private_publication = {false}, abstract = {Point cloud segmentation is a fundamental task in 3D. Despite recent progress on point cloud segmentation with the power of deep networks, current deep learning methods based on the clean label assumptions may fail with noisy labels. Yet, object class labels are often mislabeled in real-world point cloud datasets. In this work, we take the lead in solving this issue by proposing a novel Point Noise-Adaptive Learning (PNAL) framework. Compared to existing noise-robust methods on image tasks, our PNAL is noise-rate blind, to cope with the spatially variant noise rate problem specific to point clouds. Specifically, we propose a novel point-wise confidence selection to obtain reliable labels based on the historical predictions of each point. A novel cluster-wise label correction is proposed with a voting strategy to generate the best possible label taking the neighbor point correlations into consideration. We conduct extensive experiments to demonstrate the effectiveness of PNAL on both synthetic and real-world noisy datasets. In particular, even with 60% symmetric noisy labels, our proposed method produces much better results than its baseline counterpart without PNAL and is comparable to the ideal upper bound trained on a completely clean dataset. Moreover, we fully re-labeled the validation set of a popular but noisy real-world scene dataset ScanNetV2 to make it clean, for rigorous experiment and future research. Our code and data are available at https://shuquanye.com/PNAL_website/.}, bibtype = {article}, author = {Ye, Shuquan and Chen, Dongdong and Han, Songfang and Liao, Jing} }
@article{ title = {DRINet: A Dual-Representation Iterative Learning Network for Point Cloud Segmentation}, type = {article}, year = {2021}, pages = {7447-7456}, websites = {http://arxiv.org/abs/2108.04023}, id = {eca95443-1649-3df7-97ff-17a1b7592623}, created = {2021-10-30T07:28:03.950Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:28:28.592Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352}, private_publication = {false}, abstract = {We present a novel and flexible architecture for point cloud segmentation with dual-representation iterative learning. In point cloud processing, different representations have their own pros and cons. Thus, finding suitable ways to represent point cloud data structure while keeping its own internal physical property such as permutation and scale-invariant is a fundamental problem. Therefore, we propose our work, DRINet, which serves as the basic network structure for dual-representation learning with great flexibility at feature transferring and less computation cost, especially for large-scale point clouds. DRINet mainly consists of two modules called Sparse Point-Voxel Feature Extraction and Sparse Voxel-Point Feature Extraction. By utilizing these two modules iteratively, features can be propagated between two different representations. We further propose a novel multi-scale pooling layer for pointwise locality learning to improve context information propagation. Our network achieves state-of-the-art results for point cloud classification and segmentation tasks on several datasets while maintaining high runtime efficiency. For large-scale outdoor scenarios, our method outperforms state-of-the-art methods with a real-time inference speed of 62ms per frame.}, bibtype = {article}, author = {Ye, Maosheng and Xu, Shuangjie and Cao, Tongyi and Chen, Qifeng} }
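A minimal sketch of the point-to-voxel and voxel-to-point feature exchange that the DRINet abstract describes at a high level: point features are mean-pooled into a dense voxel grid and then gathered back to the points. The dense grid, function names, and mean pooling are simplifying assumptions; the paper's sparse modules and multi-scale pooling are not reproduced here.

import torch

def points_to_voxels(feats, coords, grid_size):
    """feats: (N, C) point features; coords: (N, 3) integer voxel coords in
    [0, grid_size). Returns (grid_size**3, C) mean-pooled voxel features and
    the flat voxel index of each point (used for gathering back)."""
    idx = (coords[:, 0] * grid_size + coords[:, 1]) * grid_size + coords[:, 2]  # (N,)
    num_voxels = grid_size ** 3
    voxel_sum = torch.zeros(num_voxels, feats.size(1)).index_add_(0, idx, feats)
    counts = torch.zeros(num_voxels).index_add_(0, idx, torch.ones(idx.size(0)))
    voxel_feats = voxel_sum / counts.clamp(min=1).unsqueeze(1)
    return voxel_feats, idx

def voxels_to_points(voxel_feats, idx):
    return voxel_feats[idx]        # (N, C): each point reads its voxel's feature

feats = torch.randn(5000, 32)
coords = torch.randint(0, 20, (5000, 3))
vf, idx = points_to_voxels(feats, coords, grid_size=20)
pf = voxels_to_points(vf, idx)
print(vf.shape, pf.shape)  # torch.Size([8000, 32]) torch.Size([5000, 32])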
@article{ title = {Learning Meta-class Memory for Few-Shot Semantic Segmentation}, type = {article}, year = {2021}, pages = {517-526}, websites = {http://arxiv.org/abs/2108.02958}, id = {d5e3e8d3-4676-3568-8646-52a9df7fa357}, created = {2021-10-30T07:28:03.974Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:28:39.005Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352}, private_publication = {false}, abstract = {Currently, the state-of-the-art methods treat few-shot semantic segmentation task as a conditional foreground-background segmentation problem, assuming each class is independent. In this paper, we introduce the concept of meta-class, which is the meta information (e.g. certain middle-level features) shareable among all classes. To explicitly learn meta-class representations in few-shot segmentation task, we propose a novel Meta-class Memory based few-shot segmentation method (MM-Net), where we introduce a set of learnable memory embeddings to memorize the meta-class information during the base class training and transfer to novel classes during the inference stage. Moreover, for the $k$-shot scenario, we propose a novel image quality measurement module to select images from the set of support images. A high-quality class prototype could be obtained with the weighted sum of support image features based on the quality measure. Experiments on both PASCAL-$5^i$ and COCO dataset shows that our proposed method is able to achieve state-of-the-art results in both 1-shot and 5-shot settings. Particularly, our proposed MM-Net achieves 37.5\% mIoU on the COCO dataset in 1-shot setting, which is 5.1\% higher than the previous state-of-the-art.}, bibtype = {article}, author = {Wu, Zhonghua and Shi, Xiangxi and lin, Guosheng and Cai, Jianfei} }
@article{ title = {Towers of Babel: Combining Images, Language, and 3D Geometry for Learning Multimodal Vision}, type = {article}, year = {2021}, pages = {428-437}, websites = {http://arxiv.org/abs/2108.05863}, id = {80757982-55b9-3e58-bc4a-0b87850dec62}, created = {2021-10-30T07:28:03.974Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:28:36.896Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352}, private_publication = {false}, abstract = {The abundance and richness of Internet photos of landmarks and cities has led to significant progress in 3D vision over the past two decades, including automated 3D reconstructions of the world's landmarks from tourist photos. However, a major source of information available for these 3D-augmented collections---namely language, e.g., from image captions---has been virtually untapped. In this work, we present WikiScenes, a new, large-scale dataset of landmark photo collections that contains descriptive text in the form of captions and hierarchical category names. WikiScenes forms a new testbed for multimodal reasoning involving images, text, and 3D geometry. We demonstrate the utility of WikiScenes for learning semantic concepts over images and 3D models. Our weakly-supervised framework connects images, 3D structure, and semantics---utilizing the strong constraints provided by 3D geometry---to associate semantic concepts to image pixels and 3D points.}, bibtype = {article}, author = {Wu, Xiaoshi and Averbuch-Elor, Hadar and Sun, Jin and Snavely, Noah} }
@article{ title = {CPFN: Cascaded Primitive Fitting Networks for High-Resolution Point Clouds}, type = {article}, year = {2021}, pages = {7457-7466}, websites = {http://arxiv.org/abs/2109.00113}, id = {28fc7436-cc11-3c00-92ed-fc7f9e54f91c}, created = {2021-10-30T07:28:04.150Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:29:24.954Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352}, private_publication = {false}, abstract = {Representing human-made objects as a collection of base primitives has a long history in computer vision and reverse engineering. In the case of high-resolution point cloud scans, the challenge is to be able to detect both large primitives as well as those explaining the detailed parts. While the classical RANSAC approach requires case-specific parameter tuning, state-of-the-art networks are limited by memory consumption of their backbone modules such as PointNet++, and hence fail to detect the fine-scale primitives. We present Cascaded Primitive Fitting Networks (CPFN) that relies on an adaptive patch sampling network to assemble detection results of global and local primitive detection networks. As a key enabler, we present a merging formulation that dynamically aggregates the primitives across global and local scales. Our evaluation demonstrates that CPFN improves the state-of-the-art SPFN performance by 13-14% on high-resolution point cloud datasets and specifically improves the detection of fine-scale primitives by 20-22%.}, bibtype = {article}, author = {Lê, Eric-Tuan and Sung, Minhyuk and Ceylan, Duygu and Mech, Radomir and Boubekeur, Tamy and Mitra, Niloy J.} }
@article{ title = {Joint Representation Learning and Novel Category Discovery on Single- and Multi-modal Data}, type = {article}, year = {2021}, pages = {610-619}, websites = {http://arxiv.org/abs/2104.12673}, id = {d157b2c8-a64c-371b-83b6-e430d7b713db}, created = {2021-10-30T07:28:04.181Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:32:04.265Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352}, private_publication = {false}, abstract = {This paper studies the problem of novel category discovery on single- and multi-modal data with labels from different but relevant categories. We present a generic, end-to-end framework to jointly learn a reliable representation and assign clusters to unlabelled data. To avoid over-fitting the learnt embedding to labelled data, we take inspiration from self-supervised representation learning by noise-contrastive estimation and extend it to jointly handle labelled and unlabelled data. In particular, we propose using category discrimination on labelled data and cross-modal discrimination on multi-modal data to augment instance discrimination used in conventional contrastive learning approaches. We further employ Winner-Take-All (WTA) hashing algorithm on the shared representation space to generate pairwise pseudo labels for unlabelled data to better predict cluster assignments. We thoroughly evaluate our framework on large-scale multi-modal video benchmarks Kinetics-400 and VGG-Sound, and image benchmarks CIFAR10, CIFAR100 and ImageNet, obtaining state-of-the-art results.}, bibtype = {article}, author = {Jia, Xuhui and Han, Kai and Zhu, Yukun and Green, Bradley} }
@article{ title = {Learning Inner-Group Relations on Point Clouds}, type = {article}, year = {2021}, pages = {15477-15487}, websites = {http://arxiv.org/abs/2108.12468}, id = {df10465c-85a6-3ecf-a4b3-e576fe3c3b88}, created = {2021-10-30T07:28:04.287Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:28:54.492Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352}, private_publication = {false}, abstract = {The prevalence of relation networks in computer vision is in stark contrast to underexplored point-based methods. In this paper, we explore the possibilities of local relation operators and survey their feasibility. We propose a scalable and efficient module, called group relation aggregator. The module computes a feature of a group based on the aggregation of the features of the inner-group points weighted by geometric relations and semantic relations. We adopt this module to design our RPNet. We further verify the expandability of RPNet, in terms of both depth and width, on the tasks of classification and segmentation. Surprisingly, empirical results show that wider RPNet fits for classification, while deeper RPNet works better on segmentation. RPNet achieves state-of-the-art for classification and segmentation on challenging benchmarks. We also compare our local aggregator with PointNet++, with around 30% parameters and 50% computation saving. Finally, we conduct experiments to reveal the robustness of RPNet with regard to rigid transformation and noises.}, bibtype = {article}, author = {Ran, Haoxi and Zhuo, Wei and Liu, Jun and Lu, Li} }
@article{ title = {LSG-CPD: Coherent Point Drift with Local Surface Geometry for Point Cloud Registration}, type = {article}, year = {2021}, pages = {15293-15302}, websites = {http://arxiv.org/abs/2103.15039}, id = {e4597dcb-2bc2-3484-86ad-835ee28a4537}, created = {2021-10-30T07:28:04.507Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:29:01.268Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352}, private_publication = {false}, abstract = {Probabilistic point cloud registration methods are becoming more popular because of their robustness. However, unlike point-to-plane variants of iterative closest point (ICP) which incorporate local surface geometric information such as surface normals, most probabilistic methods (e.g., coherent point drift (CPD)) ignore such information and build Gaussian mixture models (GMMs) with isotropic Gaussian covariances. This results in sphere-like GMM components which only penalize the point-to-point distance between the two point clouds. In this paper, we propose a novel method called CPD with Local Surface Geometry (LSG-CPD) for rigid point cloud registration. Our method adaptively adds different levels of point-to-plane penalization on top of the point-to-point penalization based on the flatness of the local surface. This results in GMM components with anisotropic covariances. We formulate point cloud registration as a maximum likelihood estimation (MLE) problem and solve it with the Expectation-Maximization (EM) algorithm. In the E step, we demonstrate that the computation can be recast into simple matrix manipulations and efficiently computed on a GPU. In the M step, we perform an unconstrained optimization on a matrix Lie group to efficiently update the rigid transformation of the registration. The proposed method outperforms state-of-the-art algorithms in terms of accuracy and robustness on various datasets captured with range scanners, RGBD cameras, and LiDARs. Also, it is significantly faster than modern implementations of CPD. The code will be released.}, bibtype = {article}, author = {Liu, Weixiao and Wu, Hongtao and Chirikjian, Gregory} }
@article{ title = {Towards Efficient Graph Convolutional Networks for Point Cloud Handling}, type = {article}, year = {2021}, pages = {3752-3762}, websites = {http://arxiv.org/abs/2104.05706}, id = {7af17445-bb34-368b-98fe-6ec107796dc0}, created = {2021-10-30T07:28:04.637Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:29:09.097Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352}, private_publication = {false}, abstract = {In this paper, we aim at improving the computational efficiency of graph convolutional networks (GCNs) for learning on point clouds. The basic graph convolution that is typically composed of a $K$-nearest neighbor (KNN) search and a multilayer perceptron (MLP) is examined. By mathematically analyzing the operations there, two findings to improve the efficiency of GCNs are obtained. (1) The local geometric structure information of 3D representations propagates smoothly across the GCN that relies on KNN search to gather neighborhood features. This motivates the simplification of multiple KNN searches in GCNs. (2) Shuffling the order of graph feature gathering and an MLP leads to equivalent or similar composite operations. Based on those findings, we optimize the computational procedure in GCNs. A series of experiments show that the optimized networks have reduced computational complexity, decreased memory consumption, and accelerated inference speed while maintaining comparable accuracy for learning on point clouds. Code will be available at https://github.com/ofsoundof/EfficientGCN.git.}, bibtype = {article}, author = {Li, Yawei and Chen, He and Cui, Zhaopeng and Timofte, Radu and Pollefeys, Marc and Chirikjian, Gregory and Van Gool, Luc} }
@article{ title = {LIGA-Stereo: Learning LiDAR Geometry Aware Representations for Stereo-based 3D Detector}, type = {article}, year = {2021}, pages = {3153-3163}, websites = {http://arxiv.org/abs/2108.08258}, id = {9a3897ac-b5e1-39f0-9117-207b0421e502}, created = {2021-10-30T07:28:04.772Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:32:13.653Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352}, private_publication = {false}, abstract = {Stereo-based 3D detection aims at detecting 3D object bounding boxes from stereo images using intermediate depth maps or implicit 3D geometry representations, which provides a low-cost solution for 3D perception. However, its performance is still inferior compared with LiDAR-based detection algorithms. To detect and localize accurate 3D bounding boxes, LiDAR-based models can encode accurate object boundaries and surface normal directions from LiDAR point clouds. However, the detection results of stereo-based detectors are easily affected by the erroneous depth features due to the limitation of stereo matching. To solve the problem, we propose LIGA-Stereo (LiDAR Geometry Aware Stereo Detector) to learn stereo-based 3D detectors under the guidance of high-level geometry-aware representations of LiDAR-based detection models. In addition, we found existing voxel-based stereo detectors failed to learn semantic features effectively from indirect 3D supervisions. We attach an auxiliary 2D detection head to provide direct 2D semantic supervisions. Experiment results show that the above two strategies improved the geometric and semantic representation capabilities. Compared with the state-of-the-art stereo detector, our method has improved the 3D detection performance of cars, pedestrians, cyclists by 10.44%, 5.69%, 5.97% mAP respectively on the official KITTI benchmark. The gap between stereo-based and LiDAR-based 3D detectors is further narrowed.}, bibtype = {article}, author = {Guo, Xiaoyang and Shi, Shaoshuai and Wang, Xiaogang and Li, Hongsheng} }
@article{ title = {Two Heads are Better than One: Geometric-Latent Attention for Point Cloud Segmentation}, type = {article}, year = {2021}, pages = {1-14}, id = {49ac1c29-d2f6-3023-9647-3accaa66637b}, created = {2021-11-23T08:03:05.325Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:03:20.837Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d0ca493b-f12a-45f6-9df1-0ad1fe3a78ff}, private_publication = {false}, abstract = {We present an innovative two-headed attention layer that combines geometric and latent features to segment a 3D scene into semantically meaningful subsets. Each head combines local and global information, using either the geometric or latent features, of a neighborhood of points and uses this information to learn better local relationships. This Geometric-Latent attention layer (Ge-Latto) is combined with a sub-sampling strategy to capture global features. Our method is invariant to permutation thanks to the use of shared-MLP layers, and it can also be used with point clouds with varying densities because the local attention layer does not depend on the neighbor order. Our proposal is simple yet robust, which allows it to achieve competitive results in the ShapeNetPart and ModelNet40 datasets, and the state-of-the-art when segmenting the complex dataset S3DIS, with 69.2% IoU on Area 5, and 89.7% overall accuracy using K-fold cross-validation on the 6 areas.}, bibtype = {article}, author = {Attention, Geometric-latent} }
@article{ title = {Self-Supervised Point Cloud Completion via Inpainting}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2111.10701}, id = {ae68f287-15f6-3644-844a-bd31b2e0a17b}, created = {2021-11-23T08:03:05.326Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:03:35.248Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d0ca493b-f12a-45f6-9df1-0ad1fe3a78ff}, private_publication = {false}, abstract = {When navigating in urban environments, many of the objects that need to be tracked and avoided are heavily occluded. Planning and tracking using these partial scans can be challenging. The aim of this work is to learn to complete these partial point clouds, giving us a full understanding of the object's geometry using only partial observations. Previous methods achieve this with the help of complete, ground-truth annotations of the target objects, which are available only for simulated datasets. However, such ground truth is unavailable for real-world LiDAR data. In this work, we present a self-supervised point cloud completion algorithm, PointPnCNet, which is trained only on partial scans without assuming access to complete, ground-truth annotations. Our method achieves this via inpainting. We remove a portion of the input data and train the network to complete the missing region. As it is difficult to determine which regions were occluded in the initial cloud and which were synthetically removed, our network learns to complete the full cloud, including the missing regions in the initial partial cloud. We show that our method outperforms previous unsupervised and weakly-supervised methods on both the synthetic dataset, ShapeNet, and real-world LiDAR dataset, Semantic KITTI.}, bibtype = {article}, author = {Mittal, Himangi and Okorn, Brian and Jangid, Arpit and Held, David} }
@article{ title = {Adversarial Robustness Comparison of Vision Transformer and MLP-Mixer to CNNs}, type = {article}, year = {2021}, pages = {1-16}, websites = {http://arxiv.org/abs/2110.02797}, id = {629ad79f-a3c3-32e0-8c2e-0b0cb94c0c05}, created = {2021-11-23T08:03:05.444Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:03:31.380Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d0ca493b-f12a-45f6-9df1-0ad1fe3a78ff}, private_publication = {false}, abstract = {Convolutional Neural Networks (CNNs) have become the de facto gold standard in computer vision applications in the past years. Recently, however, new model architectures have been proposed challenging the status quo. The Vision Transformer (ViT) relies solely on attention modules, while the MLP-Mixer architecture substitutes the self-attention modules with Multi-Layer Perceptrons (MLPs). Despite their great success, CNNs have been widely known to be vulnerable to adversarial attacks, causing serious concerns for security-sensitive applications. Thus, it is critical for the community to know whether the newly proposed ViT and MLP-Mixer are also vulnerable to adversarial attacks. To this end, we empirically evaluate their adversarial robustness under several adversarial attack setups and benchmark them against the widely used CNNs. Overall, we find that the two architectures, especially ViT, are more robust than their CNN models. Using a toy example, we also provide empirical evidence that the lower adversarial robustness of CNNs can be partially attributed to their shift-invariant property. Our frequency analysis suggests that the most robust ViT architectures tend to rely more on low-frequency features compared with CNNs. Additionally, we have an intriguing finding that MLP-Mixer is extremely vulnerable to universal adversarial perturbations.}, bibtype = {article}, author = {Benz, Philipp and Ham, Soomin and Zhang, Chaoning and Karjauv, Adil and Kweon, In So} }
@article{ title = {Adversarial Graph Convolutional Network for 3D Point Cloud Segmentation}, type = {article}, year = {2021}, id = {710f421d-bf1b-3d72-ac0e-62ac658b037f}, created = {2021-11-23T08:03:05.458Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:53:50.106Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {8315fdc0-e3a9-47f0-9186-21b3433d86d2,d0ca493b-f12a-45f6-9df1-0ad1fe3a78ff}, private_publication = {false}, bibtype = {article}, author = {Guidelines, Bmvc Author} }
@article{ title = {Multi-Modality Task Cascade for 3D Object Detection}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2107.04013}, id = {00330d4d-561e-3ac1-863c-19b2e15e4039}, created = {2021-11-23T08:03:05.469Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:11:31.086Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d0ca493b-f12a-45f6-9df1-0ad1fe3a78ff}, private_publication = {false}, abstract = {Point clouds and RGB images are naturally complementary modalities for 3D visual understanding - the former provides sparse but accurate locations of points on objects, while the latter contains dense color and texture information. Despite this potential for close sensor fusion, many methods train two models in isolation and use simple feature concatenation to represent 3D sensor data. This separated training scheme results in potentially sub-optimal performance and prevents 3D tasks from being used to benefit 2D tasks that are often useful on their own. To provide a more integrated approach, we propose a novel Multi-Modality Task Cascade network (MTC-RCNN) that leverages 3D box proposals to improve 2D segmentation predictions, which are then used to further refine the 3D boxes. We show that including a 2D network between two stages of 3D modules significantly improves both 2D and 3D task performance. Moreover, to prevent the 3D module from over-relying on the overfitted 2D predictions, we propose a dual-head 2D segmentation training and inference scheme, allowing the 2nd 3D module to learn to interpret imperfect 2D segmentation predictions. Evaluating our model on the challenging SUN RGB-D dataset, we improve upon state-of-the-art results of both single modality and fusion networks by a large margin (+3.8 mAP@0.5). Code will be released at https://github.com/Divadi/MTC_RCNN.}, bibtype = {article}, author = {Park, Jinhyung and Weng, Xinshuo and Man, Yunze and Kitani, Kris} }
@article{ title = {Planar Shape Based Registration for Multi-modal Geometry}, type = {article}, year = {2021}, pages = {1-15}, id = {fe87832f-0b45-33d6-a9b9-4348e1f0ce6f}, created = {2021-11-23T08:03:05.475Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:53:49.910Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {d0ca493b-f12a-45f6-9df1-0ad1fe3a78ff}, private_publication = {false}, bibtype = {article}, author = {Li, Muxingzi} }
@article{ title = {Enhancing Local Feature Learning for 3D Point Cloud Processing using Unary-Pairwise Attention}, type = {article}, year = {2021}, pages = {1-14}, id = {c309ed76-8c05-3121-a69b-8741bfce1707}, created = {2021-11-23T08:03:05.491Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:06:36.611Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {d0ca493b-f12a-45f6-9df1-0ad1fe3a78ff}, private_publication = {false}, bibtype = {article}, author = {Xiu, Haoyi} }
@article{ title = {VIN: Voxel-based Implicit Network for Joint 3D Object Detection and Segmentation for Lidars}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2107.02980}, id = {fead10e1-ad6e-3512-a1fa-649a1aef2866}, created = {2021-11-23T08:03:05.599Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:03:13.376Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d0ca493b-f12a-45f6-9df1-0ad1fe3a78ff}, private_publication = {false}, abstract = {A unified neural network structure is presented for joint 3D object detection and point cloud segmentation in this paper. We leverage rich supervision from both detection and segmentation labels rather than using just one of them. In addition, an extension based on single-stage object detectors is proposed based on the implicit function widely used in 3D scene and object understanding. The extension branch takes the final feature map from the object detection module as input, and produces an implicit function that generates semantic distribution for each point for its corresponding voxel center. We demonstrated the performance of our structure on nuScenes-lidarseg, a large-scale outdoor dataset. Our solution achieves competitive results against state-of-the-art methods in both 3D object detection and point cloud segmentation with little additional computation load compared with object detection solutions. The capability of efficient weakly supervision semantic segmentation of the proposed method is also validated by experiments.}, bibtype = {article}, author = {Zhong, Yuanxin and Zhu, Minghan and Peng, Huei} }
@article{ title = {2.5D-VoteNet: Depth Map based 3D Object Detection for Real-Time Applications}, type = {article}, year = {2021}, id = {8a365847-9980-38c7-81e5-9f0686137525}, created = {2021-11-23T08:03:05.603Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:51:28.691Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {bc1835e2-32e3-4f2a-b03c-9540bbbd02e0,d0ca493b-f12a-45f6-9df1-0ad1fe3a78ff}, private_publication = {false}, bibtype = {article}, author = {Li, Lanxiao and Heizmann, Michael} }
@article{ title = {Cascading Feature Extraction for Fast Point Cloud Registration}, type = {article}, year = {2021}, pages = {1-12}, websites = {http://arxiv.org/abs/2110.12204}, id = {62be2787-e332-3ea2-9d79-30e03bbaae22}, created = {2021-11-23T08:03:05.626Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:06:24.136Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d0ca493b-f12a-45f6-9df1-0ad1fe3a78ff}, private_publication = {false}, abstract = {We propose a method for speeding up a 3D point cloud registration through a cascading feature extraction. The current approach with the highest accuracy is realized by iteratively executing feature extraction and registration using deep features. However, iterative feature extraction takes time. Our proposed method significantly reduces the computational cost using cascading shallow layers. Our idea is to omit redundant computations that do not always contribute to the final accuracy. The proposed approach is approximately three times faster than the existing methods without a loss of accuracy.}, bibtype = {article}, author = {Hisadome, Yoichiro and Matsui, Yusuke} }
@article{ title = {Adaptive GMM Convolution for Point Cloud Learning}, type = {article}, year = {2021}, pages = {1-13}, id = {d3276b45-4ff9-346a-8a83-366c04d60863}, created = {2021-11-23T08:03:05.696Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:53:49.703Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {d0ca493b-f12a-45f6-9df1-0ad1fe3a78ff}, private_publication = {false}, bibtype = {article}, author = {Wang, Huan} }
@article{ title = {SwinFGHash: Fine-grained Image Retrieval via Transformer-based Hashing Network}, type = {article}, year = {2021}, id = {c0e0c8d1-ce73-377c-b170-0f8dd57f1241}, created = {2021-11-23T08:14:06.940Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-25T08:42:20.916Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {Fine-grained image retrieval is a fundamental and challenging problem in computer vision due to the intra-class diversities and inter-class confusions. Existing hashing-based approaches employed convolutional neural networks (CNNs) to learn hash codes for fast fine-grained image retrieval, which are limited by the inherent locality constrain of the convolution operations and yield sub-optimal performance. Recently, transformers have shown colossal potential on vision tasks for their excellent capacity to capture long-range visual dependencies. Therefore, in this paper, we take the first step to exploit the vision transformer-based hashing network for fine-grained image retrieval. We propose the SwinFGHash, which takes advantage of transformer-based architecture to model the feature interactions among the spatially distant areas, e.g., the head and the tail of a bird on an image, thus improving the fine-grained discrimination of the generated hash codes. Besides, we enhance the critical region localization ability of SwinFGHash by designing a Global with Local (GwL) feature learning module, which preserves subtle yet discriminative features for fine-grained retrieval. Extensive experiments on benchmark datasets show that our SwinFGHash significantly outperforms existing state-of-the-art baselines in fine-grained image retrieval.}, bibtype = {article}, author = {Lu, Di and Wang, Jinpeng and Zeng, Ziyun and Chen, Bin and Wu, Shudeng and Xia, Shu-Tao} }
@article{ title = {MVT: Multi-view Vision Transformer for 3D Object Recognition}, type = {article}, year = {2021}, id = {f967c32a-c991-3a4f-b402-0e5f6fd02620}, created = {2021-11-23T08:15:46.220Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:15:48.667Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {Inspired by the great success achieved by CNN in image recognition, view-based methods applied CNNs to model the projected views for 3D object understanding and achieved excellent performance. Nevertheless, multi-view CNN models cannot model the communications between patches from different views, limiting its effectiveness in 3D object recognition. Inspired by the recent success gained by vision Transformer in image recognition, we propose a Multi-view Vision Transformer (MVT) for 3D object recognition. Since each patch feature in a Transformer block has a global reception field, it naturally achieves communications between patches from different views. Meanwhile, it takes much less inductive bias compared with its CNN counterparts. Considering both effectiveness and efficiency, we develop a global-local structure for our MVT. Our experiments on two public benchmarks, ModelNet40 and ModelNet10, demonstrate the competitive performance of our MVT.}, bibtype = {article}, author = {Chen, Shuo and Yu, Tan and Li, Ping} }
@article{ title = {Grounded Situation Recognition with Transformers}, type = {article}, year = {2021}, websites = {https://github.com/jhcho99/gsrtr.}, id = {e41e75a4-8222-377b-b429-a4645139c543}, created = {2021-11-23T08:16:06.786Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:16:13.641Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {Grounded Situation Recognition (GSR) is the task that not only classifies a salient action (verb), but also predicts entities (nouns) associated with semantic roles and their locations in the given image. Inspired by the remarkable success of Transformers in vision tasks, we propose a GSR model based on a Transformer encoder-decoder architecture. The attention mechanism of our model enables accurate verb classification by capturing high-level semantic feature of an image effectively, and allows the model to flexibly deal with the complicated and image-dependent relations between entities for improved noun classification and localization. Our model is the first Transformer architecture for GSR, and achieves the state of the art in every evaluation metric on the SWiG benchmark. Our code is available at https://github.com/jhcho99/gsrtr.}, bibtype = {article}, author = {Cho, Junhyeong and Yoon, Youngseok and Lee, Hyeonjun and Kwak, Suha} }
@article{ title = {Exploiting Scene Depth for Object Detection with Multimodal Transformers}, type = {article}, year = {2021}, id = {066f5d94-a1bd-3e87-b328-a86271e7156d}, created = {2021-11-23T08:16:13.078Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:16:16.420Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {We propose a generic framework MEDUSA (Multimodal Estimated-Depth Unification with Self-Attention) to fuse RGB and depth information using multimodal transformers in the context of object detection. Unlike previous methods that use the depth measured from various physical sensors such as Kinect and Lidar, we show that the depth maps inferred by a monocular depth estimator can play an important role to enhance the performance of modern object detectors. In order to make use of the estimated depth, MEDUSA encompasses a robust feature extraction phase, followed by multimodal transformers for RGB-D fusion. The main strength of MEDUSA lies in its broad applicability for any existing large-scale RGB datasets including PASCAL VOC and Microsoft COCO. Extensive experiments with three datasets show that MEDUSA achieves higher precision than several strong baselines.}, bibtype = {article}, author = {Song, Hwanjun and Kim, Eunyoung and Jampani, Varun and Sun, Deqing and Lee, Jae-Gil and Yang, Ming-Hsuan} }
@article{ title = {3D-RETR: End-to-End Single and Multi-View 3D Reconstruction with Transformers}, type = {article}, year = {2021}, websites = {https://github.com/FomalhautB/3D-RETR}, id = {e2012a9e-c214-3b4c-be6c-fb38af734204}, created = {2021-11-23T08:16:20.238Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:16:24.017Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {3D reconstruction aims to reconstruct 3D objects from 2D views. Previous works for 3D reconstruction mainly focus on feature matching between views or using CNNs as backbones. Recently, Transformers have been shown effective in multiple applications of computer vision. However, whether or not Transformers can be used for 3D reconstruction is still unclear. In this paper, we fill this gap by proposing 3D-RETR, which is able to perform end-to-end 3D REconstruction with TRansformers. 3D-RETR first uses a pretrained Transformer to extract visual features from 2D input images. 3D-RETR then uses another Transformer Decoder to obtain the voxel features. A CNN Decoder then takes as input the voxel features to obtain the reconstructed objects. 3D-RETR is capable of 3D reconstruction from a single view or multiple views. Experimental results on two datasets show that 3D-RETR reaches state-of-the-art performance on 3D reconstruction. Additional ablation study also demonstrates that 3D-RETR benefits from using Transformers.}, bibtype = {article}, author = {Shi, Zai and Meng, Zhao and Xing, Yiran and Ma, Yunpu and Wattenhofer, Roger} }
@article{ title = {HAT-Net: A Hierarchical Transformer Graph Neural Network for Grading of Colorectal Cancer Histology Images}, type = {article}, year = {2021}, id = {da613fa3-1037-382a-a51d-e19891adfbc1}, created = {2021-11-23T08:16:26.731Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:16:32.957Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {Graph-based learning methods have gained more attention in colorectal adenocarcinoma cancer (CRA) grading tasks for encoding the tissue structure information, which patch-wise CNN based methods fail to. Graph-based methods usually involve extracting nuclei features in the histology images as cell-graph node features and modeling the connections between nodes to construct cell-graphs. However, it is infeasible to directly train a classification model to extract nuclei features as we normally do in nature images since different types of nuclei often cluster together. We propose a Masked Nuclei Patch (MNP) approach to train a ResNet-50 as a strong feature encoder to extract more representative nuclei feature for enhancing the overall performance. Graph Neural Networks (GNNs) are often used to train cell-graphs for different tasks. But GNN may struggle to capture the long-range dependency due to its underlying recurrent structure. Therefore, we propose a new network architecture named HierArchical Transformer Graph Neural Network (HAT-Net), which merits both GNN and Transformer, as a strong competitor for CRA grading tasks. We have achieved the state-of-the-art results on two publicly available CRA grading datasets: the colorectal cancer (CRC) dataset (98.55%) and the extended colorectal cancer (Extended CRC) dataset (95.33%).}, bibtype = {article}, author = {Su, Yihan and Bai, Yu and Zhang, Bo and Zhang, Zheng and Wang, Wendong} }
@article{ title = {Feature Fusion Vision Transformer for Fine-Grained Visual Categorization}, type = {article}, year = {2021}, id = {8f939260-038d-321d-91de-ea60e195f413}, created = {2021-11-23T08:16:34.445Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:16:38.639Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {The core for tackling the fine-grained visual categorization (FGVC) is to learn subtle yet discriminative features. Most previous works achieve this by explicitly selecting the discriminative parts or integrating the attention mechanism via CNN-based approaches. However, these methods enhance the computational complexity and make the model dominated by the regions containing the most of the objects. Recently, vision transformer (ViT) has achieved SOTA performance on general image recognition tasks. The self-attention mechanism aggregates and weights the information from all patches to the classification token, making it perfectly suitable for FGVC. Nonetheless, the classification token in the deep layer pays more attention to the global information, lacking the local and low-level features that are essential for FGVC. In this work, we propose a novel pure transformer-based framework Feature Fusion Vision Transformer (FFVT) where we aggregate the important tokens from each transformer layer to compensate the local, low-level and middle-level information. We design a novel token selection module called mutual attention weight selection (MAWS) to guide the network effectively and efficiently towards selecting discriminative tokens without introducing extra parameters. We verify the effectiveness of FFVT on four benchmarks where FFVT achieves the state-of-the-art performance. Code is available at this link.}, bibtype = {article}, author = {Wang, Jun and Yu, Xiaohan and Gao, Yongsheng} }
@article{ title = {Localizing Objects with Self-Supervised Transformers and no Labels}, type = {article}, year = {2021}, websites = {https://github.com/valeoai/LOST.}, id = {6ed86187-4d6c-32b6-b58a-5a919dca068a}, created = {2021-11-23T08:16:42.752Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:16:47.452Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {Localizing objects in image collections without supervision can help to avoid expensive annotation campaigns. We propose a simple approach to this problem, that leverages the activation features of a vision transformer pre-trained in a self-supervised manner. Our method, LOST, does not require any external object proposal nor any exploration of the image collection; it operates on a single image. Yet, we outperform state-of-the-art object discovery methods by up to 8 CorLoc points on PASCAL VOC 2012. We also show that training a class-agnostic detector on the discovered objects boosts results by another 7 points. Moreover, we show promising results on the unsupervised object discovery task. The code can be found at https://github.com/valeoai/LOST.}, bibtype = {article}, author = {Siméoni, Oriane and Puy, Gilles and Vo, Huy V and Roburin, Simon and Gidaris, Spyros and Bursuc, Andrei and Pérez, Patrick and Marlet, Renaud and Ponce, Jean} }
@article{ title = {Image-Text Alignment using Adaptive Cross-attention with Transformer Encoder for Scene Graphs}, type = {article}, year = {2021}, id = {6f8e91ad-49a8-38f8-86be-90375363f4f0}, created = {2021-11-23T08:18:01.815Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:18:17.612Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {Neural image and text encoders have been proposed to align the abstract image and symbolic text representation. Global-local and local-local information integration between two modalities are essential for an effective alignment. In this paper, we present RELation-aware Adaptive Cross-attention (RELAX) that achieves state-of-the-art performance in cross-modal retrieval tasks by incorporating several novel improvements. First, cross-attention methods integrate global-local information via weighted global feature of a modality (taken as value) for a local feature of the other modality (taken as query). We can make more accurate alignments if we could also consider the global weights of the query modality. To this end, we introduce adaptive embedding to consider the weights. Second, to enhance the usage of scene-graphs that can capture the high-level relation of local features, we introduce transformer encoders for textual scene graphs to align with visual scene graphs. Lastly, we use NT-XEnt loss that takes the weighted sum of the samples based on their importance. We show that our approach is effective in extensive experiments that outperform other state-of-the-art models.}, bibtype = {article}, author = {Song, Juyong and Choi, Sunghyun} }
@article{ title = {ASFormer: Transformer for Action Segmentation}, type = {article}, year = {2021}, websites = {https://github.com/ChinaYi/ASFormer.}, id = {32a5e47d-77c9-3f91-a32f-e2fcc98c25e2}, created = {2021-11-23T08:18:08.907Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:18:14.752Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {Algorithms for the action segmentation task typically use temporal models to predict what action is occurring at each frame for a minute-long daily activity. Recent studies have shown the potential of Transformer in modeling the relations among elements in sequential data. However, there are several major concerns when directly applying the Transformer to the action segmentation task, such as the lack of inductive biases with small training sets, the deficit in processing long input sequence, and the limitation of the decoder architecture to utilize temporal relations among multiple action segments to refine the initial predictions. To address these concerns, we design an efficient Transformer-based model for the action segmentation task, named ASFormer, with three distinctive characteristics: (i) We explicitly bring in the local connectivity inductive priors because of the high locality of features. It constrains the hypothesis space within a reliable scope, and is beneficial for the action segmentation task to learn a proper target function with small training sets. (ii) We apply a pre-defined hierarchical representation pattern that efficiently handles long input sequences. (iii) We carefully design the decoder to refine the initial predictions from the encoder. Extensive experiments on three public datasets demonstrate the effectiveness of our methods. Code is available at https://github.com/ChinaYi/ASFormer.}, bibtype = {article}, author = {Yi, Fangqiu and Wen, Hongyu and Jiang, Tingting} }
@article{ title = {Mitigating Bias in Visual Transformers via Targeted Alignment}, type = {article}, year = {2021}, id = {92994ba8-fd1d-303c-8720-f785a7a6ed4f}, created = {2021-11-23T08:18:11.707Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:18:14.677Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {As transformer architectures become increasingly prevalent in computer vision, it is critical to understand their fairness implications. We perform the first study of the fairness of transformers applied to computer vision and benchmark several bias mitigation approaches from prior work. We visualize the feature space of the transformer self-attention modules and discover that a significant portion of the bias is encoded in the query matrix. With this knowledge, we propose TADeT, a targeted alignment strategy for debiasing transformers that aims to discover and remove bias primarily from query matrix features. We measure performance using Balanced Accuracy and Standard Accuracy, and fairness using Equalized Odds and Balanced Accuracy Difference. TADeT consistently leads to improved fairness over prior work on multiple attribute prediction tasks on the CelebA dataset, without compromising performance.}, bibtype = {article}, author = {Sudhakar, Sruthi and Prabhu, Viraj and Krishnakumar, Arvindkumar and Hoffman, Judy} }
@article{ title = {Paying Attention to Varying Receptive Fields: Object Detection with Atrous Filters and Vision Transformers}, type = {article}, year = {2021}, id = {93c8c6db-a45b-3ac2-9cab-946ffddf27d7}, created = {2021-11-23T08:18:22.634Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:18:49.814Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {Object detection represents a critical component in computer vision based on its unique ability to identify the location of one or more objects in an image or video. Given its importance, various approaches were proposed in an attempt to extract meaningful and representative features across different image scales. One such approach would be to vary the receptive fields during the feature extraction process. However, varying and adjusting the receptive field adds complexity to the process of scene understanding by introducing a higher degree of unimportant semantics into the feature maps. To solve this problem, we propose a novel object detection framework by unifying dilation modules (or atrous convolutions) with a vision transformer (DIL-ViT). The proposed model leverages atrous convolutions to generate rich multi-scale feature maps and employs a self-attention mechanism to enrich important backbone features. Specifically, the dilation (i.e., DIL) module enables feature fusions across varying scales from a single input feature map of specific scales. Through this method, we incorporate coarse semantics and fine details into the feature maps by convolving the features with different atrous rates in a multi-branch multi-level structure. By embedding DIL into various object detectors, we observe notable improvements in all of the compared evaluation metrics using the MS-COCO dataset. To further enhance the feature maps produced by the DIL, we then apply channel-wise attention using a vision transformer (i.e., ViT). Crucially, this approach removes unnecessary semantics present in the fused multi-scale feature map. Experimental results of DIL-ViT on the MS-COCO dataset exhibit substantial improvements in all of the compared evaluation metrics.}, bibtype = {article}, author = {Lam, Arthur and Lim, Junyi and Sutopo, Ricky and Monn Baskaran, Vishnu} }
@article{ title = {Adversarial Robustness Comparison of Vision Transformer and MLP-Mixer to CNNs}, type = {article}, year = {2021}, websites = {https://github.com/phibenz/robustness_comparison_vit_mlp-mixer_cnn}, id = {db76e721-53a7-3911-91c8-fd817200a465}, created = {2021-11-23T08:18:28.085Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:18:31.013Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {Convolutional Neural Networks (CNNs) have become the de facto gold standard in computer vision applications in the past years. Recently, however, new model architectures have been proposed challenging the status quo. The Vision Transformer (ViT) relies solely on attention modules, while the MLP-Mixer architecture substitutes the self-attention modules with Multi-Layer Perceptrons (MLPs). Despite their great success, CNNs have been widely known to be vulnerable to adversarial attacks, causing serious concerns for security-sensitive applications. Thus, it is critical for the community to know whether the newly proposed ViT and MLP-Mixer are also vulnerable to adversarial attacks. To this end, we empirically evaluate their adversarial robustness under several adversarial attack setups and benchmark them against the widely used CNNs. Overall, we find that the two architectures, especially ViT, are more robust than their CNN models. Using a toy example, we also provide empirical evidence that the lower adversarial robustness of CNNs can be partially attributed to their shift-invariant property. Our frequency analysis suggests that the most robust ViT architectures tend to rely more on low-frequency features compared with CNNs. Additionally, we have an intriguing finding that MLP-Mixer is extremely vulnerable to universal adversarial perturbations. Code: https://github.com/phibenz/robustness_comparison_vit_mlp-mixer_cnn.}, bibtype = {article}, author = {Benz, Philipp and Ham, Soomin and Zhang, Chaoning and Karjauv, Adil and Kweon, In So} }
@article{ title = {End-to-End Object Detection with Adaptive Clustering Transformer}, type = {article}, year = {2021}, websites = {https://github.com/gaopengcuhk/SMCA-DETR/}, id = {f9bdc9ef-4eef-3dbc-8298-ec993a1bd711}, created = {2021-11-23T08:18:38.432Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:18:42.986Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {End-to-end Object Detection with Transformer (DETR) performs object detection with Transformer and achieves comparable performance with two-stage object detection like Faster-RCNN. However, DETR needs huge computational resources for training and inference due to the high-resolution spatial inputs. In this paper, a novel variant of transformer named Adaptive Clustering Transformer (ACT) has been proposed to reduce the computation cost for high-resolution input. ACT clusters the query features adaptively using Locality Sensitive Hashing (LSH) and approximates the query-key interaction using the prototype-key interaction. ACT can reduce the quadratic O(N^2) complexity inside self-attention into O(NK) where K is the number of prototypes in each layer. ACT can be a drop-in module replacing the original self-attention module without any training. ACT achieves a good balance between accuracy and computation cost (FLOPs). The code is available as supplementary for the ease of experiment replication and verification. Code is released at https://github.com/gaopengcuhk/SMCA-DETR/tree/main/Adaptive_Cluster_Transformer}, bibtype = {article}, author = {Zheng, Minghang and Gao, Peng and Zhang, Renrui and Li, Kunchang and Wang, Xiaogang and Li, Hongsheng and Dong, Hao} }
@article{ title = {FETNet: Feature Exchange Transformer Network for RGB-D Object Detection}, type = {article}, year = {2021}, id = {86ed9f88-ce64-3a47-b733-fd4ab1a06820}, created = {2021-11-23T08:18:43.040Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:18:46.141Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {In RGB-D object detection, due to the inherent difference between the RGB and Depth modalities, it remains challenging to simultaneously leverage sensed photometric and depth information. In this paper, to address this issue, we propose a Feature Exchange Transformer Network (FETNet), which consists of two well-designed components: the Feature Exchange Module (FEM), and the Multi-modal Vision Transformer (MViT). Specifically, we propose the FEM to exchange part of the channels between RGB and depth features at each backbone stage, which facilitates the information flow, and bridges the gap, between the two modalities. Inspired by the success of Vision Transformer (ViT), we develop the variant MViT to effectively fuse multi-modal features and exploit the attention between the RGB and depth features. Unlike previous methods developed from a specific RGB detection algorithm, our proposal is generic. Extensive experiments prove that, when the proposed modules are integrated into mainstream RGB object detection methods, their RGB-D counterparts can obtain significant performance gains. Moreover, our FETNet surpasses state-of-the-art RGB-D detectors by 7.0% mAP on SUN RGB-D and 1.7% mAP on NYU Depth v2, which further demonstrates the effectiveness of the proposed method.}, bibtype = {article}, author = {Xiao, Zhibin and Xue, Jing-Hao and Xie, Pengwei and Wang, Guijin} }
@article{ title = {Multi-Exit Vision Transformer for Dynamic Inference}, type = {article}, year = {2021}, websites = {https://gitlab.au.dk/maleci/multiexitvit}, id = {4a64a821-708f-397e-a9bf-5dae3a1e06c1}, created = {2021-11-23T08:18:49.722Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:18:57.464Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {Deep neural networks can be converted to multi-exit architectures by inserting early exit branches after some of their intermediate layers. This allows their inference process to become dynamic, which is useful for time-critical IoT applications with stringent latency requirements but time-variant communication and computation resources, in particular in edge computing systems and IoT networks where the exact computation time budget is variable and not known beforehand. Vision Transformer is a recently proposed architecture which has since found many applications across various domains of computer vision. In this work, we propose seven different architectures for early exit branches that can be used for dynamic inference in Vision Transformer backbones. Through extensive experiments involving both classification and regression problems, we show that each one of our proposed architectures could prove useful in the trade-off between accuracy and speed.}, bibtype = {article}, author = {Bakhtiarnia, Arian and Zhang, Qi and Iosifidis, Alexandros} }
@article{ title = {Livestock Monitoring with Transformer}, type = {article}, year = {2021}, websites = {http://www.fao.org/faostat/en/}, id = {1f874559-3d83-3200-a662-f58847221269}, created = {2021-11-23T08:19:02.987Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:19:10.853Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {Tracking the behaviour of livestock enables early detection and thus prevention of contagious diseases in modern animal farms. Apart from economic gains, this would reduce the amount of antibiotics used in livestock farming which otherwise enters the human diet, exacerbating the epidemic of antibiotic resistance, a leading cause of death. We could use standard video cameras, available in most modern farms, to monitor livestock. However, most computer vision algorithms perform poorly on this task, primarily because, (i) animals bred in farms look identical, lacking any obvious spatial signature, (ii) none of the existing trackers are robust for long duration, and (iii) real-world conditions such as changing illumination, frequent occlusion, varying camera angles, and sizes of the animals make it hard for models to generalize. Given these challenges, we develop an end-to-end behaviour monitoring system for group-housed pigs to perform simultaneous instance-level segmentation, tracking, action recognition and re-identification (STAR) tasks. We present STARFORMER, the first end-to-end multiple-object livestock monitoring framework that learns instance-level embeddings for grouped pigs through the use of a transformer architecture. For benchmarking, we present PIGTRACE, a carefully curated dataset comprising video sequences with instance-level bounding box, segmentation, tracking and activity classification of pigs in a real indoor farming environment. Using simultaneous optimization on STAR tasks we show that STARFORMER outperforms popular baseline models trained for individual tasks.}, bibtype = {article}, author = {Tangirala, Bhavesh and Bhandari, Ishan and Laszlo, Daniel and Gupta, Deepak K and Thomas, Rajat M and Arya, Devanshu} }
@article{ title = {Multi-Teacher Single-Student Visual Transformer with Multi-Level Attention for Face Spoofing Detection}, type = {article}, year = {2021}, websites = {https://liveness.com/}, id = {9a2a9fb4-5e6d-3c67-afbd-896f43a5b8a8}, created = {2021-11-23T08:19:07.581Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:19:10.417Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {Face biometrics have attracted significant attention in many security-based applications. The presentation attack (PA) or face spoofing is a cybercriminal attempt to gain illegitimate access to a victim's device using photos, videos, or 3D artificial masks of the victim's face. Various deep learning approaches can tackle particular PA attacks when tested on standard datasets. However, these methods fail to generalize to complex environments or unseen datasets. We propose a new Multi-Teacher Single-Student (MTSS) visual Transformer with a multi-level attention design to improve the generalizability of face spoofing detection. Then, a novel Multi-Level Attention Module with a DropBlock (MAMD) is designed to strengthen discriminative features while dropping irrelevant spatial features to avoid overfitting. Finally, these rich convolutional feature sets are combined and fed into the MTSS network for face spoofing training. With this MAMD module, our method remains robust even with small training datasets and poorly lit conditions. Experimental results demonstrate the superiority of our method when compared with several anti-spoofing methods on four datasets (CASIA-MFSD, Replay-Attack, MSU-MFSD, and OULU-NPU). Furthermore, our model can run on a Jetson TX2 at up to 80 FPS for real-world applications.}, bibtype = {article}, author = {Huang, Yao-Hui and Hsieh, Jun-Wei and Chang, Ming-Ching and Ke, Lipeng and Lyu, Siwei and Santra, Arpita Samanta} }
@article{ title = {OODformer: Out-Of-Distribution Detection Transformer}, type = {article}, year = {2021}, websites = {https://github.com/rajatkoner08/oodformer.}, id = {59a23fd0-d1f9-32e5-944f-67758b105b39}, created = {2021-11-23T08:19:13.339Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:19:16.779Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {A serious problem in image classification is that a trained model might perform well for input data that originates from the same distribution as the data available for model training, but performs much worse for out-of-distribution (OOD) samples. In real-world safety-critical applications, in particular, it is important to be aware if a new data point is OOD. To date, OOD detection is typically addressed using either confidence scores, auto-encoder based reconstruction, or contrastive learning. However, the global image context has not yet been explored to discriminate the non-local objectness between in-distribution and OOD samples. This paper proposes a first-of-its-kind OOD detection architecture named OODformer that leverages the contextualization capabilities of the transformer. Incorporating the transformer as the principal feature extractor allows us to exploit the object concepts and their discriminatory attributes along with their co-occurrence via visual attention. Based on contextualised embedding, we demonstrate OOD detection using both class-conditioned latent space similarity and a network confidence score. Our approach shows improved generalizability across various datasets. We have achieved a new state-of-the-art result on CIFAR-10/-100 and ImageNet30. Code is available at : https://github.com/rajatkoner08/oodformer.}, bibtype = {article}, author = {Koner, Rajat and Sinhamahapatra, Poulami and Roscher, Karsten and Günnemann, Stephan and Tresp, Volker} }
@article{ title = {Transformer-based Monocular Depth Estimation with Attention Supervision}, type = {article}, year = {2021}, websites = {https://github.com/WJ-Chang-42/ASTransformer.}, id = {86494662-a547-383f-b7c0-8a86cd76d42f}, created = {2021-11-23T08:19:19.453Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:19:24.846Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {Transformer, which excels in capturing long-range dependencies, has shown great performance in a variety of computer vision tasks. In this paper, we propose a hybrid network with a Transformer-based encoder and a CNN-based decoder for monocular depth estimation. The encoder follows the architecture of classical Vision Transformer. To better exploit the potential of the Transformer encoder, we introduce the Attention Supervision to the Transformer layer, which enhances the representative ability. The down-sampling operations before the Transformer encoder lead to degradation of the details in the predicted depth map. Thus, we devise an Attention-based Up-sample Block and deploy it to compensate the texture features. Experiments on both indoor and outdoor datasets demonstrate that the proposed method achieves the state-of-the-art performance on both quantitative and qualitative evaluations. The source code and trained models can be downloaded at https://github.com/WJ-Chang-42/ASTransformer.}, bibtype = {article}, author = {Chang, Wenjie and Zhang, Yueyi and Xiong, Zhiwei} }
@article{ title = {Few-Shot Temporal Action Localization with Query Adaptive Transformer}, type = {article}, year = {2021}, websites = {https://github.com/sauradip/fewshotQAT}, id = {bf359bae-3b9d-3d2a-9ffd-1e14b65e549f}, created = {2021-11-23T08:19:31.278Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:19:34.109Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {Existing temporal action localization (TAL) works rely on a large number of training videos with exhaustive segment-level annotation, preventing them from scaling to new classes. As a solution to this problem, few-shot TAL (FS-TAL) aims to adapt a model to a new class represented by as few as a single video. Existing FS-TAL methods assume trimmed training videos for new classes. However, this setting is not only unnatural (actions are typically captured in untrimmed videos), but it also ignores background video segments containing vital contextual cues for foreground action segmentation. In this work, we first propose a new FS-TAL setting that uses untrimmed training videos. Further, a novel FS-TAL model is proposed which maximizes the knowledge transfer from training classes whilst enabling the model to be dynamically adapted to both the new class and each video of that class simultaneously. This is achieved by introducing a query adaptive Transformer in the model. Extensive experiments on two action localization benchmarks demonstrate that our method can outperform all the state-of-the-art alternatives significantly in both single-domain and cross-domain scenarios. The source code can be found at https://github.com/sauradip/fewshotQAT}, bibtype = {article}, author = {Nag, Sauradip and Zhu, Xiatian and Xiang, Tao} }
@article{ title = {PS-Transformer: Learning Sparse Photometric Stereo Network using Self-Attention Mechanism}, type = {article}, year = {2021}, websites = {https://satoshi-ikehata.github.io/}, id = {6ce4ec11-c7a9-39eb-af39-f75ff339dc96}, created = {2021-11-23T08:19:36.456Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:19:39.730Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {Existing deep calibrated photometric stereo networks basically aggregate observations under different lights based on pre-defined operations such as linear projection and max pooling. While they are effective with dense capture, simple first-order operations often fail to capture the high-order interactions among observations under a small number of different lights. To tackle this issue, this paper presents a deep sparse calibrated photometric stereo network named PS-Transformer which leverages the learnable self-attention mechanism to properly capture the complex inter-image interactions. PS-Transformer builds upon a dual-branch design to explore both pixel-wise and image-wise features, and each individual feature is trained with intermediate surface normal supervision to maximize geometric feasibility. A new synthetic dataset named CyclesPS+ is also presented, with a comprehensive analysis, to successfully train photometric stereo networks. Extensive results on the publicly available benchmark datasets demonstrate that the surface normal prediction accuracy of the proposed method significantly outperforms other state-of-the-art algorithms with the same number of input images and is even comparable to that of dense algorithms which take a 10× larger number of images as input.}, bibtype = {article}, author = {Ikehata, Satoshi} }
@article{ title = {Training Graph Neural Networks with 1000 Layers}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2106.07476}, id = {d649ab2e-9617-3883-b3db-9fe0e1d2922c}, created = {2021-12-09T14:47:57.163Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-12-09T14:48:02.742Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {13f2c27e-5827-43b2-8a2b-d62c62bc0ecc}, private_publication = {false}, abstract = {Deep graph neural networks (GNNs) have achieved excellent results on various tasks on increasingly large graph datasets with millions of nodes and edges. However, memory complexity has become a major obstacle when training deep GNNs for practical applications due to the immense number of nodes, edges, and intermediate activations. To improve the scalability of GNNs, prior works propose smart graph sampling or partitioning strategies to train GNNs with a smaller set of nodes or sub-graphs. In this work, we study reversible connections, group convolutions, weight tying, and equilibrium models to advance the memory and parameter efficiency of GNNs. We find that reversible connections in combination with deep network architectures enable the training of overparameterized GNNs that significantly outperform existing methods on multiple datasets. Our models RevGNN-Deep (1001 layers with 80 channels each) and RevGNN-Wide (448 layers with 224 channels each) were both trained on a single commodity GPU and achieve an ROC-AUC of $87.74 \pm 0.13$ and $88.24 \pm 0.15$ on the ogbn-proteins dataset. To the best of our knowledge, RevGNN-Deep is the deepest GNN in the literature by one order of magnitude. Please visit our project website https://www.deepgcns.org/arch/gnn1000 for more information.}, bibtype = {article}, author = {Li, Guohao and Müller, Matthias and Ghanem, Bernard and Koltun, Vladlen} }
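Editor's note (illustration only, not the RevGNN implementation from the entry above): that work relies on reversible connections, where node features are split into two groups and each group is updated additively from the other, so layer inputs can be recomputed from outputs during backpropagation instead of being cached. A minimal hedged sketch in Python/NumPy, with hypothetical names and stand-in transforms:

import numpy as np

def reversible_forward(x1, x2, f, g):
    """Forward pass of one reversible block: y1 = x1 + F(x2), y2 = x2 + G(y1)."""
    y1 = x1 + f(x2)
    y2 = x2 + g(y1)
    return y1, y2

def reversible_inverse(y1, y2, f, g):
    """Exactly reconstruct the block inputs from its outputs, which is what
    lets a very deep network avoid storing per-layer activations."""
    x2 = y2 - g(y1)
    x1 = y1 - f(x2)
    return x1, x2

# Round-trip check with arbitrary stand-in transforms for the two feature groups.
rng = np.random.default_rng(0)
x1, x2 = rng.normal(size=(5, 8)), rng.normal(size=(5, 8))
f = lambda z: np.tanh(z)   # stand-in for a graph convolution applied to group 2
g = lambda z: 0.5 * z      # stand-in for a graph convolution applied to group 1
assert np.allclose((x1, x2), reversible_inverse(*reversible_forward(x1, x2, f, g), f, g))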
@article{ title = {Adversarial Attack on Graph Neural Networks as An Influence Maximization Problem}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2106.10785}, id = {493b5d2d-9763-3326-94c5-faacbc88615a}, created = {2022-01-05T09:23:15.556Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:26.889Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Graph neural networks (GNNs) have attracted increasing interests. With broad deployments of GNNs in real-world applications, there is an urgent need for understanding the robustness of GNNs under adversarial attacks, especially in realistic setups. In this work, we study the problem of attacking GNNs in a restricted and realistic setup, by perturbing the features of a small set of nodes, with no access to model parameters and model predictions. Our formal analysis draws a connection between this type of attacks and an influence maximization problem on the graph. This connection not only enhances our understanding on the problem of adversarial attack on GNNs, but also allows us to propose a group of effective and practical attack strategies. Our experiments verify that the proposed attack strategies significantly degrade the performance of three popular GNN models and outperform baseline adversarial attack strategies.}, bibtype = {article}, author = {Ma, Jiaqi and Deng, Junwei and Mei, Qiaozhu} }
@article{ title = {Adversarial Attacks on Graph Classification via Bayesian Optimisation}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2111.02842}, id = {e90f4522-1c23-3ff2-bcc1-17e9e770699e}, created = {2022-01-05T09:23:15.561Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:11.812Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Graph neural networks, a popular class of models effective in a wide range of graph-based learning tasks, have been shown to be vulnerable to adversarial attacks. While the majority of the literature focuses on such vulnerability in node-level classification tasks, little effort has been dedicated to analysing adversarial attacks on graph-level classification, an important problem with numerous real-life applications such as biochemistry and social network analysis. The few existing methods often require unrealistic setups, such as access to internal information of the victim models, or an impractically-large number of queries. We present a novel Bayesian optimisation-based attack method for graph classification models. Our method is black-box, query-efficient and parsimonious with respect to the perturbation applied. We empirically validate the effectiveness and flexibility of the proposed method on a wide range of graph classification tasks involving varying graph properties, constraints and modes of attack. Finally, we analyse common interpretable patterns behind the adversarial samples produced, which may shed further light on the adversarial robustness of graph classification models.}, bibtype = {article}, author = {Wan, Xingchen and Kenlay, Henry and Ru, Binxin and Blaas, Arno and Osborne, Michael A. and Dong, Xiaowen}, number = {NeurIPS} }
@article{ title = {Robust graph convolutional networks with directional graph adversarial training}, type = {article}, year = {2021}, keywords = {Adversarial training,Graph adversarial learning,Graph convolutional networks,Robustness}, pages = {7812-7826}, volume = {51}, publisher = {Applied Intelligence}, id = {37c0232c-275e-35cc-b492-108d13816464}, created = {2022-01-05T09:23:15.677Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:29.056Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Graph convolutional networks (GCNs), an emerging type of neural network model on graphs, have presented state-of-the-art performance on the node classification task. However, recent studies show that neural networks are vulnerable to the small but deliberate perturbations on input features. And GCNs could be more sensitive to the perturbations since the perturbations from neighbor nodes exacerbate the impact on a target node through the convolution. Adversarial training (AT) is a regularization technique that has been shown capable of improving the robustness of the model against perturbations on image classification. However, directly adopting AT on GCNs is less effective since AT regards examples as independent of each other and does not consider the impact from connected examples. In this work, we explore AT on graph and propose a graph-specific AT method, Directional Graph Adversarial Training (DGAT), which incorporates the graph structure into the adversarial process and automatically identifies the impact of perturbations from neighbor nodes. Concretely, we consider the impact from the connected nodes to define the neighbor perturbation which restricts the perturbation direction on node features towards their neighbor nodes, and additionally introduce an adversarial regularizer to defend the worst-case perturbations. In this way, DGAT can resist the impact of worst-case adversarial perturbations and reduce the impact of perturbations from neighbor nodes. Extensive experiments demonstrate that DGAT can effectively improve the robustness and generalization performance of GCNs. Specially, GCNs with DGAT can provide better performance when there are rare few labels available for training.}, bibtype = {article}, author = {Hu, Weibo and Chen, Chuan and Chang, Yaomin and Zheng, Zibin and Du, Yunfei}, doi = {10.1007/s10489-021-02272-y}, journal = {Applied Intelligence}, number = {11} }
@article{ title = {Graph Adversarial Attack via Rewiring}, type = {article}, year = {2021}, keywords = {adversarial attack,graph neural networks,rewiring}, pages = {1161-1169}, id = {9d034f24-6a22-35cc-8097-58813eddcaff}, created = {2022-01-05T09:23:15.687Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:23.073Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Graph Neural Networks (GNNs) have demonstrated their powerful capability in learning representations for graph-structured data. Consequently, they have enhanced the performance of many graph-related tasks such as node classification and graph classification. However, it is evident from recent studies that GNNs are vulnerable to adversarial attacks. Their performance can be largely impaired by deliberately adding carefully created unnoticeable perturbations to the graph. Existing attacking methods often produce perturbation by adding/deleting a few edges, which might be noticeable even when the number of modified edges is small. In this paper, we propose a graph rewiring operation to perform the attack. It can affect the graph in a less noticeable way compared to existing operations such as adding/deleting edges. We then utilize deep reinforcement learning to learn the strategy to effectively perform the rewiring operations. Experiments on real-world graphs demonstrate the effectiveness of the proposed framework. To understand the proposed framework, we further analyze how its generated perturbation impacts the target model and the advantages of the rewiring operations. The implementation of the proposed framework is available at https://github.com/alge24/ReWatt.}, bibtype = {article}, author = {Ma, Yao and Wang, Suhang and Derr, Tyler and Wu, Lingfei and Tang, Jiliang}, doi = {10.1145/3447548.3467416}, journal = {Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery and Data Mining} }
@article{ title = {Task and Model Agnostic Adversarial Attack on Graph Neural Networks}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2112.13267}, id = {13ce2963-423a-3037-b682-f167861bdfd4}, created = {2022-01-05T09:23:15.689Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:24.258Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Graph neural networks (GNNs) have witnessed significant adoption in the industry owing to impressive performance on various predictive tasks. Performance alone, however, is not enough. Any widely deployed machine learning algorithm must be robust to adversarial attacks. In this work, we investigate this aspect for GNNs, identify vulnerabilities, and link them to graph properties that may potentially lead to the development of more secure and robust GNNs. Specifically, we formulate the problem of task and model agnostic evasion attacks where adversaries modify the test graph to affect the performance of any unknown downstream task. The proposed algorithm, GRAND ($Gr$aph $A$ttack via $N$eighborhood $D$istortion) shows that distortion of node neighborhoods is effective in drastically compromising prediction performance. Although neighborhood distortion is an NP-hard problem, GRAND designs an effective heuristic through a novel combination of Graph Isomorphism Network with deep $Q$-learning. Extensive experiments on real datasets show that, on average, GRAND is up to $50\%$ more effective than state of the art techniques, while being more than $100$ times faster.}, bibtype = {article}, author = {Sharma, Kartik and Verma, Samidha and Medya, Sourav and Ranu, Sayan and Bhattacharya, Arnab} }
@article{ title = {Understanding Structural Vulnerability in Graph Convolutional Networks}, type = {article}, year = {2021}, pages = {2249-2255}, id = {9bdc1b50-e4f7-3d41-9360-c2873c4c7f9e}, created = {2022-01-05T09:23:15.839Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:32.747Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Recent studies have shown that Graph Convolutional Networks (GCNs) are vulnerable to adversarial attacks on the graph structure. Although multiple works have been proposed to improve their robustness against such structural adversarial attacks, the reasons for the success of the attacks remain unclear. In this work, we theoretically and empirically demonstrate that structural adversarial examples can be attributed to the non-robust aggregation scheme (i.e., the weighted mean) of GCNs. Specifically, our analysis takes advantage of the breakdown point which can quantitatively measure the robustness of aggregation schemes. The key insight is that weighted mean, as the basic design of GCNs, has a low breakdown point and its output can be dramatically changed by injecting a single edge. We show that adopting the aggregation scheme with a high breakdown point (e.g., median or trimmed mean) could significantly enhance the robustness of GCNs against structural attacks. Extensive experiments on four real-world datasets demonstrate that such a simple but effective method achieves the best robustness performance compared to state-of-the-art models.}, bibtype = {article}, author = {Chen, Liang and Li, Jintang and Peng, Qibiao and Liu, Yang and Zheng, Zibin and Yang, Carl}, doi = {10.24963/ijcai.2021/310} }
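Editor's note (illustration only, not the authors' code): the entry above argues that replacing the weighted-mean aggregation of a GCN layer with a high-breakdown-point aggregator, such as an element-wise median or trimmed mean, makes the output much harder to shift by injecting a single edge. A hedged NumPy sketch of median aggregation, with hypothetical names:

import numpy as np

def median_aggregate(features, neighbors):
    """Element-wise median over each node's neighbourhood (self-loop included).

    features:  (N, d) node feature matrix.
    neighbors: dict mapping node id -> list of neighbour node ids.
    A single adversarially injected edge adds one row to the stacked features
    and can move a mean arbitrarily far, but barely moves a median.
    """
    out = np.zeros_like(features)
    for v, nbrs in neighbors.items():
        stack = features[list(nbrs) + [v]]
        out[v] = np.median(stack, axis=0)
    return out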
@article{ title = {Graph Universal Adversarial Attacks: A Few Bad Actors Ruin Graph Learning Models}, type = {article}, year = {2021}, pages = {3328-3334}, id = {db81a873-a406-3b9b-bfdd-7c76bbcd2d04}, created = {2022-01-05T09:23:16.325Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:23:57.768Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Deep neural networks, while generalize well, are known to be sensitive to small adversarial perturbations. This phenomenon poses severe security threat and calls for in-depth investigation of the robustness of deep learning models. With the emergence of neural networks for graph structured data, similar investigations are urged to understand their robustness. It has been found that adversarially perturbing the graph structure and/or node features may result in a significant degradation of the model performance. In this work, we show from a different angle that such fragility similarly occurs if the graph contains a few bad-actor nodes, which compromise a trained graph neural network through flipping the connections to any targeted victim. Worse, the bad actors found for one graph model severely compromise other models as well. We call the bad actors ``anchor nodes'' and propose an algorithm, named GUA, to identify them. Thorough empirical investigations suggest an interesting finding that the anchor nodes often belong to the same class; and they also corroborate the intuitive trade-off between the number of anchor nodes and the attack success rate. For the dataset Cora which contains 2708 nodes, as few as six anchor nodes will result in an attack success rate higher than 80% for GCN and other three models.}, bibtype = {article}, author = {Zang, Xiao and Xie, Yi and Chen, Jie and Yuan, Bo}, doi = {10.24963/ijcai.2021/458} }
@article{ title = {Adversarial Attacks and Defenses on Graphs}, type = {article}, year = {2021}, pages = {19-34}, volume = {22}, id = {d75d4e71-09a2-3d8c-b8a3-a138ba09791a}, created = {2022-01-05T09:23:16.452Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:23:51.812Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Deep neural networks (DNNs) have achieved significant performance in various tasks. However, recent studies have shown that DNNs can be easily fooled by small perturbation on the input, called adversarial attacks.}, bibtype = {article}, author = {Jin, Wei and Li, Yaxing and Xu, Han and Wang, Yiqi and Ji, Shuiwang and Aggarwal, Charu and Tang, Jiliang}, doi = {10.1145/3447556.3447566}, journal = {ACM SIGKDD Explorations Newsletter}, number = {2} }
@article{ title = {Detection and Defense of Topological Adversarial Attacks on Graphs}, type = {article}, year = {2021}, pages = {2989-2997}, volume = {130}, websites = {http://proceedings.mlr.press/v130/zhang21i.html}, id = {5cc56d4e-a026-36ac-bfc6-4363f81c109d}, created = {2022-01-05T09:23:16.472Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:02.878Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Graph neural network (GNN) models achieve superior performance when classifying nodes in graph-structured data. Given that state-of-the-art GNNs share many similarities with their CNN cousins and that CNNs suffer adversarial vulnerabilities, there has also been interest in exploring analogous vulnerabilities in GNNs. Indeed, recent work has demonstrated that node classification performance of several graph models, including the popular graph convolution network (GCN) model, can be severely degraded through adversarial perturbations to the graph structure and the node features. In this work, we take a first step towards detecting adversarial attacks against graph models. We first propose a straightforward single node threshold test for detecting nodes subject to targeted attacks. Subsequently, we describe a kernel-based two-sample test for detecting whether a given subset of nodes within a graph has been maliciously corrupted. The efficacy of our algorithms is established via thorough experiments using commonly used node classification benchmark datasets. We also illustrate the potential practical benefit of our detection method by demonstrating its application to a real-world Bitcoin transaction network.}, bibtype = {article}, author = {Zhang, Yingxue and Regol, Florence and Pal, Soumyasundar and Khan, Sakif and Ma, Liheng and Coates, Mark}, journal = {Proceedings of the 24th International Conference on Artificial Intelligence and Statistics (AISTATS)} }
@article{ title = {Jointly Attacking Graph Neural Network and its Explanations}, type = {article}, year = {2021}, pages = {1-17}, websites = {http://arxiv.org/abs/2108.03388}, id = {a8b82ff9-dafe-3e30-b1a6-570cfbc94990}, created = {2022-01-05T09:23:16.511Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:06.090Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Graph Neural Networks (GNNs) have boosted the performance for many graph-related tasks. Despite the great success, recent studies have shown that GNNs are highly vulnerable to adversarial attacks, where adversaries can mislead the GNNs' prediction by modifying graphs. On the other hand, the explanation of GNNs (GNNExplainer) provides a better understanding of a trained GNN model by generating a small subgraph and features that are most influential for its prediction. In this paper, we first perform empirical studies to validate that GNNExplainer can act as an inspection tool and have the potential to detect the adversarial perturbations for graphs. This finding motivates us to further initiate a new problem investigation: Whether a graph neural network and its explanations can be jointly attacked by modifying graphs with malicious desires? It is challenging to answer this question since the goals of adversarial attacks and bypassing the GNNExplainer essentially contradict each other. In this work, we give a confirmative answer to this question by proposing a novel attack framework (GEAttack), which can attack both a GNN model and its explanations by simultaneously exploiting their vulnerabilities. Extensive experiments on two explainers (GNNExplainer and PGExplainer) under various real-world datasets demonstrate the effectiveness of the proposed method.}, bibtype = {article}, author = {Fan, Wenqi and Jin, Wei and Liu, Xiaorui and Xu, Han and Tang, Xianfeng and Wang, Suhang and Li, Qing and Tang, Jiliang and Wang, Jianping and Aggarwal, Charu} }
@article{ title = {Expressive 1-Lipschitz Neural Networks for Robust Multiple Graph Learning against Adversarial Attacks}, type = {article}, year = {2021}, pages = {12719-12735}, volume = {139}, websites = {https://proceedings.mlr.press/v139/zhao21e.html}, id = {a44cbbf2-9194-34a2-925c-c6f14589db3f}, created = {2022-01-05T09:23:16.590Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:23:58.577Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Recent findings have shown multiple graph learning models, such as graph classification and graph matching, are highly vulnerable to adversarial attacks, i.e. small input perturbations in graph structures and node attributes can cause the model failures. Existing defense techniques often defend specific attacks on particular multiple graph learning tasks. This paper proposes an attack-agnostic graph-adaptive 1-Lipschitz neural network, ERNN, for improving the robustness of deep multiple graph learning while achieving remarkable expressive power. A K_l-Lipschitz Weibull activation function is designed to enforce the gradient norm as K_l at layer l. The nearest matrix orthogonalization and polar decomposition techniques are utilized to constraint the weight norm as 1/K_l and make the norm-constrained weight close to the original weight. The theoretical analysis is conducted to derive lower and upper bounds of feasible K_l under the 1-Lipschitz constraint. The combination of norm-constrained weight and activation function leads to the 1-Lipschitz neural network for expressive and robust multiple graph learning.}, bibtype = {article}, author = {Zhao, Xin and Zhang, Zeru and Zhang, Zijie and Wu, Lingfei and Jin, Jiayin and Zhou, Yang and Jin, Ruoming and Dou, Dejing and Yan, Da}, journal = {Proceedings of the 38th International Conference on Machine Learning} }
@article{ title = {Generating Adversarial Examples with Graph Neural Networks}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2105.14644}, id = {8bc7d272-1df0-3ac9-9e64-90997b163f71}, created = {2022-01-05T09:23:16.689Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:14.393Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Recent years have witnessed the deployment of adversarial attacks to evaluate the robustness of Neural Networks. Past work in this field has relied on traditional optimization algorithms that ignore the inherent structure of the problem and data, or generative methods that rely purely on learning and often fail to generate adversarial examples where they are hard to find. To alleviate these deficiencies, we propose a novel attack based on a graph neural network (GNN) that takes advantage of the strengths of both approaches; we call it AdvGNN. Our GNN architecture closely resembles the network we wish to attack. During inference, we perform forward-backward passes through the GNN layers to guide an iterative procedure towards adversarial examples. During training, its parameters are estimated via a loss function that encourages the efficient computation of adversarial examples over a time horizon. We show that our method beats state-of-the-art adversarial attacks, including PGD-attack, MI-FGSM, and Carlini and Wagner attack, reducing the time required to generate adversarial examples with small perturbation norms by over 65\%. Moreover, AdvGNN achieves good generalization performance on unseen networks. Finally, we provide a new challenging dataset specifically designed to allow for a more illustrative comparison of adversarial attacks.}, bibtype = {article}, author = {Jaeckle, Florian and Kumar, M. Pawan}, number = {Uai} }
@article{ title = {Adversarial Attack on Large Scale Graph}, type = {article}, year = {2021}, keywords = {Adversarial attack,Data models,Deep learning,Efficient attack,Graph neural networks,Measurement,Network robustness,Node classification,Perturbation methods,Robustness,Task analysis}, pages = {1-13}, volume = {4347}, id = {2140e7f5-a048-316f-9ee6-e2c957881a6a}, created = {2022-01-05T09:23:16.769Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:09.247Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Recent studies have shown that graph neural networks (GNNs) are vulnerable against perturbations due to lack of robustness and can therefore be easily fooled. Currently, most works on attacking GNNs are mainly using gradient information to guide the attack and achieve outstanding performance. However, the high complexity of time and space makes them unmanageable for large scale graphs and becomes the major bottleneck that prevents the practical usage. We argue that the main reason is that they have to use the whole graph for attacks, resulting in the increasing time and space complexity as the data scale grows. In this work, we propose an efficient Simplified Gradient-based Attack (SGA) method to bridge this gap. SGA can cause the GNNs to misclassify specific target nodes through a multi-stage attack framework, which needs only a much smaller subgraph. In addition, we present a practical metric named Degree Assortativity Change (DAC) to measure the impacts of adversarial attacks on graph data. We evaluate our attack method on four real-world graph networks by attacking several commonly used GNNs. The experimental results demonstrate that SGA can achieve significant time and memory efficiency improvements while maintaining competitive attack performance compared to state-of-art attack techniques.}, bibtype = {article}, author = {Li, Jintang and Xie, Tao and Liang, Chen and Xie, Fenfang and He, Xiangnan and Zheng, Zibin}, doi = {10.1109/TKDE.2021.3078755}, journal = {IEEE Transactions on Knowledge and Data Engineering}, number = {c} }
@article{ title = {VoxelContext-Net: An Octree based Framework for Point Cloud Compression}, type = {article}, year = {2021}, pages = {6038-6047}, id = {093ccb69-5e87-336e-95fb-91ff1f6c5ea0}, created = {2022-01-11T11:30:16.906Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-11T11:30:23.434Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {9972d981-f25e-4229-94fb-1c4fc6296c30}, private_publication = {false}, abstract = {In this paper, we propose a two-stage deep learning framework called VoxelContext-Net for both static and dynamic point cloud compression. Taking advantages of both octree based methods and voxel based schemes, our approach employs the voxel context to compress the octree structured data. Specifically, we first extract the local voxel representation that encodes the spatial neighbouring context information for each node in the constructed octree. Then, in the entropy coding stage, we propose a voxel context based deep entropy model to compress the symbols of non-leaf nodes in a lossless way. Furthermore, for dynamic point cloud compression, we additionally introduce the local voxel representations from the temporal neighbouring point clouds to exploit temporal dependency. More importantly, to alleviate the distortion from the octree construction procedure, we propose a voxel context based 3D coordinate refinement method to produce more accurate reconstructed point cloud at the decoder side, which is applicable to both static and dynamic point cloud compression. The comprehensive experiments on both static and dynamic point cloud benchmark datasets(e.g., ScanNet and Semantic KITTI) clearly demonstrate the effectiveness of our newly proposed method VoxelContext-Net for 3D point cloud geometry compression.}, bibtype = {article}, author = {Que, Zizheng and Lu, Guo and Xu, Dong}, doi = {10.1109/cvpr46437.2021.00598} }
@article{ title = {Learning-based lossless compression of 3D point cloud geometry}, type = {article}, year = {2021}, keywords = {Context model,Deep learning,G-PCC,Point cloud compression}, pages = {4220-4224}, volume = {2021-June}, id = {b517e3b8-6bcf-318a-8c7f-b556bb84fd35}, created = {2022-01-11T11:30:16.910Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-11T11:30:21.121Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {9972d981-f25e-4229-94fb-1c4fc6296c30}, private_publication = {false}, abstract = {This paper presents a learning-based, lossless compression method for static point cloud geometry, based on context-adaptive arithmetic coding. Unlike most existing methods working in the octree domain, our encoder operates in a hybrid mode, mixing octree and voxel-based coding. We adaptively partition the point cloud into multi-resolution voxel blocks according to the point cloud structure, and use octree to signal the partitioning. On the one hand, octree representation can eliminate the sparsity in the point cloud. On the other hand, in the voxel domain, convolutions can be naturally expressed, and geometric information (i.e., planes, surfaces, etc.) is explicitly processed by a neural network. Our context model benefits from these properties and learns a probability distribution of the voxels using a deep convolutional neural network with masked filters, called VoxelDNN. Experiments show that our method outperforms the state-of-the-art MPEG G-PCC standard with average rate savings of 28% on a diverse set of point clouds from the Microsoft Voxelized Upper Bodies (MVUB) and MPEG. The implementation is available at https://github.com/Weafre/VoxelDNN.}, bibtype = {article}, author = {Nguyen, Dat Thanh and Quach, Maurice and Valenzise, Giuseppe and Duhamel, Pierre}, doi = {10.1109/ICASSP39728.2021.9414763}, journal = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings} }
@article{ title = {Point cloud classification with deep normalized Reeb graph convolution}, type = {article}, year = {2021}, keywords = {Graph normalization,Point cloud,Reeb graph}, pages = {104092}, volume = {106}, websites = {https://doi.org/10.1016/j.imavis.2020.104092}, publisher = {Elsevier B.V.}, id = {07fec0c5-97f6-3b4f-9413-6de62df9c445}, created = {2022-01-14T16:04:12.051Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-05-02T08:15:04.941Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Wang2021}, private_publication = {false}, abstract = {Recently, plenty of deep learning methods have been proposed to handle point clouds. Almost all of them input the entire point cloud and ignore the information redundancy lying in point clouds. This paper addresses this problem by extracting the Reeb graph from point clouds, which is a much more informative and compact representation of point clouds, and then filter the graph with deep graph convolution. To be able to classify or segment point clouds well, we propose (1) Graph Normalization to transform various graphs into a canonical graph space; (2) Normalized Similarity Distance to better identify the graph structure;(3) Reeb Graph Guided Node Pooling in order to aggregate high-level features from kNN graphs. Besides, our method naturally fits into the problem of classifying point clouds with unknown orientations. In the results, we show that our method gives a competitive performance to the state-of-the-art methods and outperforms previous methods by a large margin on handling point clouds with unknown orientations.}, bibtype = {article}, author = {Wang, Weiming and You, Yang and Liu, Wenhai and Lu, Cewu}, doi = {10.1016/j.imavis.2020.104092}, journal = {Image and Vision Computing} }
@article{ title = {Graph Neural Networks with Convolutional ARMA Filters}, type = {article}, year = {2021}, keywords = {Chebyshev approximation,Convolution,Eigenvalues and eigenfunctions,Frequency response,Geometric Deep Learning,Graph Filters,Graph Neural Networks,Graph Signal Processing,Graph Theory,Graph neural networks,Laplace equations,Task analysis}, pages = {1-12}, volume = {8828}, id = {2d88d1ab-1a7e-318f-9ed3-0bf77faa8a37}, created = {2022-01-14T16:04:12.063Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:04:34.490Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Popular graph neural networks implement convolution operations on graphs based on polynomial spectral filters. In this paper, we propose a novel graph convolutional layer inspired by the auto-regressive moving average (ARMA) filter that, compared to polynomial ones, provides a more flexible frequency response, is more robust to noise, and better captures the global graph structure. We propose a graph neural network implementation of the ARMA filter with a recursive and distributed formulation, obtaining a convolutional layer that is efficient to train, localized in the node space, and can be transferred to new graphs at test time. We perform a spectral analysis to study the filtering effect of the proposed ARMA layer and report experiments on four downstream tasks: semi-supervised node classification, graph signal classification, graph classification, and graph regression. Results show that the proposed ARMA layer brings significant improvements over graph neural networks based on polynomial filters.}, bibtype = {article}, author = {Bianchi, Filippo Maria and Grattarola, Daniele and Livi, Lorenzo and Alippi, Cesare}, doi = {10.1109/TPAMI.2021.3054830}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, number = {c} }
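Editor's note (paraphrase and illustration only, not the authors' implementation): an ARMA-style graph convolutional layer such as the one described in the entry above is commonly realized as K parallel first-order recursions whose outputs are averaged; each recursion repeatedly mixes the propagated state with a skip connection from the raw input. A hedged NumPy sketch under those assumptions, with the propagation operator L_hat left abstract (e.g. a symmetrically normalized adjacency) and all names hypothetical:

import numpy as np

def arma_stack(X, L_hat, W, V, T=8):
    """One first-order ARMA-style recursion: X_bar <- relu(L_hat @ X_bar @ W + X @ V).

    X: (N, d_in) node features, L_hat: (N, N) propagation operator,
    W: (d_out, d_out), V: (d_in, d_out). Returns the state after T iterations.
    """
    X_bar = X @ V                       # initialise from the skip branch
    for _ in range(T):
        X_bar = np.maximum(L_hat @ X_bar @ W + X @ V, 0.0)
    return X_bar

def arma_layer(X, L_hat, weights, T=8):
    """Average K parallel stacks; `weights` is a list of (W_k, V_k) pairs."""
    return np.mean([arma_stack(X, L_hat, W, V, T) for W, V in weights], axis=0)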
@article{ title = {On the stability of graph convolutional neural networks under edge rewiring}, type = {article}, year = {2021}, keywords = {Graph convolutional neural networks,Graph signal processing,Spectral graph filters,Stability}, pages = {8513-8517}, volume = {2021-June}, id = {c7e4d020-eff0-3ec3-8c05-72b666870e64}, created = {2022-01-14T16:04:12.174Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:04:42.785Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Graph neural networks are experiencing a surge of popularity within the machine learning community due to their ability to adapt to non-Euclidean domains and instil inductive biases. Despite this, their stability, i.e., their robustness to small perturbations in the input, is not yet well understood. Although there exists some results showing the stability of graph neural networks, most take the form of an upper bound on the magnitude of change due to a perturbation in the graph topology. However, the change in the graph topology captured in existing bounds tend not to be expressed in terms of structural properties, limiting our understanding of the model robustness properties. In this work, we develop an interpretable upper bound elucidating that graph neural networks are stable to rewiring between high degree nodes. This bound and further research in bounds of similar type provide further understanding of the stability properties of graph neural networks.}, bibtype = {article}, author = {Kenlay, Henry and Thanou, Dorina and Dong, Xiaowen}, doi = {10.1109/ICASSP39728.2021.9413474}, journal = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings} }
@article{ title = {Modelling and studying the effect of graph errors in graph signal processing}, type = {article}, year = {2021}, keywords = {Erdös-Rényi graphs,Error effect,Graph signal processing,Minimum distance index,Shift matrix}, pages = {108256}, volume = {189}, websites = {https://doi.org/10.1016/j.sigpro.2021.108256}, publisher = {Elsevier B.V.}, id = {20e656de-4259-391a-ac1e-fe2251dacd0a}, created = {2022-01-14T16:04:12.193Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:04:46.215Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {The first step for any graph signal processing (GSP) procedure is to learn the graph signal representation, i.e., to capture the dependence structure of the data into an adjacency matrix. Indeed, the adjacency matrix is typically not known a priori and has to be learned. However, it is learned with errors. A little attention has been paid to modelling such errors in the adjacency matrix, and studying their effects on GSP methods. However, modelling errors in the adjacency matrix will enable both to study the graph error effects in GSP and to develop robust GSP algorithms. In this paper, we therefore introduce practically justifiable graph error models. We also study, both analytically when possible and numerically, the graph error effect on the performance of GSP methods in different types of problems such as filtering of graph signals and independent component analysis of graph signals (graph decorrelation).}, bibtype = {article}, author = {Miettinen, Jari and Vorobyov, Sergiy A. and Ollila, Esa}, doi = {10.1016/j.sigpro.2021.108256}, journal = {Signal Processing} }
@article{ title = {Interpretable Stability Bounds for Spectral Graph Filters}, type = {article}, year = {2021}, pages = {1-29}, websites = {http://arxiv.org/abs/2102.09587}, id = {ab01b562-656d-39fc-8fee-9a28720f6122}, created = {2022-01-14T16:04:12.211Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:04:43.984Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Graph-structured data arise in a variety of real-world context ranging from sensor and transportation to biological and social networks. As a ubiquitous tool to process graph-structured data, spectral graph filters have been used to solve common tasks such as denoising and anomaly detection, as well as design deep learning architectures such as graph neural networks. Despite being an important tool, there is a lack of theoretical understanding of the stability properties of spectral graph filters, which are important for designing robust machine learning models. In this paper, we study filter stability and provide a novel and interpretable upper bound on the change of filter output, where the bound is expressed in terms of the endpoint degrees of the deleted and newly added edges, as well as the spatial proximity of those edges. This upper bound allows us to reason, in terms of structural properties of the graph, when a spectral graph filter will be stable. We further perform extensive experiments to verify intuition that can be gained from the bound.}, bibtype = {article}, author = {Kenlay, Henry and Thanou, Dorina and Dong, Xiaowen}, number = {1997} }
@article{ title = {On the Stability of Low Pass Graph Filter With a Large Number of Edge Rewires}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2110.07234}, id = {fa6f31ad-27dd-3580-99d4-5ccd6fab4380}, created = {2022-01-14T16:04:12.248Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:04:47.084Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Recently, the stability of graph filters has been studied as one of the key theoretical properties driving the highly successful graph convolutional neural networks (GCNs). The stability of a graph filter characterizes the effect of topology perturbation on the output of a graph filter, a fundamental building block for GCNs. Many existing results have focused on the regime of small perturbation with a small number of edge rewires. However, the number of edge rewires can be large in many applications. To study the latter case, this work departs from the previous analysis and proves a bound on the stability of graph filter relying on the filter's frequency response. Assuming the graph filter is low pass, we show that the stability of the filter depends on perturbation to the community structure. As an application, we show that for stochastic block model graphs, the graph filter distance converges to zero when the number of nodes approaches infinity. Numerical simulations validate our findings.}, bibtype = {article}, author = {Nguyen, Hoang-Son and He, Yiran and Wai, Hoi-To} }
@article{ title = {Point Cloud Processing based on Graph Neural Network}, type = {article}, year = {2021}, id = {0e2e1132-4d51-3996-9ec6-85ab48424ec5}, created = {2022-01-14T16:04:12.318Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:56:59.418Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Lin, Manxi} }
@article{ title = {Graph convolutional networks for graphs containing missing features}, type = {article}, year = {2021}, keywords = {GCN,Graph convolutional neural network,Graph embedding,Incomplete data,Missing data,Network representation learning}, pages = {155-168}, volume = {117}, websites = {https://doi.org/10.1016/j.future.2020.11.016}, publisher = {Elsevier B.V.}, id = {f54f1ccb-aa23-3a4c-b813-3957cbb2ddb8}, created = {2022-01-14T16:04:12.422Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:56:59.308Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Graph Convolutional Network (GCN) has experienced great success in graph analysis tasks. It works by smoothing the node features across the graph. The current GCN models overwhelmingly assume that the node feature information is complete. However, real-world graph data are often incomplete and contain missing features. Traditionally, people have to estimate and fill in the unknown features based on imputation techniques and then apply GCN. However, the processes of feature filling and graph learning are separated, resulting in degraded and unstable performance. This problem becomes more serious when a large number of features are missing. We propose an approach that adapts GCN to graphs containing missing features. In contrast to the traditional strategy, our approach integrates the processing of missing features and graph learning within the same neural network architecture. Our idea is to represent the missing data by a Gaussian Mixture Model (GMM) and calculate the expected activation of neurons in the first hidden layer of GCN, while keeping the other layers of the network unchanged. This enables us to learn the GMM parameters and network weight parameters in an end-to-end manner. Notably, our approach does not increase the computational complexity of GCN and it is consistent with GCN when the features are complete. We demonstrate through extensive experiments that our approach significantly outperforms the imputation based methods in node classification and link prediction tasks. We show that the performance of our approach for the case with a low level of missing features is even superior to GCN for the case with complete features.}, bibtype = {article}, author = {Taguchi, Hibiki and Liu, Xin and Murata, Tsuyoshi}, doi = {10.1016/j.future.2020.11.016}, journal = {Future Generation Computer Systems} }
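Editor's note: the approach above computes the expected activation of first-layer neurons when missing inputs are modelled by a Gaussian mixture. A minimal sketch of the underlying closed form for a single Gaussian component is given below: E[ReLU(z)] for z ~ N(mu, sigma^2), verified by Monte Carlo. The GMM fitting and the GCN wiring are omitted; all parameter values are illustrative.

# Closed-form expected ReLU activation for a Gaussian pre-activation:
# E[max(0, z)] = mu * Phi(mu/sigma) + sigma * phi(mu/sigma).
import numpy as np
from scipy.stats import norm

def expected_relu(mu, sigma):
    mu, sigma = np.asarray(mu, float), np.asarray(sigma, float)
    t = mu / sigma
    return mu * norm.cdf(t) + sigma * norm.pdf(t)

# Monte Carlo check of the formula with illustrative parameters.
rng = np.random.default_rng(0)
mu, sigma = 0.3, 1.2
samples = np.maximum(0.0, rng.normal(mu, sigma, size=1_000_000))
print(expected_relu(mu, sigma), samples.mean())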
@article{ title = {Structural Features In Feature Space For Structure-Aware Graph Convolution}, type = {article}, year = {2021}, pages = {3158-3162}, publisher = {IEEE}, id = {ce40d005-9b5f-367b-8dc4-adf1851ce165}, created = {2022-01-14T16:04:12.451Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:56:59.483Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Li, Yang and Tanaka, Yuichi}, doi = {10.1109/icip42928.2021.9506377} }
@article{ title = {A weakly supervised framework for real-world point cloud classification}, type = {article}, year = {2021}, keywords = {point cloud classification,real-world point cloud,weakly supervised learning}, pages = {78-88}, volume = {102}, websites = {https://doi.org/10.1016/j.cag.2021.12.008}, publisher = {Elsevier Ltd.}, id = {b869a32a-adb8-300c-a173-9f4e6b378107}, created = {2022-01-14T16:04:12.451Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:56:59.433Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Deng, An and Wu, Yunchao and Zhang, Peng and Lu, Zhuheng and Li, Weiqing and Su, Zhiyong}, doi = {10.1016/j.cag.2021.12.008}, journal = {Computers & Graphics} }
@article{ title = {Beyond Farthest Point Sampling in Point-Wise Analysis}, type = {article}, year = {2021}, pages = {1-12}, websites = {http://arxiv.org/abs/2107.04291}, id = {3ea300b5-8924-346c-8eb4-3cb8a2be9275}, created = {2022-01-14T16:04:12.467Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:56:59.488Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Sampling, grouping, and aggregation are three important components in the multi-scale analysis of point clouds. In this paper, we present a novel data-driven sampler learning strategy for point-wise analysis tasks. Unlike the widely used sampling technique, Farthest Point Sampling (FPS), we propose to learn sampling and downstream applications jointly. Our key insight is that uniform sampling methods like FPS are not always optimal for different tasks: sampling more points around boundary areas can make the point-wise classification easier for segmentation. Towards the end, we propose a novel sampler learning strategy that learns sampling point displacement supervised by task-related ground truth information and can be trained jointly with the underlying tasks. We further demonstrate our methods in various point-wise analysis architectures, including semantic part segmentation, point cloud completion, and keypoint detection. Our experiments show that jointly learning of the sampler and task brings remarkable improvement over previous baseline methods.}, bibtype = {article}, author = {Lin, Yiqun and Chen, Lichang and Huang, Haibin and Ma, Chongyang and Han, Xiaoguang and Cui, Shuguang} }
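Editor's note: for reference, here is a minimal NumPy implementation of the Farthest Point Sampling baseline that the entry above sets out to improve upon. This is the standard textbook version of FPS, not the authors' learned, task-driven sampler; the point cloud and sample count are illustrative.

# Minimal farthest point sampling (FPS): greedily pick the point farthest
# from the set already selected, starting from a random seed point.
import numpy as np

def farthest_point_sampling(points, k, seed=0):
    n = points.shape[0]
    rng = np.random.default_rng(seed)
    selected = [int(rng.integers(n))]             # start from a random point
    dist = np.full(n, np.inf)
    for _ in range(k - 1):
        # distance from every point to the nearest already-selected point
        d = np.linalg.norm(points - points[selected[-1]], axis=1)
        dist = np.minimum(dist, d)
        selected.append(int(dist.argmax()))       # pick the farthest point
    return np.array(selected)

cloud = np.random.default_rng(1).random((2048, 3))
idx = farthest_point_sampling(cloud, k=128)
print(idx.shape, cloud[idx].shape)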
@article{ title = {Cascading Feature Extraction for Fast Point Cloud Registration}, type = {article}, year = {2021}, id = {987e36b8-78c6-3227-9fe7-a80f70b1cc46}, created = {2022-01-18T13:44:08.877Z}, accessed = {2022-01-18}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-18T13:44:12.563Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {We propose a method for speeding up a 3D point cloud registration through a cascading feature extraction. The current approach with the highest accuracy is realized by iteratively executing feature extraction and registration using deep features. However, iterative feature extraction takes time. Our proposed method significantly reduces the computational cost using cascading shallow layers. Our idea is to omit redundant computations that do not always contribute to the final accuracy. The proposed approach is approximately three times faster than the existing methods without a loss of accuracy.}, bibtype = {article}, author = {Hisadome, Yoichiro and Matsui, Yusuke} }
@article{ title = {A novel geometry image to accurately represent a surface by preserving mesh topology}, type = {article}, year = {2021}, pages = {1-9}, volume = {11}, websites = {https://doi.org/10.1038/s41598-021-01722-4}, publisher = {Nature Publishing Group UK}, id = {388c7bf8-6c05-3d9b-9295-c851e059e2b7}, created = {2022-01-25T12:51:06.329Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.563Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Zeng2021}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a}, private_publication = {false}, abstract = {Geometry images parameterise a mesh with a square domain and store the information in a single chart. A one-to-one correspondence between the 2D plane and the 3D model is convenient for processing 3D models. However, the parameterised vertices are not all located at the intersections of the gridlines in existing geometry images. Thus, errors are unavoidable when a 3D mesh is reconstructed from the chart. In this paper, we propose to parameterise a surface onto a novel geometry image that preserves the constraint of topological neighbourhood information at integer coordinate points on a 2D grid and ensures that the shape of the reconstructed 3D mesh does not change from supplemented image data. We find a collection of edges that opens the mesh into a simply connected surface with a single boundary. The point distribution with approximate blue noise spectral characteristics is computed by capacity-constrained Delaunay triangulation without retriangulation. We move the vertices to the constrained mesh intersections, adjust the degenerate triangles on a regular grid, and fill the blank part by performing a local affine transformation between each triangle in the mesh and the image. Unlike other geometry images, the proposed method results in no error in the reconstructed surface model when floating-point data are stored in the image. High reconstruction accuracy is achieved when the xyz positions are in a 16-bit data format in each image channel; because only rounding errors exist in the topology-preserving geometry images, there are no sampling errors. This method performs one-to-one mapping between the 3D surface mesh and the points in the 2D image, while foldovers do not appear in the 2D triangular mesh, maintaining the topological structure. This also shows the potential of using a 2D image processing algorithm to process 3D models.}, bibtype = {article}, author = {Zeng, Sheng and Geng, Guohua and Gao, Hongjuan and Zhou, Mingquan}, doi = {10.1038/s41598-021-01722-4}, journal = {Scientific Reports}, number = {1} }
@article{ title = {Point Cloud Classification Algorithm Based on the Fusion of the Local Binary Pattern Features and Structural Features of Voxels}, type = {article}, year = {2021}, keywords = {classification,local binary pattern,point cloud,voxelization}, pages = {3156}, volume = {13}, websites = {https://www.mdpi.com/2072-4292/13/16/3156/htm,https://www.mdpi.com/2072-4292/13/16/3156}, month = {8}, publisher = {Multidisciplinary Digital Publishing Institute}, day = {10}, id = {38167400-5c02-3ba6-af8f-22ec4d841776}, created = {2022-01-27T08:36:17.922Z}, accessed = {2022-01-27}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-28T07:27:10.300Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {Point cloud classification is a key technology for point cloud applications and point cloud feature extraction is a key step towards achieving point cloud classification. Although there are many point cloud feature extraction and classification methods, and the acquisition of colored point cloud data has become easier in recent years, most point cloud processing algorithms do not consider the color information associated with the point cloud or do not make full use of the color information. Therefore, we propose a voxel-based local feature descriptor based on the voxel-based local binary pattern (VLBP) and fuse point cloud RGB information and geometric structure features using a random forest classifier to build a color point cloud classification algorithm. The proposed algorithm voxelizes the point cloud; divides the neighborhood of the center point into cubes (i.e., multiple adjacent sub-voxels); compares the gray information of the voxel center and adjacent sub-voxels; performs voxel global thresholding to convert it into a binary code; and uses a local difference sign–magnitude transform (LDSMT) to decompose the local difference of an entire voxel into two complementary components of sign and magnitude. Then, the VLBP feature of each point is extracted. To obtain more structural information about the point cloud, the proposed method extracts the normal vector of each point and the corresponding fast point feature histogram (FPFH) based on the normal vector. Finally, the geometric mechanism features (normal vector and FPFH) and color features (RGB and VLBP features) of the point cloud are fused, and a random forest classifier is used to classify the color laser point cloud. The experimental results show that the proposed algorithm can achieve effective point cloud classification for point cloud data from different indoor and outdoor scenes, and the proposed VLBP features can improve the accuracy of point cloud classification.}, bibtype = {article}, author = {Li, Yong and Luo, Yinzheng and Gu, Xia and Chen, Dong and Gao, Fang and Shuang, Feng}, doi = {10.3390/RS13163156}, journal = {Remote Sensing}, number = {16} }
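Editor's note: the VLBP descriptor above thresholds the gray values of a voxel's adjacent sub-voxels against the centre and packs the comparisons into a binary code. A minimal sketch of that thresholding step on a 3x3x3 gray-value block follows; the block size, 26-neighbour layout, and bit ordering are assumptions for illustration, not the paper's exact construction.

# Illustrative local-binary-pattern style code on a voxel neighbourhood:
# each of the 26 neighbours of the centre voxel contributes one bit
# (1 if its gray value >= the centre's).
import numpy as np

def voxel_lbp(block):
    assert block.shape == (3, 3, 3)
    center = block[1, 1, 1]
    code, bit = 0, 0
    for i in range(3):
        for j in range(3):
            for k in range(3):
                if (i, j, k) == (1, 1, 1):
                    continue                      # skip the centre voxel
                code |= int(block[i, j, k] >= center) << bit
                bit += 1
    return code                                   # 26-bit integer code

block = np.random.default_rng(0).random((3, 3, 3))
print(voxel_lbp(block))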
@article{ title = {CoFiNet: Reliable Coarse-to-fine Correspondences for Robust Point Cloud Registration}, type = {article}, year = {2021}, pages = {1-18}, websites = {http://arxiv.org/abs/2110.14076}, id = {863ecfba-ba3e-3035-bdd4-d0a6e4a41813}, created = {2022-02-03T11:00:45.137Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-01-10T11:25:26.167Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d54ba66b-a8cf-41de-8e2d-c3256f322e07}, private_publication = {false}, abstract = {We study the problem of extracting correspondences between a pair of point clouds for registration. For correspondence retrieval, existing works benefit from matching sparse keypoints detected from dense points but usually struggle to guarantee their repeatability. To address this issue, we present CoFiNet - Coarse-to-Fine Network which extracts hierarchical correspondences from coarse to fine without keypoint detection. On a coarse scale and guided by a weighting scheme, our model firstly learns to match down-sampled nodes whose vicinity points share more overlap, which significantly shrinks the search space of a consecutive stage. On a finer scale, node proposals are consecutively expanded to patches that consist of groups of points together with associated descriptors. Point correspondences are then refined from the overlap areas of corresponding patches, by a density-adaptive matching module capable of dealing with varying point density. Extensive evaluation of CoFiNet on both indoor and outdoor standard benchmarks shows our superiority over existing methods. Especially on 3DLoMatch where point clouds share less overlap, CoFiNet significantly outperforms state-of-the-art approaches by at least 5% on Registration Recall, with at most two-thirds of their parameters.}, bibtype = {article}, author = {Yu, Hao and Li, Fu and Saleh, Mahdi and Busam, Benjamin and Ilic, Slobodan}, number = {NeurIPS} }
@article{ title = {PREDATOR: Registration of 3D Point Clouds with Low Overlap}, type = {article}, year = {2021}, pages = {4265-4274}, id = {eea6d683-a934-3d66-984e-7f7abbdba69a}, created = {2022-02-03T11:00:45.143Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-03T11:00:56.149Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d54ba66b-a8cf-41de-8e2d-c3256f322e07}, private_publication = {false}, abstract = {We introduce PREDATOR, a model for pairwise point-cloud registration with deep attention to the overlap region. Different from previous work, our model is specifically designed to handle (also) point-cloud pairs with low overlap. Its key novelty is an overlap-attention block for early information exchange between the latent encodings of the two point clouds. In this way the subsequent decoding of the latent representations into per-point features is conditioned on the respective other point cloud, and thus can predict which points are not only salient, but also lie in the overlap region between the two point clouds. The ability to focus on points that are relevant for matching greatly improves performance: PREDATOR raises the rate of successful registrations by more than 20% in the low-overlap scenario, and also sets a new state of the art for the 3DMatch benchmark with 89% registration recall. [Code release].}, bibtype = {article}, author = {Huang, Shengyu and Gojcic, Zan and Usvyatsov, Mikhail and Wieser, Andreas and Schindler, Konrad}, doi = {10.1109/CVPR46437.2021.00425}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Learning of 3D Graph Convolution Networks for Point Cloud Analysis}, type = {article}, year = {2021}, keywords = {3D Classification,3D Segmentation,3D vision,Convolution,Deformable Kernels,Feature extraction,Graph Convolution Networks,Kernel,Point Clouds,Shape,Task analysis,Three-dimensional displays,Two dimensional displays}, volume = {X}, id = {022a2dac-6b5b-337e-aa77-aff6b65b7b4e}, created = {2022-02-15T11:01:25.914Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-15T11:03:54.102Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {8315fdc0-e3a9-47f0-9186-21b3433d86d2}, private_publication = {false}, abstract = {Point clouds are among the popular geometry representations in 3D vision. However, unlike 2D images with pixel-wise layouts, such representations contain unordered data points, which makes processing and understanding the associated semantic information quite challenging. Although a number of previous works attempt to analyze point clouds and achieve promising performances, their performances would degrade significantly when data variations like shift and scale changes are present. In this paper, we propose 3D Graph Convolution Networks (3D-GCN), which uniquely learns 3D kernels with graph max-pooling mechanisms for extracting geometric features from point cloud data across different scales. We show that, with the proposed 3D-GCN, satisfactory shift and scale invariance can be jointly achieved. We show that 3D-GCN can be applied to point cloud classification and segmentation tasks, with ablation studies and visualizations verifying the design of 3D-GCN.}, bibtype = {article}, author = {Lin, Zhi Hao and Huang, Sheng Yu and Wang, Yu Chiang Frank}, doi = {10.1109/TPAMI.2021.3059758}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, number = {X} }
@article{ title = {Sensor agnostic semantic segmentation of structurally diverse and complex forest point clouds using deep learning}, type = {article}, year = {2021}, keywords = {Automated,Deep learning,Digital terrain model,Forest,LiDAR,Photogrammetry,Point cloud,Segmentation,Structure from motion,Terrestrial laser scanning}, volume = {13}, id = {9af8f017-b62d-3154-b8b8-e6f579fbc4ef}, created = {2022-02-21T06:44:13.997Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-22T12:22:24.332Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e523c16b-0594-4b52-9c4e-9052fcb9dbed}, private_publication = {false}, abstract = {Forest inventories play an important role in enabling informed decisions to be made for the management and conservation of forest resources; however, the process of collecting inventory information is laborious. Despite advancements in mapping technologies allowing forests to be digitized in finer granularity than ever before, it is still common for forest measurements to be collected using simple tools such as calipers, measuring tapes, and hypsometers. Dense understory vegetation and complex forest structures can present substantial challenges to point cloud processing tools, often leading to erroneous measurements, and making them of less utility in complex forests. To address this challenge, this research demonstrates an effective deep learning approach for semantically segmenting high-resolution forest point clouds from multiple different sensing systems in diverse forest conditions. Seven diverse point cloud datasets were manually segmented to train and evaluate this model, resulting in per-class segmentation accuracies of Terrain: 95.92%, Vegetation: 96.02%, Coarse Woody Debris: 54.98%, and Stem: 96.09%. By exploiting the segmented point cloud, we also present a method of extracting a Digital Terrain Model (DTM) from such segmented point clouds. This approach was applied to a set of six point clouds that were made publicly available as part of a benchmarking study to evaluate the DTM performance. The mean DTM error was 0.04 m relative to the reference with 99.9% completeness. These approaches serve as useful steps toward a fully automated and reliable measurement extraction tool, agnostic to the sensing technology used or the complexity of the forest, provided that the point cloud has sufficient coverage and accuracy. Ongoing work will see these models incorporated into a fully automated forest measurement tool for the extraction of structural metrics for applications in forestry, conservation, and research.}, bibtype = {article}, author = {Krisanski, Sean and Taskhiri, Mohammad Sadegh and Aracil, Susana Gonzalez and Herries, David and Turner, Paul}, doi = {10.3390/rs13081413}, journal = {Remote Sensing}, number = {8} }
@article{ title = {Point Set Voting for Partial Point Cloud Analysis}, type = {article}, year = {2021}, keywords = {Deep learning methods}, pages = {596-603}, volume = {6}, id = {6659abf4-1156-3071-89a0-505868e87084}, created = {2022-02-21T06:44:14.251Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-22T12:22:24.443Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e523c16b-0594-4b52-9c4e-9052fcb9dbed}, private_publication = {false}, abstract = {The continual improvement of 3D sensors has driven the development of algorithms to perform point cloud analysis. In fact, techniques for point cloud classification and segmentation have in recent years achieved incredible performance driven in part by leveraging large synthetic datasets. Unfortunately these same state-of-the-art approaches perform poorly when applied to incomplete point clouds. This limitation of existing algorithms is particularly concerning since point clouds generated by 3D sensors in the real world are usually incomplete due to perspective view or occlusion by other objects. This paper proposes a general model for partial point clouds analysis wherein the latent feature encoding a complete point cloud is inferred by applying a point set voting strategy. In particular, each local point set constructs a vote that corresponds to a distribution in the latent space, and the optimal latent feature is the one with the highest probability. This approach ensures that any subsequent point cloud analysis is robust to partial observation while simultaneously guaranteeing that the proposed model is able to output multiple possible results. This paper illustrates that this proposed method achieves the state-of-the-art performance on shape classification, part segmentation and point cloud completion.}, bibtype = {article}, author = {Zhang, Junming and Chen, Weijia and Wang, Yuping and Vasudevan, Ram and Johnson-Roberson, Matthew}, doi = {10.1109/LRA.2020.3048658}, journal = {IEEE Robotics and Automation Letters}, number = {2} }
@article{ title = {Wireless 3D Point Cloud Delivery Using Deep Graph Neural Networks}, type = {article}, year = {2021}, keywords = {Point cloud,deep graph neural network}, pages = {1-6}, id = {45a05c6a-37e0-38c6-ba8b-0cf8147cd384}, created = {2022-02-21T06:44:14.389Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-09T07:18:43.492Z}, read = {true}, starred = {true}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {84eaadea-8864-4baf-9a7a-b5a2f5b96449,e523c16b-0594-4b52-9c4e-9052fcb9dbed,a6e140dd-a959-4148-9ed8-b29d0c7966c6}, private_publication = {false}, abstract = {In typical point cloud delivery, a sender uses octree-based and graph-based digital video compression to send three-dimensional (3D) points and color attributes. However, the digital-based schemes have an issue called the cliff effect, where the 3D reconstruction quality will be a step function in terms of wireless channel quality. To prevent the cliff effect subject to channel quality fluctuation, we have proposed a wireless point cloud delivery called HoloCast inspired by soft delivery. Although the HoloCast realizes graceful quality improvement according to instantaneous wireless channel quality, it requires large communication overheads. In this paper, we propose a novel scheme for soft point cloud delivery to simultaneously realize better 3D reconstruction quality and lower communication overheads. The proposed scheme introduces an end-to-end deep learning framework based on graph neural network (GNN) to reconstruct high-quality point clouds from its distorted observation under wireless fading channels. We demonstrate that the proposed GNN-based scheme can reconstruct a clean 3D point cloud with low overheads by removing fading and noise effects.}, bibtype = {article}, author = {Fujihashi, Takuya and Koike-Akino, Toshiaki and Chen, Siheng and Watanabe, Takashi}, doi = {10.1109/ICC42927.2021.9500925}, journal = {IEEE International Conference on Communications} }
@article{ title = {Point2color: 3D point cloud colorization using a conditional generative network and differentiable rendering for airborne LiDAR}, type = {article}, year = {2021}, pages = {1062-1071}, id = {fc6ec6e7-ec07-32aa-a371-8a00eef51ce0}, created = {2022-02-21T06:44:14.573Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-22T09:08:42.725Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e523c16b-0594-4b52-9c4e-9052fcb9dbed}, private_publication = {false}, abstract = {Airborne LiDAR observations are very effective for providing accurate 3D point clouds, and archived data are becoming available to the public. In many cases, only geometric information is available in the published 3D point cloud observed by airborne LiDAR (airborne 3D point cloud), and geometric information alone is not readable. Thus, it is important to colorize airborne 3D point clouds to improve visual readability. A scheme for 3D point cloud colorization using a conditional generative adversarial network (cGAN) was proposed, but it is difficult to apply to airborne LiDAR because the method is for artificial CAD models. Since airborne 3D point clouds are spread over a wider area than simple CAD models, it is important to evaluate them spatially in two-dimensional (2D) images. Currently, the differentiable renderer is the most reliable method to bridge 3D and 2D images. In this paper, we propose an airborne 3D point cloud colorization scheme called point2color using cGAN with points and rendered images. To achieve airborne 3D point cloud colorization, we estimate the color of each point with PointNet++ and render the estimated colored airborne 3D point cloud into a 2D image with a differentiable renderer. The network is then trained by minimizing the distance between real color and colorized fake color. The experimental results demonstrate the effectiveness of point2color using the IEEE GRSS 2018 Data Fusion Contest dataset with lower error than previous studies. Furthermore, an ablation study demonstrates the effectiveness of using a cGAN pipeline and 2D images via a differentiable renderer. Our code will be available at GitHub.}, bibtype = {article}, author = {Shinohara, Takayuki and Xiu, Haoyi and Matsuoka, Masashi}, doi = {10.1109/CVPRW53098.2021.00117}, journal = {IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops} }
@article{ title = {BuildingNet: Learning to Label 3D Buildings}, type = {article}, year = {2021}, pages = {10397-10407}, websites = {http://arxiv.org/abs/2110.04955}, id = {fb3721b1-5df7-34e0-b70c-a5ce05679cc8}, created = {2022-02-24T07:44:06.302Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-24T07:44:16.109Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {1e7b477c-c241-48c3-a542-ad06e3d39dd5}, private_publication = {false}, abstract = {We introduce BuildingNet: (a) a large-scale dataset of 3D building models whose exteriors are consistently labeled, (b) a graph neural network that labels building meshes by analyzing spatial and structural relations of their geometric primitives. To create our dataset, we used crowdsourcing combined with expert guidance, resulting in 513K annotated mesh primitives, grouped into 292K semantic part components across 2K building models. The dataset covers several building categories, such as houses, churches, skyscrapers, town halls, libraries, and castles. We include a benchmark for evaluating mesh and point cloud labeling. Buildings have more challenging structural complexity compared to objects in existing benchmarks (e.g., ShapeNet, PartNet), thus, we hope that our dataset can nurture the development of algorithms that are able to cope with such large-scale geometric data for both vision and graphics tasks e.g., 3D semantic segmentation, part-based generative models, correspondences, texturing, and analysis of point cloud data acquired from real-world buildings. Finally, we show that our mesh-based graph neural network significantly improves performance over several baselines for labeling 3D meshes.}, bibtype = {article}, author = {Selvaraju, Pratheba and Nabail, Mohamed and Loizou, Marios and Maslioukova, Maria and Averkiou, Melinos and Andreou, Andreas and Chaudhuri, Siddhartha and Kalogerakis, Evangelos} }
@article{ title = {An Overview on the Application of Graph Neural Networks in Wireless Networks}, type = {article}, year = {2021}, keywords = {Wireless networks,graph neural networks,resource management}, pages = {2547-2565}, volume = {2}, publisher = {IEEE}, id = {2bfff17f-4ce7-34d5-bbd6-13bdfdef956a}, created = {2022-03-01T12:39:38.016Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-02T07:52:11.588Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {84eaadea-8864-4baf-9a7a-b5a2f5b96449}, private_publication = {false}, abstract = {In recent years, with the rapid enhancement of computing power, deep learning methods have been widely applied in wireless networks and achieved impressive performance. To effectively exploit the information of graph-structured data as well as contextual information, graph neural networks (GNNs) have been introduced to address a series of optimization problems of wireless networks. In this overview, we first illustrate the construction method of wireless communication graph for various wireless networks and simply introduce the progress of several classical paradigms of GNNs. Then, several applications of GNNs in wireless networks such as resource allocation and several emerging fields, are discussed in detail. Finally, some research trends about the applications of GNNs in wireless communication systems are discussed.}, bibtype = {article}, author = {He, Shiwen and Xiong, Shaowen and Ou, Yeyu and Zhang, Jian and Wang, Jiaheng and Huang, Yongming and Zhang, Yaoxue}, doi = {10.1109/OJCOMS.2021.3128637}, journal = {IEEE Open Journal of the Communications Society}, number = {November} }
@article{ title = {Graph Neural Networks with Parallel Neighborhood Aggregations for Graph Classification}, type = {article}, year = {2021}, pages = {1-20}, websites = {http://arxiv.org/abs/2111.11482}, id = {89dcaa2c-c6dd-3c34-944d-aa35414abfaa}, created = {2022-03-01T12:39:38.070Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-01T12:39:42.792Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {84eaadea-8864-4baf-9a7a-b5a2f5b96449}, private_publication = {false}, abstract = {We focus on graph classification using a graph neural network (GNN) model that precomputes the node features using a bank of neighborhood aggregation graph operators arranged in parallel. These GNN models have a natural advantage of reduced training and inference time due to the precomputations but are also fundamentally different from popular GNN variants that update node features through a sequential neighborhood aggregation procedure during training. We provide theoretical conditions under which a generic GNN model with parallel neighborhood aggregations (PA-GNNs, in short) are provably as powerful as the well-known Weisfeiler-Lehman (WL) graph isomorphism test in discriminating non-isomorphic graphs. Although PA-GNN models do not have an apparent relationship with the WL test, we show that the graph embeddings obtained from these two methods are injectively related. We then propose a specialized PA-GNN model, called SPIN, which obeys the developed conditions. We demonstrate via numerical experiments that the developed model achieves state-of-the-art performance on many diverse real-world datasets while maintaining the discriminative power of the WL test and the computational advantage of preprocessing graphs before the training process.}, bibtype = {article}, author = {Doshi, Siddhant and Chepuri, Sundeep Prabhakar} }
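Editor's note: the model above precomputes a bank of neighbourhood aggregations of the node features in parallel before training. A minimal sketch of that precomputation step follows, stacking [X, ÂX, Â²X, ...]; the symmetric normalization Â = D^-1/2 (A + I) D^-1/2 is an assumed, common choice rather than the paper's exact operator bank, and the graph and feature sizes are illustrative.

# Illustrative precomputation of parallel neighbourhood aggregations that
# can be fed to any downstream classifier.
import numpy as np

def precompute_aggregations(A, X, hops):
    A_tilde = A + np.eye(A.shape[0])              # add self loops
    d = A_tilde.sum(axis=1)
    A_hat = A_tilde / np.sqrt(np.outer(d, d))     # D^-1/2 (A + I) D^-1/2
    feats, H = [X], X
    for _ in range(hops):
        H = A_hat @ H                             # one more hop of smoothing
        feats.append(H)
    return np.concatenate(feats, axis=1)          # node-wise feature bank

rng = np.random.default_rng(0)
A = (rng.random((20, 20)) < 0.2).astype(float)
A = np.triu(A, 1); A = A + A.T                    # random undirected graph
X = rng.normal(size=(20, 5))
print(precompute_aggregations(A, X, hops=3).shape)   # (20, 20): 4 hops x 5 dims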
@article{ title = {HoloCast+: Hybrid Digital-Analog Transmission for Graceful Point Cloud Delivery with Graph Fourier Transform}, type = {article}, year = {2021}, keywords = {Decorrelation,Distortion,Encoding,Graph Signal Processing,Hybrid Digital-Analog Coding,Image reconstruction,Point Cloud,Receivers,Three-dimensional displays,Wireless Transmission,Wireless communication}, pages = {1-13}, id = {25b68138-b0b3-3dd0-97da-f19236401608}, created = {2022-03-02T07:02:50.312Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-02T07:03:02.516Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {84eaadea-8864-4baf-9a7a-b5a2f5b96449}, private_publication = {false}, abstract = {Point cloud is an emerging data format useful for various applications such as holographic display, autonomous vehicles, and augmented reality. Conventionally, communications of point cloud data have relied on digital compression and digital modulation for three-dimensional (3D) data streaming. However, such digital-based delivery schemes have fundamental issues called cliff and leveling effects, where the 3D reconstruction quality is a step function in terms of wireless channel quality. We propose a novel scheme of point cloud delivery, called HoloCast+, to overcome cliff and leveling effects. Specifically, our method utilizes hybrid digital-analog coding, integrating digital compression and analog coding based on graph Fourier transform (GFT), to gracefully improve 3D reconstruction quality with the improvement of channel quality. We demonstrate that HoloCast+ offers better 3D reconstruction quality in terms of the symmetric mean square error (sMSE) by up to 18.3 dB and 10.5 dB, respectively, compared to conventional digital-based and analog-based delivery methods in wireless fading environments.}, bibtype = {article}, author = {Fujihashi, Takuya and Koike-Akino, Toshiaki and Watanabe, Takashi and Orlik, Philip V.}, doi = {10.1109/TMM.2021.3077772}, journal = {IEEE Transactions on Multimedia} }
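Editor's note: the scheme above transmits graph Fourier transform (GFT) coefficients of point cloud attributes. A minimal sketch of the GFT itself is given below: build a k-NN graph over the 3D points, take the Laplacian eigenvectors as the transform basis, and transform a per-point attribute. The Gaussian-weighted k-NN construction and all parameters are assumed, common choices, not necessarily those of the paper.

# Illustrative graph Fourier transform of a point cloud attribute.
import numpy as np

rng = np.random.default_rng(0)
pts = rng.random((100, 3))                        # 3D point positions
attr = rng.random(100)                            # one attribute per point

# Gaussian-weighted k-NN adjacency over the points.
k, sigma = 8, 0.1
D = np.linalg.norm(pts[:, None, :] - pts[None, :, :], axis=-1)
W = np.zeros_like(D)
for i in range(len(pts)):
    nn = np.argsort(D[i])[1:k + 1]                # nearest neighbours, skip self
    W[i, nn] = np.exp(-D[i, nn] ** 2 / (2 * sigma ** 2))
W = np.maximum(W, W.T)                            # symmetrize

L = np.diag(W.sum(axis=1)) - W                    # combinatorial Laplacian
eigvals, U = np.linalg.eigh(L)                    # eigenvectors = GFT basis

coeffs = U.T @ attr                               # forward GFT
recon = U @ coeffs                                # inverse GFT
print(np.allclose(recon, attr))                   # lossless round trip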
@article{ title = {Advanced Geometry Surface Coding for Dynamic Point Cloud Compression}, type = {article}, year = {2021}, pages = {1-10}, websites = {http://arxiv.org/abs/2103.06549}, id = {e8c252bb-c036-35a0-9a97-5f5f9c955ad0}, created = {2022-03-02T07:02:50.313Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-02T09:31:16.710Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {84eaadea-8864-4baf-9a7a-b5a2f5b96449}, private_publication = {false}, abstract = {In video-based dynamic point cloud compression (V-PCC), 3D point clouds are projected onto 2D images for compressing with the existing video codecs. However, the existing video codecs are originally designed for natural visual signals, and it fails to account for the characteristics of point clouds. Thus, there are still problems in the compression of geometry information generated from the point clouds. Firstly, the distortion model in the existing rate-distortion optimization (RDO) is not consistent with the geometry quality assessment metrics. Secondly, the prediction methods in video codecs fail to account for the fact that the highest depth values of a far layer is greater than or equal to the corresponding lowest depth values of a near layer. This paper proposes an advanced geometry surface coding (AGSC) method for dynamic point clouds (DPC) compression. The proposed method consists of two modules, including an error projection model-based (EPM-based) RDO and an occupancy map-based (OM-based) merge prediction. Firstly, the EPM model is proposed to describe the relationship between the distortion model in the existing video codec and the geometry quality metric. Secondly, the EPM-based RDO method is presented to project the existing distortion model on the plane normal and is simplified to estimate the average normal vectors of coding units (CUs). Finally, we propose the OM-based merge prediction approach, in which the prediction pixels of merge modes are refined based on the occupancy map. Experiments tested on the standard point clouds show that the proposed method achieves an average 9.84\% bitrate saving for geometry compression.}, bibtype = {article}, author = {Xiong, Jian and Gao, Hao and Wang, Miaohui and Li, Hongliang and Ngan, King Ngi and Lin, Weisi} }
@article{ title = {AR Anchor System Using Mobile Based 3D GNN Detection}, type = {article}, year = {2021}, keywords = {3d object detection,augmented reality,deep learning,gnn,point cloud,smart phone}, pages = {54-60}, volume = {13}, id = {db9ff35d-c764-3fc8-ba97-efd5fbfcbcff}, created = {2022-03-02T07:52:11.253Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-09T09:33:30.101Z}, read = {true}, starred = {true}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {84eaadea-8864-4baf-9a7a-b5a2f5b96449}, private_publication = {false}, abstract = {AR (Augmented Reality) is a technology that provides virtual content to the real world and provides additional information to objects in real-time through 3D content. In the past, a high-performance device was required to experience AR, but it has become possible to implement AR more easily thanks to improved mobile performance and various on-board sensors such as ToF (Time-of-Flight). Also, the importance of mobile augmented reality is growing with the commercialization of high-speed wireless Internet such as 5G. Thus, this paper proposes a system that can provide AR services via GNN (Graph Neural Network) using cameras and sensors on mobile devices. The ToF sensor of the mobile device is used to capture depth maps. A 3D point cloud is created using RGB images to distinguish specific colors of objects. Point clouds created from the RGB images and depth map are downsampled for smooth communication between the mobile device and the server. Point clouds sent to the server are used for 3D object detection. The detection process determines the class of objects and uses one point in the 3D bounding box as an anchor point. AR content is provided through app and web using the class and anchor of the detected object.}, bibtype = {article}, author = {Kim, Jun-Sik and Kim, Dong-Kyun and Kwon, Soon-Chul and Jung, Kye-Dong}, journal = {The International Journal of Internet, Broadcasting and Communication}, number = {1} }
@article{ title = {Branchy-GNN: A device-edge co-inference framework for efficient point cloud processing}, type = {article}, year = {2021}, keywords = {Edge inference,Graph neural network (GNN),Joint source-channel coding (JSCC),Point cloud}, pages = {8488-8492}, volume = {2021-June}, id = {0f1c9ca4-08dc-3efa-a7df-14a4986f90f9}, created = {2022-03-02T07:52:11.254Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-09T09:33:30.114Z}, read = {true}, starred = {true}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {84eaadea-8864-4baf-9a7a-b5a2f5b96449}, private_publication = {false}, abstract = {The recent advancements of three-dimensional (3D) data acquisition devices have spurred a new breed of applications that rely on point cloud data processing. However, processing a large volume of point cloud data brings a significant workload on resource-constrained mobile devices, prohibiting from unleashing their full potentials. Built upon the emerging paradigm of device-edge co-inference, where an edge device extracts and transmits the intermediate feature to an edge server for further processing, we propose Branchy-GNN for efficient graph neural network (GNN) based point cloud processing by leveraging edge computing platforms. In order to reduce the on-device computational cost, the Branchy-GNN adds branch networks for early exiting. Besides, it employs learning-based joint source-channel coding (JSCC) for the intermediate feature compression to reduce the communication overhead. Our experimental results demonstrate that the proposed Branchy-GNN secures a significant latency reduction compared with several benchmark methods.}, bibtype = {article}, author = {Shao, Jiawei and Zhang, Haowei and Mao, Yuyi and Zhang, Jun}, doi = {10.1109/ICASSP39728.2021.9414831}, journal = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings} }
@article{ title = {Split Computing and Early Exiting for Deep Learning Applications: Survey and Research Challenges}, type = {article}, year = {2021}, keywords = {Split Computing, Edge Computing, Early Exit, Neura}, volume = {37}, websites = {http://arxiv.org/abs/2103.04505}, publisher = {Association for Computing Machinery}, id = {9642a144-c4ee-3757-a6c7-5d5769144cad}, created = {2022-03-09T07:18:42.318Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-06-11T00:43:43.376Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {84eaadea-8864-4baf-9a7a-b5a2f5b96449,a6e140dd-a959-4148-9ed8-b29d0c7966c6}, private_publication = {false}, abstract = {Mobile devices such as smartphones and autonomous vehicles increasingly rely on deep neural networks (DNNs) to execute complex inference tasks such as image classification and speech recognition, among others. However, continuously executing the entire DNN on the mobile device can quickly deplete its battery. Although task offloading to cloud/edge servers may decrease the mobile device's computational burden, erratic patterns in channel quality, network, and edge server load can lead to a significant delay in task execution. Recently, approaches based on split computing (SC) have been proposed, where the DNN is split into a head and a tail model, executed respectively on the mobile device and on the edge server. Ultimately, this may reduce bandwidth usage as well as energy consumption. Another approach, called early exiting (EE), trains models to present multiple "exits" earlier in the architecture, each providing increasingly higher target accuracy. Therefore, the trade-off between accuracy and delay can be tuned according to the current conditions or application demands. In this paper, we provide a comprehensive survey of the state of the art in SC and EE strategies by presenting a comparison of the most relevant approaches. We conclude the paper by providing a set of compelling research challenges.}, bibtype = {article}, author = {Matsubara, Yoshitomo and Levorato, Marco and Restuccia, Francesco}, journal = {ACM Computing Surveys}, number = {4} }
@article{ title = {Superpoint-guided Semi-supervised Semantic Segmentation of 3D Point Clouds}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2107.03601}, id = {bb061536-5750-3d90-9594-45493dcdca2d}, created = {2022-03-09T09:35:04.531Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T07:29:33.498Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {1e7b477c-c241-48c3-a542-ad06e3d39dd5}, private_publication = {false}, abstract = {3D point cloud semantic segmentation is a challenging topic in the computer vision field. Most of the existing methods in literature require a large amount of fully labeled training data, but it is extremely time-consuming to obtain these training data by manually labeling massive point clouds. Addressing this problem, we propose a superpoint-guided semi-supervised segmentation network for 3D point clouds, which jointly utilizes a small portion of labeled scene point clouds and a large number of unlabeled point clouds for network training. The proposed network is iteratively updated with its predicted pseudo labels, where a superpoint generation module is introduced for extracting superpoints from 3D point clouds, and a pseudo-label optimization module is explored for automatically assigning pseudo labels to the unlabeled points under the constraint of the extracted superpoints. Additionally, there are some 3D points without pseudo-label supervision. We propose an edge prediction module to constrain features of edge points. A superpoint feature aggregation module and a superpoint feature consistency loss function are introduced to smooth superpoint features. Extensive experimental results on two 3D public datasets demonstrate that our method can achieve better performance than several state-of-the-art point cloud segmentation networks and several popular semi-supervised segmentation methods with few labeled scenes.}, bibtype = {article}, author = {Deng, Shuang and Dong, Qiulei and Liu, Bo and Hu, Zhanyi} }
@article{ title = {AirNet: Neural Network Transmission over the Air}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2105.11166}, id = {c0b9ce3f-b1ee-370b-943c-745e9f0d6dcf}, created = {2022-03-09T09:35:04.532Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-09T09:35:12.876Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {84eaadea-8864-4baf-9a7a-b5a2f5b96449}, private_publication = {false}, abstract = {State-of-the-art performance for many emerging edge applications is achieved by deep neural networks (DNNs). Often, these DNNs are location and time sensitive, and the parameters of a specific DNN must be delivered from an edge server to the edge device rapidly and efficiently to carry out time-sensitive inference tasks. We introduce AirNet, a novel training and analog transmission method that allows efficient wireless delivery of DNNs. We first train the DNN with noise injection to counter the wireless channel noise. We also employ pruning to reduce the channel bandwidth necessary for transmission, and perform knowledge distillation from a larger model to achieve satisfactory performance, despite the channel perturbations. We show that AirNet achieves significantly higher test accuracy compared to digital alternatives under the same bandwidth and power constraints. It also exhibits graceful degradation with channel quality, which reduces the requirement for accurate channel estimation.}, bibtype = {article}, author = {Jankowski, Mikolaj and Gunduz, Deniz and Mikolajczyk, Krystian}, number = {677854} }
@article{ title = {Point Cloud Instance Segmentation of Indoor Scenes Using Learned Pairwise Patch Relations}, type = {article}, year = {2021}, keywords = {Point clouds,classifier,indoor scenes,instance segmentation,patch relations}, pages = {15891-15901}, volume = {9}, id = {fb50a7a8-2eb9-31aa-8157-74dfa9015592}, created = {2022-03-18T10:02:57.315Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-18T10:03:05.478Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {1e7b477c-c241-48c3-a542-ad06e3d39dd5}, private_publication = {false}, abstract = {Indoor scene understanding is an active research topic that has recently gained increasing attention in both computer graphics and computer vision community. This paper presents a novel automatic point cloud instance abstraction and segmentation of indoor scenes using learned pairwise planar patch relations. Planar patches have sufficient representative power to abstract the semantics in mostly man-made 3D indoor scenes. To exploit the planar patch relationships, a new indoor point clouds dataset is automatically derived from the public dataset ScanNet, which can be adopted for training a pairwise patch relation classifier. Given the pre-processed indoor scene point clouds, a set of planar patches are generated to provide robust and compact representation of indoor objects and parts. For each extracted planar patch, a feature descriptor in view of its geometric properties and appearance is calculated. Owing to the proposed feature descriptors, the relations of pairwise planar patch are measured and can be fed into the pre-trained patch relation classifier. To yield the instance segmentation of indoor scenes, a robust graph-based algorithm is adopted to group these planar patches which incorporate the learned patch relations. Experimental results demonstrate that our proposed approach can produce instance semantic segmentation of various cluttered indoor scenes robustly and efficiently.}, bibtype = {article}, author = {Yu, Lijie and Sun, Yuliang and Zhang, Xudong and Miao, Yongwei and Zhang, Xiuli}, doi = {10.1109/ACCESS.2021.3051618}, journal = {IEEE Access} }
@article{ title = {Multi-scale Point Cloud Analysis}, type = {article}, year = {2021}, id = {db3c2bfe-247b-33f1-a4b2-438adab69ed3}, created = {2022-03-22T06:52:16.996Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-22T06:52:21.360Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {1e7b477c-c241-48c3-a542-ad06e3d39dd5}, private_publication = {false}, bibtype = {article}, author = {Lejemble, Thibault} }
@article{ title = {Facevae: Generation of a 3D geometric object using variational autoencoders}, type = {article}, year = {2021}, keywords = {3D geometry,Deep learning,Generation model,Graph data,VAE}, pages = {1-14}, volume = {10}, id = {25184c64-4522-35b7-8d56-a150264d7e9c}, created = {2022-03-23T06:17:59.176Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:13.688Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Park2021}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {Deep learning for 3D data has become a popular research theme in many fields. However, most of the research on 3D data is based on voxels, 2D images, and point clouds. At actual industrial sites, face-based geometry data are being used, but their direct application to industrial sites remains limited due to a lack of existing research. In this study, to overcome these limitations, we present a face-based variational autoencoder (FVAE) model that generates 3D geometry data using a variational autoencoder (VAE) model directly from face-based geometric data. Our model improves the existing node and edge-based adjacency matrix and optimizes it for geometric learning by using a face-and edge-based adjacency matrix according to the 3D geometry structure. In the experiment, we achieved the result of generating adjacency matrix information with 72% precision and 69% recall through end-to-end learning of Face-Based 3D Geometry. In addition, we presented various structurization methods for 3D unstructured geometry and compared their performance, and proved the method to effectively perform reconstruction of the learned structured data through experiments.}, bibtype = {article}, author = {Park, Sungsoo and Kim, Hyeoncheol}, doi = {10.3390/electronics10222792}, journal = {Electronics (Switzerland)}, number = {22} }
@article{ title = {Hierarchical Graph-Convolutional Variational AutoEncoding for Generative Modelling of Human Motion}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2111.12602}, id = {53726edd-7ad6-31b2-b958-9d5e223d7f8f}, created = {2022-03-23T06:17:59.177Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:13.113Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Bourached2021}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {Models of human motion commonly focus either on trajectory prediction or action classification but rarely both. The marked heterogeneity and intricate compositionality of human motion render each task vulnerable to the data degradation and distributional shift common to real-world scenarios. A sufficiently expressive generative model of action could in theory enable data conditioning and distributional resilience within a unified framework applicable to both tasks. Here we propose a novel architecture based on hierarchical variational autoencoders and deep graph convolutional neural networks for generating a holistic model of action over multiple time-scales. We show this Hierarchical Graph-convolutional Variational Autoencoder (HG-VAE) to be capable of generating coherent actions, detecting out-of-distribution data, and imputing missing data by gradient ascent on the model's posterior. Trained and evaluated on H3.6M and the largest collection of open source human motion data, AMASS, we show HG-VAE can facilitate downstream discriminative learning better than baseline models.}, bibtype = {article}, author = {Bourached, Anthony and Gray, Robert and Griffiths, Ryan-Rhys and Jha, Ashwani and Nachev, Parashkev} }
@article{ title = {A Two-Stage Cascade Model with Variational Autoencoders and Attention Gates for MRI Brain Tumor Segmentation}, type = {article}, year = {2021}, keywords = {Attention gate,Brain tumor segmentation,Encoder-decoder network,Variational autoencoder}, pages = {435-447}, volume = {12658 LNCS}, id = {c849ae67-71d8-3a07-b804-07d823146590}, created = {2022-03-23T06:17:59.178Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:12.522Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Lyu2021}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {Automatic MRI brain tumor segmentation is of vital importance for the disease diagnosis, monitoring, and treatment planning. In this paper, we propose a two-stage encoder-decoder based model for brain tumor subregional segmentation. Variational autoencoder regularization is utilized in both stages to prevent the overfitting issue. The second-stage network adopts attention gates and is trained additionally using an expanded dataset formed by the first-stage outputs. On the BraTS 2020 validation dataset, the proposed method achieves the mean Dice score of 0.9041, 0.8350, and 0.7958, and Hausdorff distance (95%) of 4.953, 6.299, 23.608 for the whole tumor, tumor core, and enhancing tumor, respectively. The corresponding results on the BraTS 2020 testing dataset are 0.8729, 0.8357, and 0.8205 for Dice score, and 11.4288, 19.9690, and 15.6711 for Hausdorff distance. The code is publicly available at https://github.com/shu-hai/two-stage-VAE-Attention-gate-BraTS2020.}, bibtype = {article}, author = {Lyu, Chenggang and Shu, Hai}, doi = {10.1007/978-3-030-72084-1_39}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {NeRF-VAE: A Geometry Aware 3D Scene Generative Model}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2104.00587}, id = {2a6ac3d1-9a73-396e-a7da-c9e670768a81}, created = {2022-03-23T06:17:59.184Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:13.506Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kosiorek2021}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {We propose NeRF-VAE, a 3D scene generative model that incorporates geometric structure via NeRF and differentiable volume rendering. In contrast to NeRF, our model takes into account shared structure across scenes, and is able to infer the structure of a novel scene -- without the need to re-train -- using amortized inference. NeRF-VAE's explicit 3D rendering process further contrasts previous generative models with convolution-based rendering which lacks geometric structure. Our model is a VAE that learns a distribution over radiance fields by conditioning them on a latent scene representation. We show that, once trained, NeRF-VAE is able to infer and render geometrically-consistent scenes from previously unseen 3D environments using very few input images. We further demonstrate that NeRF-VAE generalizes well to out-of-distribution cameras, while convolutional models do not. Finally, we introduce and study an attention-based conditioning mechanism of NeRF-VAE's decoder, which improves model performance.}, bibtype = {article}, author = {Kosiorek, Adam R. and Strathmann, Heiko and Zoran, Daniel and Moreno, Pol and Schneider, Rosalia and Mokrá, Soňa and Rezende, Danilo J.} }
@article{ title = {A Unified 3D Human Motion Synthesis Model via Conditional Variational Auto-Encoder}, type = {article}, year = {2021}, pages = {11645-11655}, id = {9e6f497a-b560-3fc8-ad30-9d9ba7820bbf}, created = {2022-03-23T06:17:59.205Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:12.917Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Cai2021}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {We present a unified and flexible framework to address the generalized problem of 3D motion synthesis that covers the tasks of motion prediction, completion, interpolation, and spatial-temporal recovery. Since these tasks have different input constraints and various fidelity and diversity requirements, most existing approaches only cater to a specific task or use different architectures to address various tasks. Here we propose a unified framework based on Conditional Variational Auto-Encoder (CVAE), where we treat any arbitrary input as a masked motion series. Notably, by considering this problem as a conditional generation process, we estimate a parametric distribution of the missing regions based on the input conditions, from which to sample and synthesize the full motion series. To further allow the flexibility of manipulating the motion style of the generated series, we design an Action-Adaptive Modulation (AAM) to propagate the given semantic guidance through the whole sequence. We also introduce a cross-attention mechanism to exploit distant relations among decoder and encoder features for better realism and global consistency. We conducted extensive experiments on Human 3.6M and CMU-Mocap. The results show that our method produces coherent and realistic results for various motion synthesis tasks.}, bibtype = {article}, author = {Cai, Yujun and Wang, Yiwei and Zhu, Yiheng and Cham, Tat-Jen and Cai, Jianfei and Yuan, Junsong and Liu, Jun and Zheng, Chuanxia and Yan, Sijie and Ding, Henghui and Shen, Xiaohui and Liu, Ding and Thalmann, Nadia Magnenat}, journal = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)} }
@article{ title = {EditVAE: Unsupervised Part-Aware Controllable 3D Point Cloud Shape Generation}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2110.06679}, id = {c0f68cca-5451-3d90-b18f-663e89a38896}, created = {2022-03-23T06:17:59.286Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-07-21T08:31:58.519Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Li2021}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {This paper tackles the problem of parts-aware point cloud generation. Unlike existing works which require the point cloud to be segmented into parts a priori, our parts-aware editing and generation is performed in an unsupervised manner. We achieve this with a simple modification of the Variational Auto-Encoder which yields a joint model of the point cloud itself along with a schematic representation of it as a combination of shape primitives. In particular, we introduce a latent representation of the point cloud which can be decomposed into a disentangled representation for each part of the shape. These parts are in turn disentangled into both a shape primitive and a point cloud representation, along with a standardising transformation to a canonical coordinate system. The dependencies between our standardising transformations preserve the spatial dependencies between the parts in a manner which allows meaningful parts-aware point cloud generation and shape editing. In addition to the flexibility afforded by our disentangled representation, the inductive bias introduced by our joint modelling approach yields the state-of-the-art experimental results on the ShapeNet dataset.}, bibtype = {article}, author = {Li, Shidi and Liu, Miaomiao and Walder, Christian} }
@article{ title = {Automated building change detection with amodal completion of point clouds}, type = {article}, year = {2021}, keywords = {Building information modeling,Change detection,Deep learning,Point cloud,Point cloud completion}, pages = {103568}, volume = {124}, websites = {https://doi.org/10.1016/j.autcon.2021.103568}, publisher = {Elsevier B.V.}, id = {52dbb417-9e35-3c96-8767-d7bf9dcb8d8f}, created = {2022-03-23T06:17:59.298Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-01T09:16:03.012Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Czerniawski2021}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {When updating digital models of existing buildings, changes in the built environment are detected by comparing outdated BIMs with captured point clouds representing current conditions. Here we show that point cloud completion (i.e. automated filling-in of missing data) improves the accuracy of change detection. We perform point cloud completion using a hierarchical deep variational autoencoder (a type of artificial neural network) modified to include skip connections between the convolution and deconvolution layers. The resulting receiver operating characteristic curve shows that completion boosts change detection performance from a total area under the curve of 0.55 to 0.75. Completion achieves this by eliminating differences between the BIM and point cloud inputs that are a consequence of incompleteness while distilling the differences due to building change. We anticipate that automated change detection methods with resilience to imperfect data will become more critical as automated building analyses become increasingly abstracted from data collection.}, bibtype = {article}, author = {Czerniawski, Thomas and Ma, Jong Won and Leite, Fernanda}, doi = {10.1016/j.autcon.2021.103568}, journal = {Automation in Construction}, number = {January} }
@article{ title = {Setvae: Learning hierarchical composition for generative modeling of set-structured data}, type = {article}, year = {2021}, pages = {15054-15063}, id = {7ac8b9c2-f0b5-3546-9927-03fd86136b31}, created = {2022-03-23T06:17:59.333Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:11.879Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kim2021}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {Generative modeling of set-structured data, such as point clouds, requires reasoning over local and global structures at various scales. However, adopting multi-scale frameworks for ordinary sequential data to a set-structured data is nontrivial as it should be invariant to the permutation of its elements. In this paper, we propose SetVAE, a hierarchical variational autoencoder for sets. Motivated by recent progress in set encoding, we build SetVAE upon attentive modules that first partition the set and project the partition back to the original cardinality. Exploiting this module, our hierarchical VAE learns latent variables at multiple scales, capturing coarse-to-fine dependency of the set elements while achieving permutation invariance. We evaluate our model on point cloud generation task and achieve competitive performance to the prior arts with substantially smaller model capacity. We qualitatively demonstrate that our model generalizes to unseen set sizes and learns interesting subset relations without supervision. Our implementation is available at https://github.com/jw9730/setvae.}, bibtype = {article}, author = {Kim, Jinwoo and Yoo, Jaehoon and Lee, Juho and Hong, Seunghoon}, doi = {10.1109/CVPR46437.2021.01481}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Action-Conditioned 3D Human Motion Synthesis with Transformer VAE}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2104.05670}, id = {640ac617-3379-352d-b4f3-0c15968cf81b}, created = {2022-03-23T06:17:59.354Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:12.712Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Petrovich2021}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {We tackle the problem of action-conditioned generation of realistic and diverse human motion sequences. In contrast to methods that complete, or extend, motion sequences, this task does not require an initial pose or sequence. Here we learn an action-aware latent representation for human motions by training a generative variational autoencoder (VAE). By sampling from this latent space and querying a certain duration through a series of positional encodings, we synthesize variable-length motion sequences conditioned on a categorical action. Specifically, we design a Transformer-based architecture, ACTOR, for encoding and decoding a sequence of parametric SMPL human body models estimated from action recognition datasets. We evaluate our approach on the NTU RGB+D, HumanAct12 and UESTC datasets and show improvements over the state of the art. Furthermore, we present two use cases: improving action recognition through adding our synthesized data to training, and motion denoising. Code and models are available on our project page.}, bibtype = {article}, author = {Petrovich, Mathis and Black, Michael J. and Varol, Gül}, doi = {10.1109/iccv48922.2021.01080} }
@article{ title = {A brief survey on RGB-D semantic segmentation using deep learning}, type = {article}, year = {2021}, keywords = {Deep learning,RGB-D datasets,Semantic segmentation}, pages = {102080}, volume = {70}, websites = {https://doi.org/10.1016/j.displa.2021.102080}, publisher = {Elsevier B.V.}, id = {9683bc04-7e79-3270-81b6-3a1da421f52a}, created = {2022-03-24T08:01:31.367Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-24T08:01:45.062Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {1e7b477c-c241-48c3-a542-ad06e3d39dd5}, private_publication = {false}, abstract = {Semantic segmentation is referred to as a process of linking each pixel in an image to a class label. With this pragmatic technique, it is possible to recognize different objects in an RGB image based on the color and texture, and hence it becomes easier to evaluate. Recently, researchers could perform semantic segmentation pretty well in RGB images. However, the methods based on RGB image lack enough information to realize semantic segmentation of complex scenes. RGB-D semantic segmentation with depth information has been proved to achieve better segmentation results by a lot of experiments, but there is a lack of a comprehensive survey. In this paper, the main purpose is to offer a detailed review of RGB-D semantic segmentation according to the research progress in recent years. Specifically, recently updated RGB-D datasets will be focused on first, and problems on RGB-D semantic segmentation will be discussed. In the end, a comprehensive analysis is carried out on recent methods and their analysis of the semantic segmentation.}, bibtype = {article}, author = {Wang, Changshuo and Wang, Chen and Li, Weijun and Wang, Haining}, doi = {10.1016/j.displa.2021.102080}, journal = {Displays}, number = {September} }
@article{ title = {CoFiNet: Reliable Coarse-to-fine Correspondences for Robust Point Cloud Registration}, type = {article}, year = {2021}, websites = {https://arxiv.org/abs/2110.14076v1}, month = {10}, day = {26}, id = {48a3d272-6530-3ee9-a6f3-9edf8ca82ca2}, created = {2022-03-28T07:17:57.661Z}, accessed = {2022-03-28}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-31T06:33:43.585Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {We study the problem of extracting correspondences between a pair of point clouds for registration. For correspondence retrieval, existing works benefit from matching sparse keypoints detected from dense points but usually struggle to guarantee their repeatability. To address this issue, we present CoFiNet - Coarse-to-Fine Network which extracts hierarchical correspondences from coarse to fine without keypoint detection. On a coarse scale and guided by a weighting scheme, our model firstly learns to match down-sampled nodes whose vicinity points share more overlap, which significantly shrinks the search space of a consecutive stage. On a finer scale, node proposals are consecutively expanded to patches that consist of groups of points together with associated descriptors. Point correspondences are then refined from the overlap areas of corresponding patches, by a density-adaptive matching module capable to deal with varying point density. Extensive evaluation of CoFiNet on both indoor and outdoor standard benchmarks shows our superiority over existing methods. Especially on 3DLoMatch where point clouds share less overlap, CoFiNet significantly outperforms state-of-the-art approaches by at least 5% on Registration Recall, with at most two-third of their parameters.}, bibtype = {article}, author = {Yu, Hao and Li, Fu and Saleh, Mahdi and Busam, Benjamin and Ilic, Slobodan}, doi = {10.48550/arxiv.2110.14076} }
@article{ title = {Statistical Estimation of the Kullback–Leibler Divergence}, type = {article}, year = {2021}, keywords = {Gaussian model,Kullback–Leibler divergence,Shannon differential entropy,asymptotic behavior,k-nearest neighbor statistics,mixtures,statistical estimates}, pages = {544}, volume = {9}, websites = {https://www.mdpi.com/2227-7390/9/5/544}, month = {1}, id = {c2c3961f-21c1-3e9a-bfd3-1f8e9a133643}, created = {2022-03-28T09:45:01.047Z}, accessed = {2021-09-14}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:01:13.434Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {bulinskiStatisticalEstimationKullback2021}, source_type = {article}, notes = {Number: 5; Publisher: Multidisciplinary Digital Publishing Institute}, private_publication = {false}, abstract = {Asymptotic unbiasedness and L2-consistency are established, under mild conditions, for the estimates of the Kullback–Leibler divergence between two probability measures in Rd, absolutely continuous with respect to (w.r.t.) the Lebesgue measure. These estimates are based on certain k-nearest neighbor statistics for pair of independent identically distributed (i.i.d.) due vector samples. The novelty of results is also in treating mixture models. In particular, they cover mixtures of nondegenerate Gaussian measures. The mentioned asymptotic properties of related estimators for the Shannon entropy and cross-entropy are strengthened. Some applications are indicated.}, bibtype = {article}, author = {Bulinski, Alexander and Dimitrov, Denis}, doi = {10.3390/math9050544}, journal = {Mathematics}, number = {5} }
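For readers who want to experiment with the k-nearest-neighbor divergence estimates discussed in the entry above, the following is a minimal sketch of the standard k-NN estimator of KL(P||Q) from two i.i.d. samples. It is a generic textbook-style construction (the function name, default k, and use of SciPy are our assumptions), not code from the cited paper.

# Illustrative k-nearest-neighbor estimator of D_KL(P || Q) from i.i.d. samples.
# Assumes no duplicate points, so nearest-neighbor distances are strictly positive.
import numpy as np
from scipy.spatial import cKDTree

def knn_kl_divergence(x, y, k=1):
    """Estimate KL(P || Q) from samples x ~ P with shape (n, d) and y ~ Q with shape (m, d)."""
    n, d = x.shape
    m, _ = y.shape
    # Distance from each x_i to its k-th nearest neighbour among the other x's
    # (query k+1 points because the closest match is x_i itself, at distance 0).
    rho = cKDTree(x).query(x, k=k + 1)[0][:, -1]
    # Distance from each x_i to its k-th nearest neighbour among the y's.
    nu = cKDTree(y).query(x, k=k)[0]
    nu = nu[:, -1] if nu.ndim > 1 else nu
    return d * np.mean(np.log(nu / rho)) + np.log(m / (n - 1))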
@article{ title = {Knowledge Distillation and Student-Teacher Learning for Visual Intelligence: A Review and New Outlooks}, type = {article}, year = {2021}, keywords = {Computational modeling,Deep neural networks (DNN),Knowledge distillation (KD),Knowledge transfer,Measurement,Speech recognition,Student-Teacher learning (S-T),Task analysis,Training,Visual intelligence.,Visualization}, pages = {1}, id = {321ac3b3-55e4-36ce-94c4-702bb2c6e7e4}, created = {2022-03-28T09:45:01.256Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:01:26.749Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {wangKnowledgeDistillationStudentTeacher2021}, source_type = {article}, short_title = {Knowledge Distillation and Student-Teacher }, notes = {Conference Name: IEEE Transactions on Pattern Analysis and Machine Intelligence}, private_publication = {false}, abstract = {Deep neural models, in recent years, have been successful in almost every field. However, these models are huge, demanding heavy computation power. Besides, the performance boost is highly dependent on redundant labeled data. To achieve faster speeds and to handle the problems caused by the lack of labeled data, knowledge distillation (KD) has been proposed to transfer information learned from one model to another. KD is often characterized by the so-called ‘Student-Teacher’ (S-T) learning framework and has been broadly applied in model compression and knowledge transfer. This paper is about KD and S-T learning, which are being actively studied in recent years. First, we aim to provide explanations of what KD is and how/why it works. Then, we provide a comprehensive survey on the recent progress of KD methods together with S-T frameworks typically used for vision tasks. In general, we investigate some fundamental questions that have been driving this research area and thoroughly generalize the research progress and technical details. Additionally, we systematically analyze the research status of KD in vision applications. Finally, we discuss the potentials and open challenges of existing methods and prospect the future directions of KD and S-T learning.}, bibtype = {article}, author = {Wang, Lin and Yoon, Kuk-Jin}, doi = {10.1109/TPAMI.2021.3055564}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence} }
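As a concrete illustration of the Student-Teacher transfer idea surveyed above, here is a minimal sketch of the classic soft-target distillation loss: a temperature-scaled KL term between teacher and student distributions plus an ordinary cross-entropy on the labels. The function name, temperature, and weighting are illustrative assumptions, not taken from the review.

# Minimal soft-target knowledge distillation loss (Hinton-style), shown for illustration only.
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, labels, T=4.0, alpha=0.7):
    # Soft targets: the student matches the teacher's temperature-softened distribution.
    soft = F.kl_div(
        F.log_softmax(student_logits / T, dim=1),
        F.softmax(teacher_logits / T, dim=1),
        reduction="batchmean",
    ) * (T * T)
    # Hard targets: ordinary cross-entropy against the ground-truth labels.
    hard = F.cross_entropy(student_logits, labels)
    return alpha * soft + (1.0 - alpha) * hard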
@article{ title = {An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale}, type = {article}, year = {2021}, keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Re,Computer Science - Machine Learning}, websites = {http://arxiv.org/abs/2010.11929}, month = {6}, id = {ffa066fd-9bc0-3f2a-9a44-ceb9c3ce9d4f}, created = {2022-03-28T09:45:01.398Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:01:34.686Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {dosovitskiyImageWorth16x162021}, source_type = {article}, short_title = {An Image is Worth 16x16 Words}, notes = {arXiv: 2010.11929}, private_publication = {false}, abstract = {While the Transformer architecture has become the de-facto standard for natural language processing tasks, its applications to computer vision remain limited. In vision, attention is either applied in conjunction with convolutional networks, or used to replace certain components of convolutional networks while keeping their overall structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring substantially fewer computational resources to train.}, bibtype = {article}, author = {Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil}, journal = {arXiv:2010.11929 [cs]} }
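The core idea of the entry above, treating an image as a sequence of 16x16 patch tokens fed to a plain Transformer, can be sketched in a few lines. The module below is a simplified, assumed re-implementation of only the patch-embedding step (names and sizes are placeholders), not the authors' code.

# Sketch of ViT-style patch embedding: split the image into patches, project each patch
# linearly, prepend a class token, and add learned position embeddings.
import torch
import torch.nn as nn

class PatchEmbedding(nn.Module):
    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        self.num_patches = (img_size // patch_size) ** 2
        # A strided convolution is equivalent to flattening each patch and applying
        # one shared linear projection.
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, self.num_patches + 1, embed_dim))

    def forward(self, x):                                    # x: (B, C, H, W)
        tokens = self.proj(x).flatten(2).transpose(1, 2)     # (B, N, D)
        cls = self.cls_token.expand(x.shape[0], -1, -1)
        return torch.cat([cls, tokens], dim=1) + self.pos_embed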
@inproceedings{ title = {ToFNest: Efficient Normal Estimation for Time-of-Flight Depth Cameras}, type = {inproceedings}, year = {2021}, pages = {1791-1798}, websites = {https://openaccess.thecvf.com/content/ICCV2021W/ACVR/html/Molnar_ToFNest_Efficient_Normal_Estimation_for_Time-of-Flight_Depth_Cameras_ICCVW_2021_paper.html}, id = {0c1069c6-961c-3920-9f15-8117f67a94e9}, created = {2022-03-28T09:45:01.400Z}, accessed = {2022-01-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:02:22.025Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {molnarToFNestEfficientNormal2021}, source_type = {inproceedings}, short_title = {ToFNest}, private_publication = {false}, bibtype = {inproceedings}, author = {Molnár, Szilárd and Kelényi, Benjamin and Tamás, Levente} }
@inproceedings{ title = {Deep Optimized Priors for 3D Shape Modeling and Reconstruction}, type = {inproceedings}, year = {2021}, pages = {3269-3278}, websites = {https://openaccess.thecvf.com/content/CVPR2021/html/Yang_Deep_Optimized_Priors_for_3D_Shape_Modeling_and_Reconstruction_CVPR_2021_paper.html}, id = {93a3ccc0-a72f-3ac3-9cc5-ecd3609e91ec}, created = {2022-03-28T09:45:01.555Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:02:02.695Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {yangDeepOptimizedPriors2021}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Yang, Mingyue and Wen, Yuxin and Chen, Weikai and Chen, Yongwei and Jia, Kui} }
@article{ title = {Feature Pyramid Network Based Efficient Normal Estimation and Filtering for Time-of-Flight Depth Cameras}, type = {article}, year = {2021}, pages = {6257}, volume = {21}, month = {9}, id = {b814c2ae-0e98-3fa6-a236-e73d5d250c4f}, created = {2022-03-28T09:45:01.684Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-06-24T10:35:25.892Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {molnarFeaturePyramidNetwork2021}, source_type = {article}, private_publication = {false}, abstract = {In this paper, an efficient normal estimation and filtering method for depth images acquired by Time-of-Flight (ToF) cameras is proposed. The method is based on a common feature pyramid networks (FPN) architecture. The normal estimation method is called ToFNest, and the filtering method ToFClean. Both of these low-level 3D point cloud processing methods start from the 2D depth images, projecting the measured data into the 3D space and computing a task-specific loss function. Despite the simplicity, the methods prove to be efficient in terms of robustness and runtime. In order to validate the methods, extensive evaluations on public and custom datasets were performed. Compared with the state-of-the-art methods, the ToFNest and ToFClean algorithms are faster by an order of magnitude without losing precision on public datasets.}, bibtype = {article}, author = {Molnár, Szilárd and Kelenyi, Benjamin and Tamas, Levente}, doi = {10.3390/s21186257}, journal = {Sensors} }
@article{ title = {Plausible 3D Face Wrinkle Generation Using Variational Autoencoders}, type = {article}, year = {2021}, pages = {1}, websites = {https://www.computer.org/csdl/journal/tg/5555/01/09321747/1qmbmWE93ZS}, month = {1}, id = {8e665205-a603-3249-a9ae-93610b2b42fb}, created = {2022-03-28T09:45:02.307Z}, accessed = {2022-03-22}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:03:17.033Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {dengPlausible3DFace2021}, source_type = {article}, notes = {Publisher: IEEE Computer Society}, private_publication = {false}, abstract = {Realistic 3D facial modeling and animation have been increasingly used in many graphics, animation, and virtual reality applications. However, generating realistic fine-scale wrinkles on 3D faces, in particular, on animated 3D faces, is still a challenging problem that is far away from being resolved. In this paper we propose an end-to-end system to automatically augment coarse-scale 3D faces with synthesized fine-scale geometric wrinkles. By formulating the wrinkle generation problem as a supervised generation task, we implicitly model the continuous space of face wrinkles via a compact generative model, such that plausible face wrinkles can be generated through effective sampling and interpolation in the space. We also introduce a complete pipeline to transfer the synthesized wrinkles between faces with different shapes and topologies. Through many experiments, we demonstrate our method can robustly synthesize plausible fine-scale wrinkles on a variety of coarse-scale 3D faces with different shapes and expressions.}, bibtype = {article}, author = {Deng, Qixin and Ma, Luming and Jin, Aobo and Bi, Huikun and Le, Binh Huy and Deng, Zhigang}, doi = {10.1109/TVCG.2021.3051251}, journal = {IEEE Transactions on Visualization and Computer Graphics}, number = {01} }
@inproceedings{ title = {Task-Generic Hierarchical Human Motion Prior using VAEs}, type = {inproceedings}, year = {2021}, keywords = {Animation,Interpolation,Motion estimation,Pose estimation,Predictive models,Three-dimensional displays,Training,n/a}, pages = {771-781}, month = {12}, id = {32e7dd96-4d04-3ba9-8830-59d6d3f34a01}, created = {2022-03-28T09:45:02.580Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:03:35.776Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {liTaskGenericHierarchicalHuman2021}, source_type = {inproceedings}, notes = {ISSN: 2475-7888}, private_publication = {false}, abstract = {A deep generative model that describes human motions can benefit a wide range of fundamental computer vision and graphics tasks, such as providing robustness to video-based human pose estimation, predicting complete body movements for motion capture systems during occlusions, and assisting key frame animation with plausible movements. In this paper, we present a method for learning complex human motions independent of specific tasks using a combined global and local latent space to facilitate coarse and fine-grained modeling. Specifically, we propose a hierarchical motion variational autoencoder (HM-VAE) that consists of a 2-level hierarchical latent space. While the global latent space captures the overall global body motion, the local latent space enables to capture the refined poses of the different body parts. We demonstrate the effectiveness of our hierarchical motion variational autoencoder in a variety of tasks including video-based human pose estimation, motion completion from partial observations, and motion synthesis from sparse key-frames. Even though, our model has not been trained for any of these tasks specifically, it provides superior performance than task-specific alternatives. Our general-purpose human motion prior model can fix corrupted human body animations and generate complete movements from incomplete observations.}, bibtype = {inproceedings}, author = {Li, Jiaman and Villegas, Ruben and Ceylan, Duygu and Yang, Jimei and Kuang, Zhengfei and Li, Hao and Zhao, Yajie}, doi = {10.1109/3DV53792.2021.00086}, booktitle = {2021 International Conference on 3D Vision (3DV)} }
@article{ title = {Sparse Data Driven Mesh Deformation}, type = {article}, year = {2021}, keywords = {Computational modeling,Data driven,Deformable models,Geometry,Interpolation,Manifolds,Shape,Strain,large scale deformation,real-time deformation,sparsity}, pages = {2085-2100}, volume = {27}, month = {3}, id = {0a9c6e1a-47ea-3992-aad8-bf0760490b23}, created = {2022-03-28T09:45:02.604Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:03:59.342Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {gaoSparseDataDriven2021}, source_type = {article}, notes = {Conference Name: IEEE Transactions on Visualization and Computer Graphics}, private_publication = {false}, abstract = {Example-based mesh deformation methods are powerful tools for realistic shape editing. However, existing techniques typically combine all the example deformation modes, which can lead to overfitting, i.e., using an overly complicated model to explain the user-specified deformation. This leads to implausible or unstable deformation results, including unexpected global changes outside the region of interest. To address this fundamental limitation, we propose a sparse blending method that automatically selects a smaller number of deformation modes to compactly describe the desired deformation. This along with a suitably chosen deformation basis including spatially localized deformation modes leads to significant advantages, including more meaningful, reliable, and efficient deformations because fewer and localized deformation modes are applied. To cope with large rotations, we develop a simple but effective representation based on polar decomposition of deformation gradients, which resolves the ambiguity of large global rotations using an as-consistent-as-possible global optimization. This simple representation has a closed form solution for derivatives, making it efficient for our sparse localized representation and thus ensuring interactive performance. Experimental results show that our method outperforms state-of-the-art data-driven mesh deformation methods, for both quality of results and efficiency.}, bibtype = {article}, author = {Gao, Lin and Lai, Yu-Kun and Yang, Jie and Zhang, Ling-Xiao and Xia, Shihong and Kobbelt, Leif}, doi = {10.1109/TVCG.2019.2941200}, journal = {IEEE Transactions on Visualization and Computer Graphics}, number = {3} }
@article{ title = {On relationships between the Pearson and the distance correlation coefficients}, type = {article}, year = {2021}, keywords = {Distance correlation,Distance covariance,Mixture distribution,Pearson correlation,Rademacher distribution}, pages = {108960}, volume = {169}, websites = {https://www.sciencedirect.com/science/article/pii/S0167715220302637}, month = {2}, id = {54d6eda3-9dcc-3de9-8891-15fdd39d1fd2}, created = {2022-03-28T09:45:03.188Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:04:43.569Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {edelmannRelationshipsPearsonDistance2021}, source_type = {article}, private_publication = {false}, abstract = {In this paper we show that for any fixed Pearson correlation coefficient strictly between −1 and 1, the distance correlation coefficient can take any value in the open unit interval (0,1).}, bibtype = {article}, author = {Edelmann, Dominic and Móri, Tamás F and Székely, Gábor J}, doi = {10.1016/j.spl.2020.108960}, journal = {Statistics \& Probability Letters} }
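To make the contrast with Pearson's r concrete, the sketch below computes the (biased) sample distance correlation from pairwise Euclidean distance matrices using the standard double-centering recipe. It is a generic formula sketch, not code from the cited note.

# Biased sample distance correlation between two samples of equal length.
import numpy as np
from scipy.spatial.distance import pdist, squareform

def distance_correlation(x, y):
    """x, y: arrays of shape (n,) or (n, d); returns dCor in [0, 1]."""
    x = np.asarray(x, dtype=float).reshape(len(x), -1)
    y = np.asarray(y, dtype=float).reshape(len(y), -1)
    a = squareform(pdist(x))                                  # pairwise Euclidean distances
    b = squareform(pdist(y))
    A = a - a.mean(0) - a.mean(1)[:, None] + a.mean()         # double centering
    B = b - b.mean(0) - b.mean(1)[:, None] + b.mean()
    dcov2 = (A * B).mean()
    dvar_x, dvar_y = (A * A).mean(), (B * B).mean()
    return np.sqrt(dcov2 / np.sqrt(dvar_x * dvar_y))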
@article{ title = {Tutorial on Variational Autoencoders}, type = {article}, year = {2021}, keywords = {Computer Science - Machine Learning,Statistics - Machine Learning}, websites = {http://arxiv.org/abs/1606.05908}, month = {1}, id = {ad51d3c4-4e54-346a-8c24-25c3db992d38}, created = {2022-03-28T09:45:03.303Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:05:22.545Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {doerschTutorialVariationalAutoencoders2021}, source_type = {article}, notes = {arXiv: 1606.05908}, private_publication = {false}, abstract = {In just three years, Variational Autoencoders (VAEs) have emerged as one of the most popular approaches to unsupervised learning of complicated distributions. VAEs are appealing because they are built on top of standard function approximators (neural networks), and can be trained with stochastic gradient descent. VAEs have already shown promise in generating many kinds of complicated data, including handwritten digits, faces, house numbers, CIFAR images, physical models of scenes, segmentation, and predicting the future from static images. This tutorial introduces the intuitions behind VAEs, explains the mathematics behind them, and describes some empirical behavior. No prior knowledge of variational Bayesian methods is assumed.}, bibtype = {article}, author = {Doersch, Carl}, journal = {arXiv:1606.05908 [cs, stat]} }
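A minimal sketch of the machinery the tutorial above explains: an encoder that outputs a mean and log-variance, the reparameterization trick, and the negative ELBO written as a reconstruction term plus a KL term against a standard normal prior. Layer sizes and names are arbitrary placeholders, and the Bernoulli reconstruction loss assumes inputs scaled to [0, 1].

# Toy VAE forward pass illustrating reparameterization and the negative ELBO.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyVAE(nn.Module):
    def __init__(self, x_dim=784, h_dim=256, z_dim=16):
        super().__init__()
        self.enc = nn.Linear(x_dim, h_dim)
        self.mu, self.logvar = nn.Linear(h_dim, z_dim), nn.Linear(h_dim, z_dim)
        self.dec = nn.Sequential(nn.Linear(z_dim, h_dim), nn.ReLU(), nn.Linear(h_dim, x_dim))

    def forward(self, x):
        h = F.relu(self.enc(x))
        mu, logvar = self.mu(h), self.logvar(h)
        z = mu + torch.exp(0.5 * logvar) * torch.randn_like(mu)   # reparameterization trick
        x_hat = self.dec(z)
        # Negative ELBO = reconstruction term + KL(q(z|x) || N(0, I))
        recon = F.binary_cross_entropy_with_logits(x_hat, x, reduction="sum")
        kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        return x_hat, recon + kl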
@article{ title = {Mobil robotkar felhasználása a mezőgazdaságban: Mobile robotic manipulator for precision agriculture}, type = {article}, year = {2021}, keywords = {navigation}, pages = {55-61}, websites = {https://ojs.emt.ro/index.php/enelko-szamokt/article/view/622}, month = {10}, id = {437ebb46-16b4-3dd9-991f-caf5bcfb0459}, created = {2022-03-28T09:45:03.363Z}, accessed = {2022-01-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:04:59.575Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {orsMobilRobotkarFelhasznalasa2021}, source_type = {article}, short_title = {Mobil robotkar felhasználása a mezőgazdaságban}, private_publication = {false}, abstract = {In this paper a mobile manipulator arm for automated harvesting is proposed. The device can be useful in settings where human labor is scarce or expensive. The main parts of the assembly are a four-wheeled robot, a robotic arm, a depth-sensing camera, and an embedded GPU device. The device can navigate through obstacles without hitting them and find a path to the targeted plant. Using the camera it can recognize the fruits and, based on visual servoing, it can crop them. Abstract (translated from Hungarian): In this paper we present a mobile robotic arm that can be of great help in agriculture, especially in places where human labor is in short supply. The main components of the device are a four-wheeled mobile robot and a robotic arm. The assembly can navigate among obstacles and, when it finds a plant, it can recognize the fruit on it, pick it, and transport it.}, bibtype = {article}, author = {Köllő, Magor Örs and Molnár, Szilárd and Tamás, Levente}, journal = {Energetika-Elektrotechnika – Számítástechnika és Oktatás Multi-konferencia} }
@article{ title = {Learned Point Cloud Geometry Compression}, type = {article}, year = {2021}, keywords = {Computer Science - Computer Vision and Pattern Rec,Electrical Engineering and Systems Science - Imag}, pages = {4909-4923}, volume = {31}, websites = {http://arxiv.org/abs/1909.12037}, month = {12}, id = {899c47d0-e122-3173-a877-e973e289ad74}, created = {2022-03-28T09:45:03.904Z}, accessed = {2022-01-05}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:06:27.499Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {wangLearnedPointCloud2021}, source_type = {article}, notes = {arXiv: 1909.12037}, private_publication = {false}, abstract = {This paper presents a novel end-to-end Learned Point Cloud Geometry Compression (a.k.a., Learned-PCGC) framework, to efficiently compress the point cloud geometry (PCG) using deep neural networks (DNN) based variational autoencoders (VAE). In our approach, PCG is first voxelized, scaled and partitioned into non-overlapped 3D cubes, which is then fed into stacked 3D convolutions for compact latent feature and hyperprior generation. Hyperpriors are used to improve the conditional probability modeling of latent features. A weighted binary cross-entropy (WBCE) loss is applied in training while an adaptive thresholding is used in inference to remove unnecessary voxels and reduce the distortion. Objectively, our method exceeds the geometry-based point cloud compression (G-PCC) algorithm standardized by well-known Moving Picture Experts Group (MPEG) with a significant performance margin, e.g., at least 60\% BD-Rate (Bjontegaard Delta Rate) gains, using common test datasets. Subjectively, our method has presented better visual quality with smoother surface reconstruction and appealing details, in comparison to all existing MPEG standard compliant PCC methods. Our method requires about 2.5MB parameters in total, which is a fairly small size for practical implementation, even on embedded platform. Additional ablation studies analyze a variety of aspects (e.g., cube size, kernels, etc) to explore the application potentials of our learned-PCGC.}, bibtype = {article}, author = {Wang, Jianqiang and Zhu, Hao and Ma, Zhan and Chen, Tong and Liu, Haojie and Shen, Qiu}, doi = {10.1109/TCSVT.2021.3051377}, journal = {IEEE Transactions on Circuits and Systems for Video Technology}, number = {12} }
@inproceedings{ title = {Private-Shared Disentangled Multimodal VAE for Learning of Latent Representations}, type = {inproceedings}, year = {2021}, pages = {1692-1700}, websites = {https://openaccess.thecvf.com/content/CVPR2021W/MULA/html/Lee_Private-Shared_Disentangled_Multimodal_VAE_for_Learning_of_Latent_Representations_CVPRW_2021_paper.html}, id = {246f0810-870d-3ed4-8765-aef5efe93149}, created = {2022-03-28T09:45:04.311Z}, accessed = {2021-09-29}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:07:24.837Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {leePrivateSharedDisentangledMultimodal2021}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Lee, Mihee and Pavlovic, Vladimir} }
@inproceedings{ title = {NeRF-VAE: A Geometry Aware 3D Scene Generative Model}, type = {inproceedings}, year = {2021}, pages = {5742-5752}, websites = {https://proceedings.mlr.press/v139/kosiorek21a.html}, month = {7}, publisher = {PMLR}, id = {90ecafa0-c3e0-3a6c-80b9-e5ec026decfe}, created = {2022-03-28T09:45:04.596Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:07:34.628Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {kosiorekNeRFVAEGeometryAware2021a}, source_type = {inproceedings}, short_title = {NeRF-VAE}, notes = {ISSN: 2640-3498}, private_publication = {false}, abstract = {We propose NeRF-VAE, a 3D scene generative model that incorporates geometric structure via Neural Radiance Fields (NeRF) and differentiable volume rendering. In contrast to NeRF, our model takes into account shared structure across scenes, and is able to infer the structure of a novel scene—without the need to re-train—using amortized inference. NeRF-VAE’s explicit 3D rendering process further contrasts previous generative models with convolution-based rendering which lacks geometric structure. Our model is a VAE that learns a distribution over radiance fields by conditioning them on a latent scene representation. We show that, once trained, NeRF-VAE is able to infer and render geometrically-consistent scenes from previously unseen 3D environments of synthetic scenes using very few input images. We further demonstrate that NeRF-VAE generalizes well to out-of-distribution cameras, while convolutional models do not. Finally, we introduce and study an attention-based conditioning mechanism of NeRF-VAE’s decoder, which improves model performance.}, bibtype = {inproceedings}, author = {Kosiorek, Adam R and Strathmann, Heiko and Zoran, Daniel and Moreno, Pol and Schneider, Rosalia and Mokra, Sona and Rezende, Danilo Jimenez}, booktitle = {Proceedings of the 38th International Conference on Machine Learning} }
@article{ title = {NVAE: A Deep Hierarchical Variational Autoencoder}, type = {article}, year = {2021}, keywords = {Computer Science - Computer Vision and Pattern Re,Computer Science - Machine Learning,Statistics - Machine Learning}, websites = {http://arxiv.org/abs/2007.03898}, month = {1}, id = {040f5984-81fd-351b-aec7-93f7b60352ed}, created = {2022-03-28T09:45:04.637Z}, accessed = {2022-03-22}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:07:42.278Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {vahdatNVAEDeepHierarchical2021}, source_type = {article}, short_title = {NVAE}, notes = {arXiv: 2007.03898}, private_publication = {false}, abstract = {Normalizing flows, autoregressive models, variational autoencoders (VAEs), and deep energy-based models are among competing likelihood-based frameworks for deep generative learning. Among them, VAEs have the advantage of fast and tractable sampling and easy-to-access encoding networks. However, they are currently outperformed by other models such as normalizing flows and autoregressive models. While the majority of the research in VAEs is focused on the statistical challenges, we explore the orthogonal direction of carefully designing neural architectures for hierarchical VAEs. We propose Nouveau VAE (NVAE), a deep hierarchical VAE built for image generation using depth-wise separable convolutions and batch normalization. NVAE is equipped with a residual parameterization of Normal distributions and its training is stabilized by spectral regularization. We show that NVAE achieves state-of-the-art results among non-autoregressive likelihood-based models on the MNIST, CIFAR-10, CelebA 64, and CelebA HQ datasets and it provides a strong baseline on FFHQ. For example, on CIFAR-10, NVAE pushes the state-of-the-art from 2.98 to 2.91 bits per dimension, and it produces high-quality images on CelebA HQ. To the best of our knowledge, NVAE is the first successful VAE applied to natural images as large as 256×256 pixels. The source code is available at https://github.com/NVlabs/NVAE.}, bibtype = {article}, author = {Vahdat, Arash and Kautz, Jan}, journal = {arXiv:2007.03898 [cs, stat]} }
@article{ title = {Very Deep VAEs Generalize Autoregressive Models and Can Outperform Them on Images}, type = {article}, year = {2021}, keywords = {Computer Science - Computer Vision and Pattern Rec,Computer Science - Machine Learning}, websites = {http://arxiv.org/abs/2011.10650}, month = {3}, id = {005bef88-e525-3b32-a0b3-5575ecdba094}, created = {2022-03-28T09:45:06.105Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-01T09:16:29.684Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {childVeryDeepVAEs2021}, source_type = {article}, notes = {arXiv: 2011.10650}, private_publication = {false}, abstract = {We present a hierarchical VAE that, for the first time, generates samples quickly while outperforming the PixelCNN in log-likelihood on all natural image benchmarks. We begin by observing that, in theory, VAEs can actually represent autoregressive models, as well as faster, better models if they exist, when made sufficiently deep. Despite this, autoregressive models have historically outperformed VAEs in log-likelihood. We test if insufficient depth explains why by scaling a VAE to greater stochastic depth than previously explored and evaluating it CIFAR-10, ImageNet, and FFHQ. In comparison to the PixelCNN, these very deep VAEs achieve higher likelihoods, use fewer parameters, generate samples thousands of times faster, and are more easily applied to high-resolution images. Qualitative studies suggest this is because the VAE learns efficient hierarchical visual representations. We release our source code and models at https://github.com/openai/vdvae.}, bibtype = {article}, author = {Child, Rewon}, journal = {arXiv:2011.10650 [cs]} }
@inproceedings{ title = {ReZero is all you need: fast convergence at large depth}, type = {inproceedings}, year = {2021}, pages = {1352-1361}, websites = {https://proceedings.mlr.press/v161/bachlechner21a.html}, month = {12}, publisher = {PMLR}, id = {d84b7a5f-5fc7-344e-99b2-816e14d1a21f}, created = {2022-03-28T09:45:06.262Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-01T09:16:36.292Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {bachlechnerReZeroAllYou2021}, source_type = {inproceedings}, short_title = {ReZero is all you need}, notes = {ISSN: 2640-3498}, private_publication = {false}, abstract = {Deep networks often suffer from vanishing or exploding gradients due to inefficient signal propagation, leading to long training times or convergence difficulties. Various architecture designs, sophisticated residual-style networks, and initialization schemes have been shown to improve deep signal propagation. Recently, Pennington et al. [2017] used free probability theory to show that dynamical isometry plays an integral role in efficient deep learning. We show that the simplest architecture change of gating each residual connection using a single zero-initialized parameter satisfies initial dynamical isometry and outperforms more complex approaches. Although much simpler than its predecessors, this gate enables training thousands of fully connected layers with fast convergence and better test performance for ResNets trained on an image recognition task. We apply this technique to language modeling and find that we can easily train 120-layer Transformers. When applied to 12-layer Transformers, it converges 56% faster.}, bibtype = {inproceedings}, author = {Bachlechner, Thomas and Majumder, Bodhisattwa Prasad and Mao, Henry and Cottrell, Gary and McAuley, Julian}, booktitle = {Proceedings of the Thirty-Seventh Conference on Uncertainty in Artificial Intelligence} }
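Note: the core ReZero change described above is small enough to sketch directly: every residual branch is scaled by a single learnable parameter initialized to zero, so each block starts as the identity map. The module below is a hedged illustration in PyTorch, not the authors' implementation; the branch architecture is an arbitrary placeholder.

import torch
import torch.nn as nn

class ReZeroBlock(nn.Module):
    def __init__(self, dim, hidden):
        super().__init__()
        self.branch = nn.Sequential(nn.Linear(dim, hidden), nn.ReLU(), nn.Linear(hidden, dim))
        self.alpha = nn.Parameter(torch.zeros(1))  # single zero-initialized residual gate

    def forward(self, x):
        # At initialization alpha == 0, so the block is exactly the identity,
        # which is what gives the initial dynamical isometry cited above.
        return x + self.alpha * self.branch(x)

x = torch.randn(4, 64)
print(ReZeroBlock(64, 256)(x).shape)  # torch.Size([4, 64])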
@inproceedings{ title = {3D Semantic Label Transfer in Human-Robot Collaboration}, type = {inproceedings}, year = {2021}, pages = {2602-2611}, websites = {https://openaccess.thecvf.com/content/ICCV2021W/CVinHRC/html/Rozenberszki_3D_Semantic_Label_Transfer_in_Human-Robot_Collaboration_ICCVW_2021_paper.html}, id = {c4c9ba73-1d7b-30bc-a927-8028aa77f395}, created = {2022-03-28T09:45:06.498Z}, accessed = {2021-10-20}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T07:59:40.722Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {rozenberszki3DSemanticLabel2021}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Rozenberszki, David and Soros, Gabor and Szeier, Szilvia and Lorincz, Andras} }
@article{ title = {Automated building change detection with amodal completion of point clouds}, type = {article}, year = {2021}, keywords = {Building information modeling,Change detection,Deep learning,Point cloud,Point cloud completion}, pages = {103568}, volume = {124}, websites = {https://www.sciencedirect.com/science/article/pii/S0926580521000194}, month = {4}, id = {8249925c-4de8-394b-8e2e-f7bea0fd31a3}, created = {2022-03-28T09:45:06.711Z}, accessed = {2022-03-22}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:00:45.489Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {czerniawskiAutomatedBuildingChange2021}, source_type = {article}, private_publication = {false}, abstract = {When updating digital models of existing buildings, changes in the built environment are detected by comparing outdated BIMs with captured point clouds representing current conditions. Here we show that point cloud completion (i.e. automated filling-in of missing data) improves the accuracy of change detection. We perform point cloud completion using a hierarchical deep variational autoencoder (a type of artificial neural network) modified to include skip connections between the convolution and deconvolution layers. The resulting receiver operating characteristic curve shows that completion boosts change detection performance from a total area under the curve of 0.55 to 0.75. Completion achieves this by eliminating differences between the BIM and point cloud inputs that are a consequence of incompleteness while distilling the differences due to building change. We anticipate that automated change detection methods with resilience to imperfect data will become more critical as automated building analyses become increasingly abstracted from data collection.}, bibtype = {article}, author = {Czerniawski, Thomas and Ma, Jong Won and Leite, Fernanda}, doi = {10.1016/j.autcon.2021.103568}, journal = {Automation in Construction} }
@article{ title = {Strategies of attack–defense game for wireless sensor networks considering the effect of confidence level in fuzzy environment}, type = {article}, year = {2021}, keywords = {Fuzzy set,Malware,Reliability analysis approach,Wireless sensor network}, pages = {104238}, volume = {102}, websites = {https://doi.org/10.1016/j.engappai.2021.104238}, publisher = {Elsevier Ltd}, id = {8f099e30-11fb-307b-b709-35d356e5f8b3}, created = {2022-04-05T05:35:07.815Z}, file_attached = {false}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-05T05:35:07.815Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {It is a common case that Wireless Sensor Networks are attacked by malware in the real world. According to game theory, the action of attack–defense between a Wireless Sensor Network (WSN) and malware can be regarded as a game. While substantial efforts have been made to address this issue, most of these efforts have predominantly focused on the analysis of attack–defense game in the known environment. Given that the process of gaming in real world often contains a lot of fuzzy information, we extend the focus in this line by considering the fuzzy exterior environment. Specifically, we assume the WSN attack–defense Stackelberg game is in the fuzzy environment by using fuzzy variable. Then Stackelberg game theory is utilized to calculate the equilibrium solutions of the introduced maximax chance-constrained model and minimax chance-constrained model. Based on the simulation data, this study demonstrates that the confidence levels and decision perspectives affect the optimal strategy of WSN and the reliability of WSN. Finally, the novel analytical method is compared with the non-fuzzy WSN attack–defense game method. The analysis shows that the novel approach is optimal in terms of predicting the behavior of malware in resisting the attack of malware.}, bibtype = {article}, author = {Wu, Yingfu and Kang, Bingyi and Wu, Hao}, doi = {10.1016/j.engappai.2021.104238}, journal = {Engineering Applications of Artificial Intelligence}, number = {April} }
@article{ title = {Regression-based methods for face alignment: A survey}, type = {article}, year = {2021}, keywords = {Face alignment,Facial feature localization,Facial landmarks detection,Regression,Survey}, volume = {178}, id = {c6757daa-bd68-3d66-a352-21e01b14921f}, created = {2022-04-05T05:35:07.934Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-07T06:10:54.749Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d44c1c58-0149-4360-9eaf-8e2a9b657b50}, private_publication = {false}, abstract = {Face alignment is the process of determining a face shape given its location and size in an image. It is used as a basis for other facial analysis tasks and for human-machine interaction and augmented reality applications. It is a challenging problem due to the extremely high variability in facial appearance affected by many external (illumination, occlusion, head pose) and internal factors (race, facial expression). However, advances in deep learning combined with domain-related knowledge from previous research recently demonstrated impressive results nearly saturating the unconstrained benchmark data sets. The focus is shifting towards reducing the computational burden of the face alignment models since real-time performance is required for such a highly dynamic task. Furthermore, many applications target devices on the edge with limited computational power which puts even greater emphasis on computational efficiency. We present the latest development in regression-based approaches that have led towards nearly solving the face alignment problem in an unconstrained scenario. Various regression architectures are systematically explored and recent training techniques discussed in the context of face alignment. Finally, a benchmark comparison of the most successful methods is presented, taking into account execution time as well, to provide a comprehensive overview of this dynamic research field.}, bibtype = {article}, author = {Gogić, Ivan and Ahlberg, Jörgen and Pandžić, Igor S.}, doi = {10.1016/j.sigpro.2020.107755}, journal = {Signal Processing} }
@article{ title = {A comprehensive survey on 2D multi-person pose estimation methods}, type = {article}, year = {2021}, keywords = {Deep learning,Multi-person pose estimation,Survey}, pages = {104260}, volume = {102}, websites = {https://doi.org/10.1016/j.engappai.2021.104260}, publisher = {Elsevier Ltd}, id = {47ff27f2-6237-309e-a719-e22b4c33ad92}, created = {2022-04-05T05:35:08.081Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-07T06:10:54.786Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d44c1c58-0149-4360-9eaf-8e2a9b657b50}, private_publication = {false}, abstract = {Human pose estimation is a fundamental yet challenging computer vision task and has been studied by many researchers around the world in recent years. As a basic task in computer vision, multi-person pose estimation is the core component for many practical applications. This paper extensively reviews recent works on multi-person pose estimation. Specifically, we illustrate and analyze popular methods in detail and compare their pros and cons to fill in the gaps existing in other surveys. In addition, the commonly used datasets, evaluation metrics, and open-source systems are also introduced respectively. Finally, we summarize the development of multi-person pose estimation frameworks and discuss the research trends.}, bibtype = {article}, author = {Wang, Chen and Zhang, Feng and Ge, Shuzhi Sam}, doi = {10.1016/j.engappai.2021.104260}, journal = {Engineering Applications of Artificial Intelligence}, number = {April} }
@article{ title = {Variational graph autoencoders for multiview canonical correlation analysis}, type = {article}, year = {2021}, keywords = {Canonical correlation analysis,Dimensionality reduction,Graph neural networks,Multiview representation learning,Variational inference}, pages = {108182}, volume = {188}, websites = {https://doi.org/10.1016/j.sigpro.2021.108182}, publisher = {Elsevier B.V.}, id = {3608c6aa-c36f-3a4b-9bbd-3eb90edb125b}, created = {2022-04-05T05:35:08.236Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-05T05:35:14.172Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {We present a novel approach for multiview canonical correlation analysis based on a variational graph neural network model. We propose a nonlinear model which takes into account the available graph-based geometric constraints while being scalable to large-scale datasets with multiple views. This model combines the probabilistic interpretation of CCA with an autoencoder architecture based on graph convolutional neural network layers. Experiments with the proposed method are conducted on classification, clustering, and recommendation tasks on real datasets. The algorithm is competitive with state-of-the-art multiview representation learning techniques, in addition to being scalable and robust to instances with missing views.}, bibtype = {article}, author = {Kaloga, Yacouba and Borgnat, Pierre and Chepuri, Sundeep Prabhakar and Abry, Patrice and Habrard, Amaury}, doi = {10.1016/j.sigpro.2021.108182}, journal = {Signal Processing} }
@article{ title = {A comprehensive survey on digital video forensics: Taxonomy, challenges, and future directions}, type = {article}, year = {2021}, keywords = {Anti-forensics,Computer vision (CV),Deep learning (DL),Digital forensics,Evidence extraction,Forgery detection,Legal aspects,Machine learning (ML),Video forensics,Video forgery}, pages = {104456}, volume = {106}, websites = {https://doi.org/10.1016/j.engappai.2021.104456}, publisher = {Elsevier Ltd}, id = {3f503864-0e27-3a81-a5c3-21a385805a70}, created = {2022-04-07T06:10:54.442Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-07T06:10:59.507Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d44c1c58-0149-4360-9eaf-8e2a9b657b50}, private_publication = {false}, abstract = {With the explosive advancements in smartphone technology, video uploading/downloading has become a routine part of digital social networking. Video contents contain valuable information as more incidents are being recorded now than ever before. In this paper, we present a comprehensive survey on information extraction from video contents and forgery detection. In this context, we review various modern techniques such as computer vision and different machine learning (ML) algorithms including deep learning (DL) proposed for video forgery detection. Furthermore, we discuss the persistent general, resource, legal, and technical challenges, as well as challenges in using DL for the problem at hand, such as the theory behind DL, CV, limited datasets, real-time processing, and the challenges with the emergence of ML techniques used with the Internet of Things (IoT)-based heterogeneous devices. Moreover, this survey presents prominent video analysis products used for video forensics investigation and analysis. In summary, this survey provides a detailed and broader investigation about information extraction and forgery detection in video contents under one umbrella, which was not presented yet to the best of our knowledge.}, bibtype = {article}, author = {Javed, Abdul Rehman and Jalil, Zunera and Zehra, Wisha and Gadekallu, Thippa Reddy and Suh, Doug Young and Piran, Md Jalil}, doi = {10.1016/j.engappai.2021.104456}, journal = {Engineering Applications of Artificial Intelligence}, number = {August} }
@article{ title = {Kimera: from SLAM to Spatial Perception with 3D Dynamic Scene Graphs}, type = {article}, year = {2021}, keywords = {Localization,computer vision,mapping,sensing and perception,slam}, pages = {1510-1546}, volume = {40}, websites = {https://arxiv.org/abs/2101.06894v3}, month = {1}, publisher = {SAGE Publications Inc.}, day = {18}, id = {27141204-b6f1-367d-88b1-95617a96caa5}, created = {2022-06-06T05:53:43.828Z}, accessed = {2022-06-06}, file_attached = {false}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-06-06T05:54:04.302Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {6075c81a-cded-4bc7-822e-6d5f3181ca0d}, private_publication = {false}, abstract = {Humans are able to form a complex mental model of the environment they move in. This mental model captures geometric and semantic aspects of the scene, describes the environment at multiple levels of abstractions (e.g., objects, rooms, buildings), includes static and dynamic entities and their relations (e.g., a person is in a room at a given time). In contrast, current robots' internal representations still provide a partial and fragmented understanding of the environment, either in the form of a sparse or dense set of geometric primitives (e.g., points, lines, planes, voxels) or as a collection of objects. This paper attempts to reduce the gap between robot and human perception by introducing a novel representation, a 3D Dynamic Scene Graph(DSG), that seamlessly captures metric and semantic aspects of a dynamic environment. A DSG is a layered graph where nodes represent spatial concepts at different levels of abstraction, and edges represent spatio-temporal relations among nodes. Our second contribution is Kimera, the first fully automatic method to build a DSG from visual-inertial data. Kimera includes state-of-the-art techniques for visual-inertial SLAM, metric-semantic 3D reconstruction, object localization, human pose and shape estimation, and scene parsing. Our third contribution is a comprehensive evaluation of Kimera in real-life datasets and photo-realistic simulations, including a newly released dataset, uHumans2, which simulates a collection of crowded indoor and outdoor scenes. Our evaluation shows that Kimera achieves state-of-the-art performance in visual-inertial SLAM, estimates an accurate 3D metric-semantic mesh model in real-time, and builds a DSG of a complex indoor environment with tens of objects and humans in minutes. Our final contribution shows how to use a DSG for real-time hierarchical semantic path-planning. The core modules in Kimera are open-source.}, bibtype = {article}, author = {Rosinol, Antoni and Violette, Andrew and Abate, Marcus and Hughes, Nathan and Chang, Yun and Shi, Jingnan and Gupta, Arjun and Carlone, Luca}, doi = {10.48550/arxiv.2101.06894}, journal = {International Journal of Robotics Research}, number = {12-14} }
@article{ title = {SGMNet: Learning Rotation-Invariant Point Cloud Representations via Sorted Gram Matrix}, type = {article}, year = {2021}, pages = {10448-10457}, id = {ced97e6c-57e9-342f-92c9-bda70b7a0af9}, created = {2022-06-21T09:01:27.598Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-06-22T07:14:23.216Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {5177186d-8a3a-482d-a36a-536b8090101c}, private_publication = {false}, abstract = {Recently, various works that attempted to introduce rotation invariance to point cloud analysis have devised point-pair features, such as angles and distances. In these methods, however, the point-pair is only comprised of the center point and its adjacent points in a vicinity, which may bring information loss to the local feature representation. In this paper, we instead connect each point densely with all other points in a local neighborhood to compose the point-pairs. Specifically, we present a simple but effective local feature representation, called sorted Gram matrix (SGM), which is not only invariant to arbitrary rotations, but also models the pair-wise relationship of all the points in a neighborhood. In more detail, we utilize vector inner product to model distance- and angle-information between two points, and in a local patch it naturally forms a Gram matrix. In order to guarantee permutation invariance, we sort the correlation values in the Gram matrix for each point, hence the name sorted Gram matrix. Furthermore, we mathematically prove that the Gram matrix is rotation-invariant and sufficient to model the inherent structure of a point cloud patch. We then use SGM as features in convolution, which can be readily integrated as a drop-in module into any point-based networks. Finally, we evaluated the proposed method on two widely used datasets, and it outperforms previous state-of-the-art methods on both shape classification and part segmentation tasks by a large margin.}, bibtype = {article}, author = {Xu, Jianyun and Tang, Xin and Zhu, Yushi and Sun, Jie and Pu, Shiliang}, doi = {10.1109/ICCV48922.2021.01030}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
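Note: the sorted Gram matrix feature lends itself to a compact sketch: take all pairwise inner products within a local patch (centered on its centroid, an assumption made here) and sort each row, which removes the dependence on the global rotation and on how each point's neighbors are ordered. This is an illustrative reading of the abstract rather than the authors' code.

import numpy as np

def sorted_gram_matrix(patch):
    # patch: (k, 3) local neighborhood; centroid centering is an assumption here.
    centered = patch - patch.mean(axis=0, keepdims=True)
    gram = centered @ centered.T      # pairwise inner products, rotation-invariant
    return np.sort(gram, axis=1)      # each row becomes independent of neighbor ordering

patch = np.random.randn(16, 3)
R, _ = np.linalg.qr(np.random.randn(3, 3))  # a random orthogonal transform
assert np.allclose(sorted_gram_matrix(patch), sorted_gram_matrix(patch @ R.T))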
@article{ title = {Survey and Evaluation of RGB-D SLAM}, type = {article}, year = {2021}, keywords = {Computer vision,RGB-D SLAM,evaluation,robotics,survey}, pages = {21367-21387}, volume = {9}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, id = {e4479ef7-eda8-31b5-9637-d73c81949d48}, created = {2022-07-05T15:07:35.924Z}, accessed = {2022-07-05}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-07-26T06:52:36.096Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {ed605795-f522-465c-a0b8-5f8a05f7fd5f}, private_publication = {false}, abstract = {The traditional visual SLAM systems take the monocular or stereo camera as input sensor, with complex map initialization and map point triangulation steps needed for 3D map reconstruction, which are easy to fail, computationally complex and can cause noisy measurements. The emergence of RGB-D cameras, which provide RGB images together with depth information, has changed this situation. While a number of RGB-D SLAM systems have been proposed in recent years, the current classification research on RGB-D SLAM is very lacking, and their advantages and shortcomings remain unclear regarding different applications and perturbations, such as illumination transformation, noise and rolling shutter effect of sensors. In this paper, we mainly introduce the basic concept and structure of the RGB-D SLAM system, then introduce the differences between the various RGB-D SLAM systems in the three aspects of tracking, mapping, and loop detection, and make a classification study on different RGB-D SLAM algorithms according to these three aspects. Furthermore, we discuss some advanced topics and open problems of RGB-D SLAM, hoping that it will help future exploration. In the end, we conducted a large number of evaluation experiments on multiple RGB-D SLAM systems, and analyzed their advantages and disadvantages, as well as performance differences in different application scenarios, and provided references for researchers and developers.}, bibtype = {article}, author = {Zhang, Shishun and Zheng, Longyu and Tao, Wenbing}, doi = {10.1109/ACCESS.2021.3053188}, journal = {IEEE Access} }
@article{ title = {A Closer Look at Rotation-invariant Deep Point Cloud Analysis}, type = {article}, year = {2021}, pages = {16198-16207}, id = {354d05d3-a31e-3678-a228-0f7f6db3d834}, created = {2022-07-28T12:39:24.656Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-07-28T12:39:35.305Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {353ce2e2-5e70-48e5-951f-78dc31fa40d2}, private_publication = {false}, abstract = {We consider the deep point cloud analysis tasks where the inputs of the networks are randomly rotated. Recent progress in rotation-invariant point cloud analysis is mainly driven by converting point clouds into their respective canonical poses, and principal component analysis (PCA) is a practical tool to achieve this. Due to the imperfect alignment of PCA, most of the current works are devoted to developing powerful network structures and features to overcome this deficiency, without thoroughly analyzing the PCA-based canonical poses themselves. In this work, we present a detailed study w.r.t. the PCA-based canonical poses of point clouds. Our investigation reveals that the ambiguity problem associated with the PCA-based canonical poses is handled insufficiently in some recent works. To this end, we develop a simple pose selector module for disambiguation, which presents noticeable enhancement (i.e., 5.3% classification accuracy) over state-of-the-art approaches on the challenging real-world dataset.}, bibtype = {article}, author = {Li, Feiran and Fujiwara, Kent and Okura, Fumio and Matsushita, Yasuyuki}, doi = {10.1109/ICCV48922.2021.01591}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
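Note: the PCA-based canonical pose analyzed in this paper can be sketched in a few lines: rotate the cloud into the frame spanned by its principal axes before feeding it to a network. The sign and ordering ambiguity that the paper's pose selector module addresses is deliberately left unresolved in this illustration; it is not the authors' implementation.

import numpy as np

def pca_canonicalize(points):
    # points: (n, 3); returns the same cloud expressed in its PCA frame.
    centered = points - points.mean(axis=0, keepdims=True)
    _, eigvecs = np.linalg.eigh(centered.T @ centered)  # eigenvalues in ascending order
    axes = eigvecs[:, ::-1]                             # largest-variance axis first
    return centered @ axes

cloud = np.random.randn(1024, 3) * np.array([3.0, 1.0, 0.3])
print(np.round(np.cov(pca_canonicalize(cloud).T), 2))   # roughly diagonal covariance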
@article{ title = {On Automatic Data Augmentation for 3D Point Cloud Classification}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2112.06029}, id = {0ae43773-f5dd-3dfb-905d-66f2ffd3435c}, created = {2022-08-18T10:51:28.256Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:53:49.148Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Data augmentation is an important technique to reduce overfitting and improve learning performance, but existing works on data augmentation for 3D point cloud data are based on heuristics. In this work, we instead propose to automatically learn a data augmentation strategy using bilevel optimization. An augmentor is designed in a similar fashion to a conditional generator and is optimized by minimizing a base model's loss on a validation set when the augmented input is used for training the model. This formulation provides a more principled way to learn data augmentation on 3D point clouds. We evaluate our approach on standard point cloud classification tasks and a more challenging setting with pose misalignment between training and validation/test sets. The proposed strategy achieves competitive performance on both tasks and we provide further insight into the augmentor's ability to learn the validation set distribution.}, bibtype = {article}, author = {Zhang, Wanyue and Xu, Xun and Liu, Fayao and Zhang, Le and Foo, Chuan-Sheng} }
@article{ title = {DeepUME: Learning the Universal Manifold Embedding for Robust Point Cloud Registration}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2112.09938}, id = {13038110-a185-3bb5-87c7-09d40681e434}, created = {2022-08-18T10:51:28.260Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:53:49.139Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Registration of point clouds related by rigid transformations is one of the fundamental problems in computer vision. However, a solution to the practical scenario of aligning sparsely and differently sampled observations in the presence of noise is still lacking. We approach registration in this scenario with a fusion of the closed-form Universal Mani-fold Embedding (UME) method and a deep neural network. The two are combined into a single unified framework, named DeepUME, trained end-to-end and in an unsupervised manner. To successfully provide a global solution in the presence of large transformations, we employ an SO(3)-invariant coordinate system to learn both a joint-resampling strategy of the point clouds and SO(3)-invariant features. These features are then utilized by the geometric UME method for transformation estimation. The parameters of DeepUME are optimized using a metric designed to overcome an ambiguity problem emerging in the registration of symmetric shapes, when noisy scenarios are considered. We show that our hybrid method outperforms state-of-the-art registration methods in various scenarios, and generalizes well to unseen data sets. Our code is publicly available.}, bibtype = {article}, author = {Lang, Natalie and Francos, Joseph M.} }
@article{ title = {Two Heads are Better than One: Geometric-Latent Attention for Point Cloud Classification and Segmentation}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2111.00231}, id = {23a9e521-7d54-31b7-bc0a-31ba054832da}, created = {2022-08-18T10:51:28.268Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:53:49.134Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {We present an innovative two-headed attention layer that combines geometric and latent features to segment a 3D scene into semantically meaningful subsets. Each head combines local and global information, using either the geometric or latent features, of a neighborhood of points and uses this information to learn better local relationships. This Geometric-Latent attention layer (Ge-Latto) is combined with a sub-sampling strategy to capture global features. Our method is invariant to permutation thanks to the use of shared-MLP layers, and it can also be used with point clouds with varying densities because the local attention layer does not depend on the neighbor order. Our proposal is simple yet robust, which allows it to achieve competitive results in the ShapeNetPart and ModelNet40 datasets, and the state-of-the-art when segmenting the complex dataset S3DIS, with 69.2% IoU on Area 5, and 89.7% overall accuracy using K-fold cross-validation on the 6 areas.}, bibtype = {article}, author = {Cuevas-Velasquez, Hanz and Gallego, Antonio Javier and Fisher, Robert B.} }
@article{ title = {Multi-view 3D Reconstruction with Transformers}, type = {article}, year = {2021}, pages = {5702-5711}, id = {b6154851-8cc1-3fb2-95f7-3fe9420298a3}, created = {2022-08-18T10:51:28.274Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T11:25:09.339Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {6d08e913-c810-48b8-98aa-c6e35d83017e,2a64087a-cd32-494a-8140-2abf0b1356c6}, private_publication = {false}, abstract = {Deep CNN-based methods have so far achieved the state of the art results in multi-view 3D object reconstruction. Despite the considerable progress, the two core modules of these methods - view feature extraction and multi-view fusion, are usually investigated separately, and the relations among multiple input views are rarely explored. Inspired by the recent great success in Transformer models, we reformulate the multi-view 3D reconstruction as a sequence-to-sequence prediction problem and propose a framework named 3D Volume Transformer. Unlike previous CNN-based methods using a separate design, we unify the feature extraction and view fusion in a single Transformer network. A natural advantage of our design lies in the exploration of view-to-view relationships using self-attention among multiple unordered inputs. On ShapeNet - a large-scale 3D reconstruction benchmark, our method achieves a new state-of-the-art accuracy in multi-view reconstruction with fewer parameters (70% less) than CNN-based methods. Experimental results also suggest the strong scaling capability of our method. Our code will be made publicly available.}, bibtype = {article}, author = {Wang, Dan and Cui, Xinrui and Chen, Xun and Zou, Zhengxia and Shi, Tianyang and Salcudean, Septimiu and Wang, Z. Jane and Ward, Rabab}, doi = {10.1109/ICCV48922.2021.00567}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {Fooling LiDAR Perception via Adversarial Trajectory Perturbation}, type = {article}, year = {2021}, pages = {7878-7887}, id = {e2ee21cd-6241-3d90-959f-a411e40a3a74}, created = {2022-08-18T10:51:28.385Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T11:25:07.199Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {13ed0c6c-c963-4099-91df-77a297ea5770,db36ed60-3b58-424a-b9a4-a9c7322975f3}, private_publication = {false}, abstract = {LiDAR point clouds collected from a moving vehicle are functions of its trajectories, because the sensor motion needs to be compensated to avoid distortions. When autonomous vehicles are sending LiDAR point clouds to deep networks for perception and planning, could the motion compensation consequently become a wide-open backdoor in those networks, due to both the adversarial vulnerability of deep learning and GPS-based vehicle trajectory estimation that is susceptible to wireless spoofing? We demonstrate such possibilities for the first time: instead of directly attacking point cloud coordinates which requires tampering with the raw LiDAR readings, only adversarial spoofing of a self-driving car's trajectory with small perturbations is enough to make safety-critical objects undetectable or detected with incorrect positions. Moreover, polynomial trajectory perturbation is developed to achieve a temporally-smooth and highly-imperceptible attack. Extensive experiments on 3D object detection have shown that such attacks not only lower the performance of the state-of-the-art detectors effectively, but also transfer to other detectors, raising a red flag for the community. The code is available on https://ai4ce.github.io/FLAT/.}, bibtype = {article}, author = {Li, Yiming and Wen, Congcong and Juefei-Xu, Felix and Feng, Chen}, doi = {10.1109/ICCV48922.2021.00780}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {VIN: Voxel-based Implicit Network for Joint 3D Object Detection and Segmentation for Lidars}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2107.02980}, id = {f3f19d1e-2d14-3930-9150-b1020aac9002}, created = {2022-08-18T10:51:28.407Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:51:32.460Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {A unified neural network structure is presented for joint 3D object detection and point cloud segmentation in this paper. We leverage rich supervision from both detection and segmentation labels rather than using just one of them. In addition, an extension based on single-stage object detectors is proposed based on the implicit function widely used in 3D scene and object understanding. The extension branch takes the final feature map from the object detection module as input, and produces an implicit function that generates semantic distribution for each point for its corresponding voxel center. We demonstrated the performance of our structure on nuScenes-lidarseg, a large-scale outdoor dataset. Our solution achieves competitive results against state-of-the-art methods in both 3D object detection and point cloud segmentation with little additional computation load compared with object detection solutions. The capability of efficient weakly supervision semantic segmentation of the proposed method is also validated by experiments.}, bibtype = {article}, author = {Zhong, Yuanxin and Zhu, Minghan and Peng, Huei} }
@article{ title = {Multi-Modality Task Cascade for 3D Object Detection}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2107.04013}, id = {89cdca43-d547-30b0-bdee-9651e5951668}, created = {2022-08-18T10:51:28.419Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:51:35.480Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Point clouds and RGB images are naturally complementary modalities for 3D visual understanding - the former provides sparse but accurate locations of points on objects, while the latter contains dense color and texture information. Despite this potential for close sensor fusion, many methods train two models in isolation and use simple feature concatenation to represent 3D sensor data. This separated training scheme results in potentially sub-optimal performance and prevents 3D tasks from being used to benefit 2D tasks that are often useful on their own. To provide a more integrated approach, we propose a novel Multi-Modality Task Cascade network (MTC-RCNN) that leverages 3D box proposals to improve 2D segmentation predictions, which are then used to further refine the 3D boxes. We show that including a 2D network between two stages of 3D modules significantly improves both 2D and 3D task performance. Moreover, to prevent the 3D module from over-relying on the overfitted 2D predictions, we propose a dual-head 2D segmentation training and inference scheme, allowing the 2nd 3D module to learn to interpret imperfect 2D segmentation predictions. Evaluating our model on the challenging SUN RGB-D dataset, we improve upon state-of-the-art results of both single modality and fusion networks by a large margin (+3.8 mAP@0.5). Code will be released at https://github.com/Divadi/MTC_RCNN.}, bibtype = {article}, author = {Park, Jinhyung and Weng, Xinshuo and Man, Yunze and Kitani, Kris} }
@article{ title = {Self-Supervised Point Cloud Completion via Inpainting}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2111.10701}, id = {172063bb-e8bc-3cfd-b52d-b4f569146a8e}, created = {2022-08-18T10:53:48.475Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T11:25:09.676Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {6d08e913-c810-48b8-98aa-c6e35d83017e,2a64087a-cd32-494a-8140-2abf0b1356c6}, private_publication = {false}, abstract = {When navigating in urban environments, many of the objects that need to be tracked and avoided are heavily occluded. Planning and tracking using these partial scans can be challenging. The aim of this work is to learn to complete these partial point clouds, giving us a full understanding of the object's geometry using only partial observations. Previous methods achieve this with the help of complete, ground-truth annotations of the target objects, which are available only for simulated datasets. However, such ground truth is unavailable for real-world LiDAR data. In this work, we present a self-supervised point cloud completion algorithm, PointPnCNet, which is trained only on partial scans without assuming access to complete, ground-truth annotations. Our method achieves this via inpainting. We remove a portion of the input data and train the network to complete the missing region. As it is difficult to determine which regions were occluded in the initial cloud and which were synthetically removed, our network learns to complete the full cloud, including the missing regions in the initial partial cloud. We show that our method outperforms previous unsupervised and weakly-supervised methods on both the synthetic dataset, ShapeNet, and real-world LiDAR dataset, Semantic KITTI.}, bibtype = {article}, author = {Mittal, Himangi and Okorn, Brian and Jangid, Arpit and Held, David} }
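Note: the inpainting-style self-supervision described above hinges on synthetically removing a region of an already partial scan and training the network to predict the full cloud. Only that removal step is sketched below; the completion network and losses are omitted, and the helper function is hypothetical, not part of PointPnCNet.

import numpy as np

def remove_random_region(points, fraction=0.25):
    # Drop the `fraction` of points closest to a randomly chosen seed point,
    # mimicking an additional occluded region in the partial scan.
    seed = points[np.random.randint(len(points))]
    dist = np.linalg.norm(points - seed, axis=1)
    keep = np.argsort(dist)[int(fraction * len(points)):]
    return points[keep]

partial_scan = np.random.randn(2048, 3)
masked_input = remove_random_region(partial_scan)
print(partial_scan.shape, masked_input.shape)  # (2048, 3) (1536, 3)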
@article{ title = {Adversarial Robustness Comparison of Vision Transformer and MLP-Mixer to CNNs}, type = {article}, year = {2021}, pages = {1-16}, websites = {http://arxiv.org/abs/2110.02797}, id = {c310377b-b68d-3775-87d6-402a4f9e6208}, created = {2022-08-18T10:53:48.488Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:53:57.679Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Convolutional Neural Networks (CNNs) have become the de facto gold standard in computer vision applications in the past years. Recently, however, new model architectures have been proposed challenging the status quo. The Vision Transformer (ViT) relies solely on attention modules, while the MLP-Mixer architecture substitutes the self-attention modules with Multi-Layer Perceptrons (MLPs). Despite their great success, CNNs have been widely known to be vulnerable to adversarial attacks, causing serious concerns for security-sensitive applications. Thus, it is critical for the community to know whether the newly proposed ViT and MLP-Mixer are also vulnerable to adversarial attacks. To this end, we empirically evaluate their adversarial robustness under several adversarial attack setups and benchmark them against the widely used CNNs. Overall, we find that the two architectures, especially ViT, are more robust than their CNN models. Using a toy example, we also provide empirical evidence that the lower adversarial robustness of CNNs can be partially attributed to their shift-invariant property. Our frequency analysis suggests that the most robust ViT architectures tend to rely more on low-frequency features compared with CNNs. Additionally, we have an intriguing finding that MLP-Mixer is extremely vulnerable to universal adversarial perturbations.}, bibtype = {article}, author = {Benz, Philipp and Ham, Soomin and Zhang, Chaoning and Karjauv, Adil and Kweon, In So} }
@article{ title = {Self-Supervised Pretraining of 3D Features on any Point-Cloud}, type = {article}, year = {2021}, pages = {10232-10243}, id = {17adf16e-0e21-374d-a286-b891a8ec5982}, created = {2022-08-18T10:53:48.615Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:54:46.690Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Pretraining on large labeled datasets is a prerequisite to achieve good performance in many computer vision tasks like image recognition, video understanding etc. However, pretraining is not widely used for 3D recognition tasks where state-of-the-art methods train models from scratch. A primary reason is the lack of large annotated datasets because 3D data labelling is time-consuming. Recent work shows that self-supervised learning is useful to pretrain models in 3D but requires multi-view data and point correspondences. We present a simple self-supervised pretraining method that can work with single-view depth scans acquired by varied sensors, without 3D registration and point correspondences. We pretrain standard point cloud and voxel based model architectures, and show that joint pretraining further improves performance. We evaluate our models on 9 benchmarks for object detection, semantic segmentation, and object classification, where they achieve state-of-the-art results. Most notably, we set a new state-of-the-art for object detection on ScanNet (69.0% mAP) and SUNRGBD (63.5% mAP). Our pretrained models are label efficient and improve performance for classes with few examples.}, bibtype = {article}, author = {Zhang, Zaiwei and Girdhar, Rohit and Joulin, Armand and Misra, Ishan}, doi = {10.1109/ICCV48922.2021.01009}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {Imperceptible Transfer Attack and Defense on 3D Point Cloud Classification}, type = {article}, year = {2021}, volume = {14}, websites = {http://arxiv.org/abs/2111.10990}, publisher = {IEEE}, id = {5c1a1c6e-2055-33e4-b7d3-e93584007f46}, created = {2022-08-18T10:53:48.620Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T11:25:07.019Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {8f8f5505-1a28-42dd-a82c-92b5738465f1,13ed0c6c-c963-4099-91df-77a297ea5770,db36ed60-3b58-424a-b9a4-a9c7322975f3}, private_publication = {false}, abstract = {Although many efforts have been made into attack and defense on the 2D image domain in recent years, few methods explore the vulnerability of 3D models. Existing 3D attackers generally perform point-wise perturbation over point clouds, resulting in deformed structures or outliers, which is easily perceivable by humans. Moreover, their adversarial examples are generated under the white-box setting, which frequently suffers from low success rates when transferred to attack remote black-box models. In this paper, we study 3D point cloud attacks from two new and challenging perspectives by proposing a novel Imperceptible Transfer Attack (ITA): 1) Imperceptibility: we constrain the perturbation direction of each point along its normal vector of the neighborhood surface, leading to generated examples with similar geometric properties and thus enhancing the imperceptibility. 2) Transferability: we develop an adversarial transformation model to generate the most harmful distortions and enforce the adversarial examples to resist it, improving their transferability to unknown black-box models. Further, we propose to train more robust black-box 3D models to defend against such ITA attacks by learning more discriminative point cloud representations. Extensive evaluations demonstrate that our ITA attack is more imperceptible and transferable than state-of-the-arts and validate the superiority of our defense strategy.}, bibtype = {article}, author = {Liu, Daizong and Hu, Wei}, doi = {10.1109/TPAMI.2022.3193449}, number = {8} }
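Note: the imperceptibility constraint described above restricts each point's adversarial step to the direction of its estimated surface normal. The snippet below sketches that projection with a simple PCA normal estimate; the attack's optimization loop and the transferability component are not reproduced, and all names and parameters are illustrative assumptions.

import numpy as np

def estimate_normal(points, idx, k=16):
    # PCA normal: eigenvector of the local covariance with the smallest eigenvalue.
    d = np.linalg.norm(points - points[idx], axis=1)
    nbrs = points[np.argsort(d)[:k]]
    cov = np.cov((nbrs - nbrs.mean(axis=0)).T)
    return np.linalg.eigh(cov)[1][:, 0]

def perturb_along_normals(points, grad, eps=0.01):
    # Keep only the component of the attack step that lies along each point's normal.
    out = points.copy()
    for i in range(len(points)):
        n = estimate_normal(points, i)
        out[i] += eps * np.sign(grad[i] @ n) * n
    return out

cloud = np.random.randn(256, 3)
adv = perturb_along_normals(cloud, np.random.randn(256, 3))
print(np.abs(adv - cloud).max())  # each point moves by at most eps along its normal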
@article{ title = {Local and Global Point Cloud Reconstruction for 3D Hand Pose Estimation}, type = {article}, year = {2021}, pages = {1-15}, websites = {http://arxiv.org/abs/2112.06389}, id = {3b1ccf0a-b63e-3fbe-84e9-9b029fc0ec65}, created = {2022-08-18T10:53:48.643Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:54:04.422Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {This paper addresses the 3D point cloud reconstruction and 3D pose estimation of the human hand from a single RGB image. To that end, we present a novel pipeline for local and global point cloud reconstruction using a 3D hand template while learning a latent representation for pose estimation. To demonstrate our method, we introduce a new multi-view hand posture dataset to obtain complete 3D point clouds of the hand in the real world. Experiments on our newly proposed dataset and four public benchmarks demonstrate the model's strengths. Our method outperforms competitors in 3D pose estimation while reconstructing realistic-looking complete 3D hand point clouds.}, bibtype = {article}, author = {Yu, Ziwei and Yang, Linlin and Chen, Shicheng and Yao, Angela} }
@article{ title = {Point-set Distances for Learning Representations of 3D Point Clouds}, type = {article}, year = {2021}, pages = {10458-10467}, id = {7599bbdc-6bd0-3ff1-86eb-ff00c24de34d}, created = {2022-08-18T10:53:48.645Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T17:25:32.663Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {034ae31f-a548-45de-8507-3cbbc9e326ad,07e07de9-bcac-4934-a82b-d0aff540e56d}, private_publication = {false}, abstract = {Learning an effective representation of 3D point clouds requires a good metric to measure the discrepancy between two 3D point sets, which is non-trivial due to their irregularity. Most of the previous works resort to using the Chamfer discrepancy or Earth Mover's distance, but those metrics are either ineffective in measuring the differences between point clouds or computationally expensive. In this paper, we conduct a systematic study with extensive experiments on distance metrics for 3D point clouds. From this study, we propose to use sliced Wasserstein distance and its variants for learning representations of 3D point clouds. In addition, we introduce a new algorithm to estimate sliced Wasserstein distance that guarantees that the estimated value is close enough to the true one. Experiments show that the sliced Wasserstein distance and its variants allow the neural network to learn a more efficient representation compared to the Chamfer discrepancy. We demonstrate the efficiency of the sliced Wasserstein metric and its variants on several tasks in 3D computer vision including training a point cloud autoencoder, generative modeling, transfer learning, and point cloud registration.}, bibtype = {article}, author = {Nguyen, Trung and Pham, Quang Hieu and Le, Tam and Pham, Tung and Ho, Nhat and Hua, Binh Son}, doi = {10.1109/ICCV48922.2021.01031}, journal = {Proceedings of the IEEE International Conference on Computer Vision}, number = {Section 4} }
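Note: the sliced Wasserstein distance advocated above reduces to one-dimensional optimal transport along random directions, which is what makes it cheap: project both clouds, sort the projections, and average the resulting 1D distances. The sketch below is illustrative only and does not include the paper's variants or its estimation guarantee.

import numpy as np

def sliced_wasserstein(x, y, n_projections=128, p=2):
    # x, y: (n, 3) point clouds with the same number of points.
    dirs = np.random.randn(n_projections, 3)
    dirs /= np.linalg.norm(dirs, axis=1, keepdims=True)
    px = np.sort(x @ dirs.T, axis=0)   # sorted 1D projections, shape (n, n_projections)
    py = np.sort(y @ dirs.T, axis=0)
    return (np.abs(px - py) ** p).mean() ** (1 / p)

a = np.random.randn(512, 3)
b = np.random.randn(512, 3) + 1.0
print(sliced_wasserstein(a, a), sliced_wasserstein(a, b))  # ~0 vs. clearly larger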
@article{ title = {TSGCNet: Discriminative Geometric Feature Learning with Two-Stream Graph Convolutional Network for 3D Dental Model Segmentation}, type = {article}, year = {2021}, pages = {6695-6704}, id = {2fb60ce6-ca28-35b2-9af8-6b4a4b89a37b}, created = {2022-08-18T10:53:48.648Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:54:43.999Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {The ability to segment teeth precisely from digitized 3D dental models is an essential task in computer-aided orthodontic surgical planning. To date, deep learning based methods have been popularly used to handle this task. State-of-the-art methods directly concatenate the raw attributes of 3D inputs, namely coordinates and normal vectors of mesh cells, to train a single-stream network for fully-automated tooth segmentation. This, however, has the drawback of ignoring the different geometric meanings provided by those raw attributes. This issue might possibly confuse the network in learning discriminative geometric features and result in many isolated false predictions on the dental model. Against this issue, we propose a two-stream graph convolutional network (TSGCNet) to learn multi-view geometric information from different geometric attributes. Our TSGCNet adopts two graph-learning streams, designed in an input-aware fashion, to extract more discriminative high-level geometric representations from coordinates and normal vectors, respectively. These feature representations learned from the designed two different streams are further fused to integrate the multi-view complementary information for the cell-wise dense prediction task. We evaluate our proposed TSGCNet on a real-patient dataset of dental models acquired by 3D intraoral scanners, and experimental results demonstrate that our method significantly outperforms state-of-the-art methods for 3D shape segmentation.}, bibtype = {article}, author = {Zhang, Lingming and Zhao, Yue and Meng, Deyu and Cui, Zhiming and Gao, Chenqiang and Gao, Xinbo and Lian, Chunfeng and Shen, Dinggang}, doi = {10.1109/CVPR46437.2021.00663}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {DGANet: A dilated graph attention-based network for local feature extraction on 3D point clouds}, type = {article}, year = {2021}, keywords = {3D point clouds,Deep learning,Graph attention mechanism,Local feature extraction}, volume = {13}, id = {62b96fa6-6267-31ca-ac0e-991539308641}, created = {2022-08-18T10:53:48.756Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:54:41.647Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Feature extraction on point clouds is an essential task when analyzing and processing point clouds of 3D scenes. However, there still remains a challenge to adequately exploit local fine-grained features on point cloud data due to its irregular and unordered structure in a 3D space. To alleviate this problem, a Dilated Graph Attention-based Network (DGANet) with a certain feature-learning ability is proposed. Specifically, we first build a local dilated graph-like region for each input point to establish the long-range spatial correlation towards its corresponding neighbors, which allows the proposed network to access a wider range of geometric information of local points with their long-range dependencies. Moreover, by integrating the dilated graph attention module (DGAM) implemented by a novel offset–attention mechanism, the proposed network promises to highlight the differing importance on each edge of the constructed local graph to uniquely learn the discrepancy feature of geometric attributes between the connected point pairs. Finally, all the learned edge attention features are further aggregated, allowing the most significant geometric feature representation of local regions by the graph–attention pooling to fully extract local detailed features for each point. The validation experiments using two challenging benchmark datasets demonstrate the effectiveness and powerful generation ability of our proposed DGANet in both 3D object classification and segmentation tasks.}, bibtype = {article}, author = {Wan, Jie and Xie, Zhong and Xu, Yongyang and Zeng, Ziyin and Yuan, Ding and Qiu, Qinjun}, doi = {10.3390/rs13173484}, journal = {Remote Sensing}, number = {17} }
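Note: one way to read the "local dilated graph-like region" above is a dilated k-nearest-neighbor search: gather k*d nearest neighbors and keep every d-th one, so each point sees a wider spatial context at the same neighborhood size. This is a hedged interpretation for illustration; the attention weighting itself (the DGAM) is not reproduced, and the function name is an assumption.

import numpy as np

def dilated_knn(points, idx, k=8, dilation=2):
    # Indices of a dilated neighborhood around points[idx].
    d = np.linalg.norm(points - points[idx], axis=1)
    order = np.argsort(d)[1:1 + k * dilation]  # skip the query point itself
    return order[::dilation]

cloud = np.random.randn(256, 3)
print(dilated_knn(cloud, idx=0))  # 8 neighbor indices spanning a wider radius than plain kNN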
@article{ title = {Group-Free 3D Object Detection via Transformers}, type = {article}, year = {2021}, pages = {2929-2938}, id = {d4973e16-1792-3328-988f-3549049a3abb}, created = {2022-08-18T10:53:48.769Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:54:38.733Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Recently, directly detecting 3D objects from 3D point clouds has received increasing attention. To extract object representation from an irregular point cloud, existing methods usually take a point grouping step to assign the points to an object candidate so that a PointNet-like network could be used to derive object features from the grouped points. However, the inaccurate point assignments caused by the hand-crafted grouping scheme decrease the performance of 3D object detection. In this paper, we present a simple yet effective method for directly detecting 3D objects from the 3D point cloud. Instead of grouping local points to each object candidate, our method computes the feature of an object from all the points in the point cloud with the help of an attention mechanism in the Transformers [42], where the contribution of each point is automatically learned in the network training. With an improved attention stacking scheme, our method fuses object features in different stages and generates more accurate object detection results. With few bells and whistles, the proposed method achieves state-of-the-art 3D object detection performance on two widely used benchmarks, ScanNet V2 and SUN RGB-D. The code and models are publicly available at https://github.com/zeliu98/Group-Free-3D.}, bibtype = {article}, author = {Liu, Ze and Zhang, Zheng and Cao, Yue and Hu, Han and Tong, Xin}, doi = {10.1109/ICCV48922.2021.00294}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {3D point cloud semantic segmentation toward large-scale unstructured agricultural scene classification}, type = {article}, year = {2021}, keywords = {Deep learning,Point clouds,Scene classification,Semantic segmentation,Unstructured agricultural scene}, pages = {106445}, volume = {190}, websites = {https://doi.org/10.1016/j.compag.2021.106445}, publisher = {Elsevier B.V.}, id = {9e04d092-64ca-397f-8697-47bb28864f05}, created = {2022-08-18T10:53:48.843Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:54:18.136Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {In recent years, with the development of computer vision, deep learning, and artificial intelligence technologies, the popularity of depth sensors and lidar has promoted the rapid development of three-dimensional (3D) point cloud semantic segmentation. The semantic segmentation of 3D point clouds for large-scale unstructured agricultural scenes is important for agricultural robots to perceive their surrounding environment, and for autonomous navigation and positioning and autonomous scene understanding. In this study, the problem of 3D point cloud semantic segmentation for large-scale unstructured agricultural scenes was studied. By improving the neural network structure of RandLA-Net, a deeper 3D point cloud semantic segmentation neural network model for large-scale unstructured agricultural scenes was built, and good experimental results were obtained. The local feature aggregation module in RandLA-Net was integrated and improved to achieve 3D point cloud semantic segmentation for large-scale unstructured agricultural scenes. To test the influence of the 3D point cloud sampling algorithm on the overall accuracy (OA) and mean intersection-over-union (mIoU) of semantic segmentation, the random sampling algorithm and farthest point sampling algorithm were used to build two models with the same neural network structure. The test results show that the sampling algorithm has little effect on the OA and mIoU of 3D point cloud semantic segmentation, and the final result depends mainly on the extraction of 3D point cloud features. In addition, two different Semantic3D datasets were used to test the effect of the datasets on the generalization ability of the model, and the results showed that the datasets had an important effect on the neural network model.}, bibtype = {article}, author = {Chen, Yi and Xiong, Yingjun and Zhang, Baohua and Zhou, Jun and Zhang, Qian}, doi = {10.1016/j.compag.2021.106445}, journal = {Computers and Electronics in Agriculture}, number = {August} }
@article{ title = {Airborne LiDAR point cloud classification with global-local graph attention convolution neural network}, type = {article}, year = {2021}, keywords = {Airborne LiDAR,Graph attention convolution,ISPRS 3D labeling,Point cloud classification,Point cloud deep learning}, pages = {181-194}, volume = {173}, websites = {https://doi.org/10.1016/j.isprsjprs.2021.01.007}, publisher = {Elsevier B.V.}, id = {d294c11e-c819-3ede-a1f8-b5e66440c279}, created = {2022-08-18T10:53:48.876Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:54:23.513Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Airborne light detection and ranging (LiDAR) plays an increasingly significant role in urban planning, topographic mapping, environmental monitoring, power line detection and other fields thanks to its capability to quickly acquire large-scale and high-precision ground information. To achieve point cloud classification, previous studies proposed point cloud deep learning models that can directly process raw point clouds based on PointNet-like architectures. And some recent works proposed graph convolution neural network based on the inherent topology of point clouds. However, the above point cloud deep learning models only pay attention to exploring local geometric structures, yet ignore global contextual relationships among all points. In this paper, we present a global-local graph attention convolution neural network (GACNN) that can be directly applied to the classification of unstructured 3D point clouds obtained by airborne LiDAR. Specifically, we first introduce a graph attention convolution module that incorporates global contextual information and local structural features. The global attention module examines spatial relationships among all points, while the local attention module can dynamically learn convolution weights with regard to the spatial position of the local neighboring points and reweight the convolution weights by inspecting the density of each local region. Based on the proposed graph attention convolution module, we further design an end-to-end encoder-decoder network, named GACNN, to capture multiscale features of the point clouds and therefore enable more accurate airborne point cloud classification. Experiments on the ISPRS 3D labeling dataset show that the proposed model achieves a new state-of-the-art performance in terms of average F1 score (71.5%) and a satisfying overall accuracy (83.2%). Additionally, experiments further conducted on the 2019 Data Fusion Contest Dataset by comparing with other prevalent point cloud deep learning models demonstrate the favorable generalization capability of the proposed model.}, bibtype = {article}, author = {Wen, Congcong and Li, Xiang and Yao, Xiaojing and Peng, Ling and Chi, Tianhe}, doi = {10.1016/j.isprsjprs.2021.01.007}, journal = {ISPRS Journal of Photogrammetry and Remote Sensing}, number = {January} }
@article{ title = {FinerPCN: High fidelity point cloud completion network using pointwise convolution}, type = {article}, year = {2021}, keywords = {3D point cloud,Deep learning,Point analysis,Point completion network,Shape completion}, pages = {266-276}, volume = {460}, websites = {https://doi.org/10.1016/j.neucom.2021.06.080}, publisher = {Elsevier}, id = {6ed47f61-aa8a-3938-9e06-e1873bc072a1}, created = {2022-08-18T10:53:48.876Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T11:25:09.514Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {6d08e913-c810-48b8-98aa-c6e35d83017e,2a64087a-cd32-494a-8140-2abf0b1356c6}, private_publication = {false}, abstract = {3D scanners often obtain partial point clouds due to occlusion and limitation of viewing angles. Point cloud completion aims at inferring the full shape of an object from an incomplete point set. Existing deep learning models either do not consider local information or easily degrade the sharp details of the input, thereby losing some existing structures. In this paper, we propose a high fidelity point cloud completion network using pointwise convolution, called FinerPCN. FinerPCN generates complete and fine point clouds in a coarse-to-fine manner. FinerPCN consists of two subnetworks: an encoder-decoder for generating a coarse shape and pointwise convolution for refining its local structure. By repeatedly feeding partial input into the second subnetwork, FinerPCN effectively considers local information and alleviates structural blur of input while maintaining global shape. Experimental results show that FinerPCN generates finer detailed completion results than state-of-the-art methods while successfully keeping the shape of the input.}, bibtype = {article}, author = {Chang, Yakun and Jung, Cheolkon and Xu, Yuanquan}, doi = {10.1016/j.neucom.2021.06.080}, journal = {Neurocomputing} }
@article{ title = {HSGAN: Hierarchical Graph Learning for Point Cloud Generation}, type = {article}, year = {2021}, keywords = {GAN,Generative Adversarial Network,gradient penalty,graph learning,machine learning,point cloud generation,self-attention}, pages = {4540-4554}, volume = {30}, id = {54f4f79c-9111-39e5-899f-1bf7f6391293}, created = {2022-08-18T10:53:48.893Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-13T08:14:28.761Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {8f8f5505-1a28-42dd-a82c-92b5738465f1,b6d75013-efe2-4ddc-b3db-65496bd4db9f,a6a80a30-e9a2-486d-8032-eac3fd981996,df1ae3c5-d972-4eac-9fff-d018bc64f4a9,244f8db2-6bd4-47d9-8abf-425a263fd4d1}, private_publication = {false}, abstract = {Point clouds are the most general data representations of real and abstract objects, and have a wide variety of applications in many science and engineering fields. Point clouds also provide the most scalable multi-resolution composition for geometric structures. Although point cloud learning has shown remarkable results in shape estimation and semantic segmentation, the unsupervised generation of 3D object parts still pose significant challenges in the 3D shape understanding problem. We address this problem by proposing a novel Generative Adversarial Network (GAN), named HSGAN, or Hierarchical Self-Attention GAN, with remarkable properties for 3D shape generation. Our generative model takes a random code and hierarchically transforms it into a representation graph by incorporating both Graph Convolution Network (GCN) and self-attention. With embedding the global graph topology in shape generation, the proposed model takes advantage of the latent topological information to fully construct the geometry of 3D object shapes. Different from the existing generative pipelines, our deep learning architecture articulates three significant properties HSGAN effectively deploys the compact latent topology information as a graph representation in the generative learning process and generates realistic point clouds, HSGAN avoids multiple discriminator updates per generator update, and HSGAN preserves the most dominant geometric structures of 3D shapes in the same hierarchical sampling process. We demonstrate the performance of our new approach with both quantitative and qualitative evaluations. We further present a new adversarial loss to maintain the training stability and overcome the potential mode collapse of traditional GANs. Finally, we explore the use of HSGAN as a plug-and-play decoder in the auto-encoding architecture.}, bibtype = {article}, author = {Li, Yushi and Baciu, George}, doi = {10.1109/TIP.2021.3073318}, journal = {IEEE Transactions on Image Processing} }
@article{ title = {Real-Time Volumetric-Semantic Exploration and Mapping: An Uncertainty-Aware Approach}, type = {article}, year = {2021}, pages = {9064-9070}, publisher = {IEEE}, id = {fe70cc72-29b1-3cd4-a945-cdad3f983893}, created = {2022-08-19T11:38:34.703Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T11:25:09.839Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {6d08e913-c810-48b8-98aa-c6e35d83017e,2a64087a-cd32-494a-8140-2abf0b1356c6}, private_publication = {false}, bibtype = {article}, author = {Figueiredo, Rui Pimentel De and Sejersen, Fevre and Hansen, Jakob Grimm and Brand, Martim} }
@article{ title = {Learning Graph Representation with Generative Adversarial Nets}, type = {article}, year = {2021}, keywords = {Graph representation learning,generative adversarial nets,graph softmax}, pages = {3090-3103}, volume = {33}, id = {5eb5a40d-da22-34e3-b6d2-c66d7803393f}, created = {2022-08-19T12:56:19.270Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T11:25:03.415Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Graph representation learning aims to embed each vertex in a graph into a low-dimensional vector space. Existing graph representation learning methods can be classified into two categories: generative models that learn the underlying connectivity distribution in a graph, and discriminative models that predict the probability of edge between a pair of vertices. In this paper, we propose GraphGAN, an innovative graph representation learning framework unifying the above two classes of methods, in which the generative and the discriminative model play a game-Theoretical minimax game. Specifically, for a given vertex, the generative model tries to fit its underlying true connectivity distribution over all other vertices and produces 'fake' samples to fool the discriminative model, while the discriminative model tries to detect whether the sampled vertex is from ground truth or generated by the generative model. With the competition between these two models, both of them can alternately and iteratively boost their performance. Moreover, we propose a novel graph softmax as the implementation of the generative model to overcome the limitations of traditional softmax function, which can be proven satisfying desirable properties of normalization, graph structure awareness, and computational efficiency. Through extensive experiments on real-world datasets, we demonstrate that GraphGAN achieves substantial gains in a variety of applications, including graph reconstruction, link prediction, node classification, recommendation, and visualization, over state-of-The-Art baselines.}, bibtype = {article}, author = {Wang, Hongwei and Wang, Jialin and Wang, Jia and Zhao, Miao and Zhang, Weinan and Zhang, Fuzheng and Li, Wenjie and Xie, Xing and Guo, Minyi}, doi = {10.1109/TKDE.2019.2961882}, journal = {IEEE Transactions on Knowledge and Data Engineering}, number = {8} }
@article{ title = {Mastering Atari, Go, Chess and Shogi by Planning with a Learned Model}, type = {article}, year = {2021}, pages = {1-21}, id = {7dccf841-0a6c-31c7-944a-784b40039423}, created = {2022-09-06T11:37:21.054Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T11:25:10.051Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {6d08e913-c810-48b8-98aa-c6e35d83017e,2a64087a-cd32-494a-8140-2abf0b1356c6}, private_publication = {false}, bibtype = {article}, author = {Schrittwieser, Julian and Antonoglou, Ioannis and Hubert, Thomas and Simonyan, Karen and Sifre, Laurent and Schmitt, Simon and Guez, Arthur and Lockhart, Edward and Hassabis, Demis and Graepel, Thore and Lillicrap, Timothy} }
@article{ title = {Next-best-view regression using a 3D convolutional neural network}, type = {article}, year = {2021}, keywords = {3D modeling,Deep learning,Next-best-view,Object reconstruction,Range sensing}, pages = {1-14}, volume = {32}, websites = {https://doi.org/10.1007/s00138-020-01166-2}, publisher = {Springer Berlin Heidelberg}, id = {49c1967c-1f8f-35a4-b05d-c90239267586}, created = {2022-09-08T11:24:59.063Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T11:25:19.245Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {6d08e913-c810-48b8-98aa-c6e35d83017e,2a64087a-cd32-494a-8140-2abf0b1356c6}, private_publication = {false}, abstract = {Automated three-dimensional (3D) object reconstruction is the task of building a geometric representation of a physical object by means of sensing its surface. Even though new single-view reconstruction techniques can predict the surface, they lead to incomplete models, specially, for non-commons objects such as antique objects or art sculptures. Therefore, to achieve the task’s goals, it is essential to automatically determine the locations where the sensor will be placed so that the surface will be completely observed. This problem is known as the next-best-view problem. In this paper, we propose a data-driven approach to address the problem. The proposed approach trains a 3D convolutional neural network (3D CNN) with previous reconstructions in order to regress the position of the next-best-view. To the best of our knowledge, this is one of the first works that directly infers the next-best-view in a continuous space using a data-driven approach for the 3D object reconstruction task. We have validated the proposed approach making use of two groups of experiments. In the first group, several variants of the proposed architecture are analyzed. Predicted next-best-views were observed to be closely positioned to the ground truth. In the second group of experiments, the proposed approach is requested to reconstruct several unseen objects, namely, objects not considered by the 3D CNN during training nor validation. Coverage percentages of up to 90 % were observed. With respect to current state-of-the-art methods, the proposed approach improves the performance of previous next-best-view classification approaches and it is quite fast in running time (3 frames per second), given that it does not compute the expensive ray tracing required by previous information metrics.}, bibtype = {article}, author = {Vasquez-Gomez, J. Irving and Troncoso, David and Becerra, Israel and Sucar, Enrique and Murrieta-Cid, Rafael}, doi = {10.1007/s00138-020-01166-2}, journal = {Machine Vision and Applications}, number = {2} }
@article{ title = {SE-MD: A Single-encoder multiple-decoder deep network for point cloud generation from 2D images}, type = {article}, year = {2021}, keywords = {2d images,3d convolutional,3d model reconstruction,3d shape generation,point clouds,shapenet}, websites = {http://arxiv.org/abs/2106.15325}, id = {d7f4f9b6-bd85-3082-9b76-a601e02bc716}, created = {2022-09-13T13:57:40.424Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-23T10:14:51.115Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {255910b9-b737-4c31-858e-6de1dca0cdb9,a6a80a30-e9a2-486d-8032-eac3fd981996}, private_publication = {false}, abstract = {3D model generation from single 2D RGB images is a challenging and actively researched computer vision task. Various techniques using conventional network architectures have been proposed for the same. However, the body of research work is limited and there are various issues like using inefficient 3D representation formats, weak 3D model generation backbones, inability to generate dense point clouds, dependence of post-processing for generation of dense point clouds, and dependence on silhouettes in RGB images. In this paper, a novel 2D RGB image to point cloud conversion technique is proposed, which improves the state of art in the field due to its efficient, robust and simple model by using the concept of parallelization in network architecture. It not only uses the efficient and rich 3D representation of point clouds, but also uses a novel and robust point cloud generation backbone in order to address the prevalent issues. This involves using a single-encoder multiple-decoder deep network architecture wherein each decoder generates certain fixed viewpoints. This is followed by fusing all the viewpoints to generate a dense point cloud. Various experiments are conducted on the technique and its performance is compared with those of other state of the art techniques and impressive gains in performance are demonstrated. Code is available at https://github.com/mueedhafiz1982/}, bibtype = {article}, author = {Hafiz, Abdul Mueed and Bhat, Rouf Ul Alam and Parah, Shabir Ahmad and Hassaballah, M.} }
@article{ title = {PU-GCN: Point Cloud Upsampling using Graph Convolutional Networks}, type = {article}, year = {2021}, pages = {11678-11687}, id = {300e8555-d5c5-379d-a678-fe6667e901ac}, created = {2022-09-19T07:34:03.592Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-10-03T13:31:10.334Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,a6a80a30-e9a2-486d-8032-eac3fd981996}, private_publication = {false}, abstract = {The effectiveness of learning-based point cloud upsampling pipelines heavily relies on the upsampling modules and feature extractors used therein. For the point upsampling module, we propose a novel model called NodeShuffle, which uses a Graph Convolutional Network (GCN) to better encode local point information from point neighborhoods. NodeShuffle is versatile and can be incorporated into any point cloud upsampling pipeline. Extensive experiments show how NodeShuffle consistently improves state-of-the-art upsampling methods. For feature extraction, we also propose a new multi-scale point feature extractor, called Inception DenseGCN. By aggregating features at multiple scales, this feature extractor enables further performance gain in the final upsampled point clouds. We combine Inception DenseGCN with NodeShuffle into a new point upsampling pipeline called PU-GCN. PU-GCN sets new state-of-art performance with much fewer parameters and more efficient inference. Our code is publicly available at https://github.com/guochengqian/PU-GCN.}, bibtype = {article}, author = {Qian, Guocheng and Abualshour, Abdulellah and Li, Guohao and Thabet, Ali and Ghanem, Bernard}, doi = {10.1109/CVPR46437.2021.01151}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Learning Progressive Point Embeddings for 3D Point Cloud Generation}, type = {article}, year = {2021}, pages = {10261-10270}, id = {d0c5a913-61e2-34c9-b08d-4f186b9c5d9e}, created = {2022-09-21T09:29:20.057Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-10-03T13:31:10.260Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,a6a80a30-e9a2-486d-8032-eac3fd981996}, private_publication = {false}, abstract = {Generative models for 3D point clouds are extremely important for scene/object reconstruction applications in autonomous driving and robotics. Despite recent success of deep learning-based representation learning, it remains a great challenge for deep neural networks to synthesize or reconstruct high-fidelity point clouds, because of the difficulties in 1) learning effective pointwise representations; and 2) generating realistic point clouds from complex distributions. In this paper, we devise a dual-generators framework for point cloud generation, which generalizes vanilla generative adversarial learning framework in a progressive manner. Specifically, the first generator aims to learn effective point embeddings in a breadth-first manner, while the second generator is used to refine the generated point cloud based on a depth-first point embedding to generate a robust and uniform point cloud. The proposed dual-generators framework thus is able to progressively learn effective point embeddings for accurate point cloud generation. Experimental results on a variety of object categories from the most popular point cloud generation dataset, ShapeNet, demonstrate the state-of-the-art performance of the proposed method for accurate point cloud generation.}, bibtype = {article}, author = {Wen, Cheng and Yu, Baosheng and Tao, Dacheng}, doi = {10.1109/CVPR46437.2021.01013}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Image-based 3d object reconstruction: State-of-the-art and trends in the deep learning era}, type = {article}, year = {2021}, keywords = {3D face,3D human body,3D reconstruction,3D video,CNN,LSTM,SLAM,SfM,deep learning,depth estimation}, pages = {1578-1604}, volume = {43}, id = {623d9908-eec3-3466-a73f-3c742baebc92}, created = {2022-10-03T13:31:09.949Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-12-05T14:13:52.460Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e07bebd1-ae76-40ed-b298-edc5ed896e0b}, private_publication = {false}, abstract = {3D reconstruction is a longstanding ill-posed problem, which has been explored for decades by the computer vision, computer graphics, and machine learning communities. Since 2015, image-based 3D reconstruction using convolutional neural networks (CNN) has attracted increasing interest and demonstrated an impressive performance. Given this new era of rapid evolution, this article provides a comprehensive survey of the recent developments in this field. We focus on the works which use deep learning techniques to estimate the 3D shape of generic objects either from a single or multiple RGB images. We organize the literature based on the shape representations, the network architectures, and the training mechanisms they use. While this survey is intended for methods which reconstruct generic objects, we also review some of the recent works which focus on specific object classes such as human body shapes and faces. We provide an analysis and comparison of the performance of some key papers, summarize some of the open problems in this field, and discuss promising directions for future research.}, bibtype = {article}, author = {Han, Xian Feng and Laga, Hamid and Bennamoun, Mohammed}, doi = {10.1109/TPAMI.2019.2954885}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, number = {5} }
@article{ title = {Single image 3D object reconstruction based on deep learning: A review}, type = {article}, year = {2021}, keywords = {3D shape representation,Computer vision,Deep learning,Single image 3D reconstruction}, pages = {463-498}, volume = {80}, publisher = {Multimedia Tools and Applications}, id = {7a923fe2-3fd6-3701-8333-1fed11fef89f}, created = {2022-10-03T13:31:10.088Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-10-03T13:31:24.321Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e07bebd1-ae76-40ed-b298-edc5ed896e0b}, private_publication = {false}, abstract = {The reconstruction of 3D object from a single image is an important task in the field of computer vision. In recent years, 3D reconstruction of single image using deep learning technology has achieved remarkable results. Traditional methods to reconstruct 3D object from a single image require prior knowledge and assumptions, and the reconstruction object is limited to a certain category or it is difficult to accomplish a good reconstruction from a real image. Although deep learning can solve these problems well with its own powerful learning ability, it also faces many problems. In this paper, we first discuss the challenges faced by applying the deep learning method to reconstruct 3D objects from a single image. Second, we comprehensively review encoders, decoders and training details used in 3D reconstruction of a single image. Then, the common datasets and evaluation metrics of single image 3D object reconstruction in recent years are introduced. In order to analyze the advantages and disadvantages of different 3D reconstruction methods, a series of experiments are used for comparison. In addition, we simply give some related application examples involving 3D reconstruction of a single image. Finally, we summarize this paper and discuss the future directions.}, bibtype = {article}, author = {Fu, Kui and Peng, Jiansheng and He, Qiwen and Zhang, Hanxiao}, doi = {10.1007/s11042-020-09722-8}, journal = {Multimedia Tools and Applications}, number = {1} }
@article{ title = {Leveraging SE(3) Equivariance for Self-Supervised Category-Level Object Pose Estimation}, type = {article}, year = {2021}, pages = {15370-15381}, volume = {19}, id = {b5288764-6542-3a88-8fe4-eec9ee93e6c7}, created = {2023-05-03T13:16:38.933Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:25.306Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Li2021}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Category-level object pose estimation aims to find 6D object poses of previously unseen object instances from known categories without access to object CAD models. To reduce the huge amount of pose annotations needed for categorylevel learning, we propose for the first time a self-supervised learning framework to estimate category-level 6D object pose from single 3D point clouds. During training, our method assumes no ground-truth pose annotations, no CAD models, and no multi-view supervision. The key to our method is to disentangle shape and pose through an invariant shape reconstruction module and an equivariant pose estimation module, empowered by SE(3) equivariant point cloud networks. The invariant shape reconstruction module learns to perform aligned reconstructions, yielding a category-level reference frame without using any annotations. In addition, the equivariant pose estimation module achieves category-level pose estimation accuracy that is comparable to some fully supervised methods. Extensive experiments demonstrate the effectiveness of our approach on both complete and partial depth point clouds from the ModelNet40 benchmark, and on real depth point clouds from the NOCS-REAL 275 dataset. The project page with code and visualizations can be found at: dragonlong.github.io/equi-pose.}, bibtype = {article}, author = {Li, Xiaolong and Weng, Yijia and Yi, Li and Guibas, Leonidas and Abbott, A. Lynn and Song, Shuran and Wang, He}, journal = {Advances in Neural Information Processing Systems}, number = {3} }
@article{ title = {Spectral spherical harmonics discrete ordinate method}, type = {article}, year = {2021}, keywords = {Gaussian beam,SHDOM,three-dimensional radiative transfer models}, pages = {107386}, volume = {258}, websites = {https://doi.org/10.1016/j.jqsrt.2020.107386}, publisher = {Elsevier Ltd}, id = {8c2d3e9e-070e-3723-b9a5-04774f8d2ffd}, created = {2023-05-03T13:16:38.937Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.237Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Doicu2021}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {A new method for modeling the radiative transfer in inhomogeneous three-dimensional media illuminated by a Gaussian beam is described. This approach, called the Spectral Spherical Harmonics Discrete Ordinate Method (SSHDOM), uses the Fourier expansion method to transform the three-dimensional radiative transfer into a one-dimensional equation in the spectral domain, and the Spherical Harmonics Discrete Ordinate Method (SHDOM) for its solution. Specifically, (i) the source function is represented in the spectral domain through a spherical harmonic expansion, (ii) the spectral one-dimensional radiative transfer equation is integrated along discrete ordinates through a spatial grid, and (iii) the solution method is based on the Picard iteration. Both SSHDOM and SHDOM algorithms are implemented in a common computer code.}, bibtype = {article}, author = {Doicu, Adrian and Mishchenko, Michael I. and Efremenko, Dmitry S. and Trautmann, Thomas}, doi = {10.1016/j.jqsrt.2020.107386}, journal = {Journal of Quantitative Spectroscopy and Radiative Transfer} }
@article{ title = {Spherical cap harmonic analysis (SCHA) for characterising the morphology of rough surface patches}, type = {article}, year = {2021}, keywords = {Finite element,Fractal surfaces,Microstructures,Self-affine,Spherical cap harmonics,Surface parameterisation}, pages = {837-856}, volume = {393}, websites = {https://doi.org/10.1016/j.powtec.2021.07.081}, publisher = {The Author(s)}, id = {5406b84e-4688-3a64-a94b-c228428f167f}, created = {2023-05-03T13:16:39.062Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-15T08:10:14.295Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Shaqfa2021}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {We use spherical cap harmonic (SCH) basis functions to analyse and reconstruct the morphology of scanned genus-0 rough surface patches with open edges. We first develop a novel one-to-one conformal mapping algorithm with minimal area distortion for parameterising a surface onto a polar spherical cap with a prescribed half angle. We then show that as a generalisation of the hemispherical harmonic analysis, the SCH analysis provides the most added value for small half angles, i.e., for nominally flat surfaces where the distortion introduced by the parameterisation algorithm is smaller when the surface is projected onto a spherical cap with a small half angle than onto a hemisphere. From the power spectral analysis of the expanded SCH coefficients, we estimate a direction-independent Hurst exponent. We also estimate the wavelengths associated with the orders of the SCH basis functions from the dimensions of the first degree ellipsoidal cap. By windowing the spectral domain, we limit the bandwidth of wavelengths included in a reconstructed surface geometry. This bandlimiting can be used for modifying surfaces, such as for generating finite or discrete element meshes for contact problems. The codes and data developed in this paper are made available under the GNU LGPLv2.1 license.}, bibtype = {article}, author = {Shaqfa, Mahmoud and Choi, Gary P.T. and Beyer, Katrin}, doi = {10.1016/j.powtec.2021.07.081}, journal = {Powder Technology} }
@article{ title = {Steerable 3D Spherical Neurons}, type = {article}, year = {2021}, websites = {http://arxiv.org/abs/2106.13863}, id = {2831707a-3154-3909-a085-0da74f4b8d5a}, created = {2023-05-03T13:16:39.134Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:27.215Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Melnyk2021}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Emerging from low-level vision theory, steerable filters found their counterpart in prior work on steerable convolutional neural networks equivariant to rigid transformations. In our work, we propose a steerable feed-forward learning-based approach that consists of neurons with spherical decision surfaces and operates on point clouds. Such spherical neurons are obtained by conformal embedding of Euclidean space and have recently been revisited in the context of learning representations of point sets. Focusing on 3D geometry, we exploit the isometry property of spherical neurons and derive a 3D steerability constraint. After training spherical neurons to classify point clouds in a canonical orientation, we use a tetrahedron basis to quadruplicate the neurons and construct rotation-equivariant spherical filter banks. We then apply the derived constraint to interpolate the filter bank outputs and, thus, obtain a rotation-invariant network. Finally, we use a synthetic point set and real-world 3D skeleton data to verify our theoretical findings.}, bibtype = {article}, author = {Melnyk, Pavlo and Felsberg, Michael and Wadenbäck, Mårten} }
@article{ title = {Three-dimensional reconstruction of realistic stone-based materials with controllable stone inclusion geometries}, type = {article}, year = {2021}, keywords = {DEM simulation,Overlapping detection,Realistic stone shape,Spherical Harmonic,Stochastic model,Stone inclusion,Stone-based material}, pages = {124240}, volume = {305}, websites = {https://doi.org/10.1016/j.conbuildmat.2021.124240}, publisher = {Elsevier Ltd}, id = {f119f9c0-86f3-3135-a86e-eff7bc7576e3}, created = {2023-05-03T13:16:39.255Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-15T08:10:14.533Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Wang2021}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Stone-based materials are heterogeneous construction materials that mainly consist of coarse stones and fine matrices. When using the numerical simulation tools (FEM, DEM, FDEM) to study the mechanical properties of the stone-based materials, the irregular stone-inclusion geometries cannot be neglected. The main objective of this study is to generate three-dimensional models of realistic stone-based materials with controllable stone-inclusion geometries. To achieve this goal, two major methodologies are adopted: (1) the Spherical Harmonic Transform (SHT) method and Inverse Monte-Carlo (IMC) algorithm are employed to randomly generate the 3D particle model of irregular stones with controllable geometries at three different shape scales, including form, roundness and roughness, and (2) based on the Spherical Harmonic function, an overlapping detection algorithm is proposed to facilitate the rapid allocation of the 3D SHT-based particles. The proposed algorithm can quantitatively control several geometrical features of the generated stone-based materials in an efficient and precise manner. Finally, the application of the proposed approach is demonstrated through the discrete modelling of stone-based materials with different stone contents and stone shapes. The proposed study has the significance to pave a viable pathway for stochastic modelling of stone-based materials pertaining to various construction and manufacture processes.}, bibtype = {article}, author = {Wang, Xiang and Yin, Zhen yu and Zhang, Jun qi and Xiong, Hao and Su, Dong}, doi = {10.1016/j.conbuildmat.2021.124240}, journal = {Construction and Building Materials}, number = {January} }
@article{ title = {Geometric Algebra Attention Networks for Small Point Clouds}, type = {article}, year = {2021}, pages = {1-23}, websites = {http://arxiv.org/abs/2110.02393}, id = {08004e17-30c1-3c9a-b927-795ebc1c7caf}, created = {2023-05-03T13:16:39.715Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.182Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Spellings2021}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Much of the success of deep learning is drawn from building architectures that properly respect underlying symmetry and structure in the data on which they operate - a set of considerations that have been united under the banner of geometric deep learning. Often problems in the physical sciences deal with relatively small sets of points in two- or three-dimensional space wherein translation, rotation, and permutation equivariance are important or even vital for models to be useful in practice. In this work, we present rotation- and permutation-equivariant architectures for deep learning on these small point clouds, composed of a set of products of terms from the geometric algebra and reductions over those products using an attention mechanism. The geometric algebra provides valuable mathematical structure by which to combine vector, scalar, and other types of geometric inputs in a systematic way to account for rotation invariance or covariance, while attention yields a powerful way to impose permutation equivariance. We demonstrate the usefulness of these architectures by training models to solve sample problems relevant to physics, chemistry, and biology.}, bibtype = {article}, author = {Spellings, Matthew} }
@article{ title = {Deep Hierarchical Rotation Invariance Learning with Exact Geometry Feature Representation for Point Cloud Classification}, type = {article}, year = {2021}, pages = {9529-9535}, volume = {2021-May}, publisher = {IEEE}, id = {e6064e13-9333-3ade-b010-4c6162a97cde}, created = {2023-05-03T13:16:40.334Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-15T08:10:15.002Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Lin2021}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Rotation invariance is a crucial property for 3D object classification, which is still a challenging task. State-of-the-art deep learning-based works require a massive amount of data augmentation to tackle this problem. This is however inefficient and classification accuracy suffers a sharp drop in experiments with arbitrary rotations. We introduce a new descriptor that can globally and locally capture the surface geometry properties and is based on a combination of spherical harmonics energy and point feature representation. The proposed descriptor is proven to fulfill the rotation-invariant property. A limited bandwidth spherical harmonics energy descriptor globally describes a 3D shape and its rotation-invariant property is proven by utilizing the properties of a Wigner D-matrix, while the point feature representation captures the local features with a KNN to build the connection to its neighborhood. We propose a new network structure by extending PointNet++ with several adaptations that can hierarchically and efficiently exploit local rotation-invariant features. Extensive experimental results show that our proposed method dramatically outperforms most state-of-the-art approaches on standard rotation-augmented 3D object classification benchmarks as well as in robustness experiments on point perturbation, point density, and partial point clouds.}, bibtype = {article}, author = {Lin, Jianjie and Rickert, Markus and Knoll, Alois}, doi = {10.1109/ICRA48506.2021.9561307}, journal = {Proceedings - IEEE International Conference on Robotics and Automation}, number = {Icra} }
@article{ title = {Spherical Multi-Modal Place Recognition for Heterogeneous Sensor Systems}, type = {article}, year = {2021}, pages = {1743-1750}, volume = {2021-May}, publisher = {IEEE}, id = {11c415c5-b520-3a4b-881d-88da37326322}, created = {2023-05-03T13:16:40.652Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:25.409Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Bernreiter2021}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {In this paper, we propose a robust end-to-end multi-modal pipeline for place recognition where the sensor systems can differ from the map building to the query. Our approach operates directly on images and LiDAR scans without requiring any local feature extraction modules. By projecting the sensor data onto the unit sphere, we learn a multi-modal descriptor of partially overlapping scenes using a spherical convolutional neural network. The employed spherical projection model enables the support of arbitrary LiDAR and camera systems readily without losing information. Loop closure candidates are found using a nearest-neighbor lookup in the embedding space. We tackle the problem of correctly identifying the closest place by correlating the candidates' power spectra, obtaining a confidence value per prospect. Our estimate for the correct place corresponds then to the candidate with the highest confidence. We evaluate our proposal w.r.t. state-of-the-art approaches in place recognition using real-world data acquired using different sensors. Our approach can achieve a recall that is up to 10 % and 5 % higher than for a LiDAR- and vision-based system, respectively, when the sensor setup differs between model training and deployment. Additionally, our place selection can correctly identify up to 95 % matches from the candidate set.}, bibtype = {article}, author = {Bernreiter, Lukas and Ott, Lionel and Nieto, Juan and Siegwart, Roland and Cadena, Cesar}, doi = {10.1109/ICRA48506.2021.9561078}, journal = {Proceedings - IEEE International Conference on Robotics and Automation}, number = {Icra} }
@article{ title = {Fast 3D Acoustic Scattering via Discrete Laplacian Based Implicit Function Encoders}, type = {article}, year = {2021}, pages = {1-16}, id = {6faadd4d-518e-3e3c-b7d3-8e90aea685fd}, created = {2023-05-03T13:16:40.679Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:25.432Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Meng2021a}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, bibtype = {article}, author = {Meng, Hsien Yu and Tang, Zhenyu and Manocha, Dinesh} }
@article{ title = {Spherical harmonics for shape-constrained 3d cell segmentation}, type = {article}, year = {2021}, keywords = {3D Segmentation,Shape-Constrain,Spherical Harmonics}, pages = {792-796}, volume = {2021-April}, id = {247366e8-6f7f-382b-a617-c4b25c1fbdbb}, created = {2023-05-03T13:16:40.799Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.517Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Eschweiler2021}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Recent microscopy imaging techniques allow to precisely analyze cell morphology in 3D image data. To process the vast amount of image data generated by current digitized imaging techniques, automated approaches are demanded more than ever. Segmentation approaches used for morphological analyses, however, are often prone to produce unnaturally shaped predictions, which in conclusion could lead to inaccurate experimental outcomes. In order to minimize further manual interaction, shape priors help to constrain the predictions to the set of natural variations. In this paper, we show how spherical harmonics can be used as an alternative way to inherently constrain the predictions of neural networks for the segmentation of cells in 3D microscopy image data. Benefits and limitations of the spherical harmonic representation are analyzed and final results are compared to other state-of-the-art approaches on two different data sets.}, bibtype = {article}, author = {Eschweiler, Dennis and Rethwisch, Malte and Koppers, Simon and Stegmaier, Johannes}, doi = {10.1109/ISBI48211.2021.9433983}, journal = {Proceedings - International Symposium on Biomedical Imaging}, number = {3} }
@article{ title = {Point-based Acoustic Scattering for Interactive Sound Propagation via Surface Encoding}, type = {article}, year = {2021}, pages = {909-915}, id = {0af586ef-1e5b-33ec-8c01-5cd0392d292e}, created = {2023-05-03T13:16:40.804Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.512Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Meng2021}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {We present a novel geometric deep learning method to compute the acoustic scattering properties of geometric objects. Our learning algorithm uses a point cloud representation of objects to compute the scattering properties and integrates them with ray tracing for interactive sound propagation in dynamic scenes. We use discrete Laplacian-based surface encoders and approximate the neighborhood of each point using a shared multilayer perceptron. We show that our formulation is permutation invariant and present a neural network that computes the scattering function using spherical harmonics. Our approach can handle objects with arbitrary topologies and deforming models, and takes less than 1ms per object on a commodity GPU. We have analyzed the accuracy and perform validation on thousands of unseen 3D objects and highlight the benefits over other point-based geometric deep learning methods. To the best of our knowledge, this is the first real-time learning algorithm that can approximate the acoustic scattering properties of arbitrary objects with high accuracy.}, bibtype = {article}, author = {Meng, Hsien Yu and Tang, Zhenyu and Manocha, Dinesh}, doi = {10.24963/ijcai.2021/126}, journal = {IJCAI International Joint Conference on Artificial Intelligence} }
@article{ title = {Learning acoustic scattering fields for dynamic interactive sound propagation}, type = {article}, year = {2021}, keywords = {Computing methodologies-Computer graphics-Graphics}, pages = {835-844}, id = {9801dc07-6577-3746-996f-aa865918d79a}, created = {2023-05-03T13:16:40.826Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.396Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Tang2021}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {We present a novel hybrid sound propagation algorithm for interactive applications. Our approach is designed for dynamic scenes and uses a neural network-based learned scattered field representation along with ray tracing to generate specular, diffuse, diffraction, and occlusion effects efficiently. We use geometric deep learning to approximate the acoustic scattering field using spherical harmonics. We use a large 3D dataset for training, and compare its accuracy with the ground truth generated using an accurate wave-based solver. The additional overhead of computing the learned scattered field at runtime is small and we demonstrate its interactive performance by generating plausible sound effects in dynamic scenes with diffraction and occlusion effects. We demonstrate the perceptual benefits of our approach based on an audio-visual user study.}, bibtype = {article}, author = {Tang, Zhenyu and Meng, Hsien Yu and Manocha, Dinesh}, doi = {10.1109/VR50410.2021.00111}, journal = {Proceedings - 2021 IEEE Conference on Virtual Reality and 3D User Interfaces, VR 2021} }
@article{ title = {Survey on the View Planning Problem for Reverse Engineering and Automated Control Applications}, type = {article}, year = {2021}, keywords = {3D reconstruction,Measurement,Object inspection,Optical sensor,Point cloud,View planning}, pages = {103094}, volume = {141}, websites = {https://doi.org/10.1016/j.cad.2021.103094}, publisher = {Elsevier Ltd}, id = {28063d9d-98d5-335d-83c6-4d2c5ffa9491}, created = {2023-07-07T07:53:43.914Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-08-08T11:39:19.266Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e4a1ea0f-69ae-4053-94cc-503201fc6c67}, private_publication = {false}, abstract = {At present, optical sensors are being widely used to realize high quality control or reverse engineering of products, systems, buildings, environments or human bodies. Although the intrinsic characteristics of such breakthrough technologies may vary, ensuring complete acquisition relies on the definition of the optimal acquisition planning. To this end, the view planning problem (VPP) must be solved to automatically determine the optimal positions and/or trajectories of the acquisition devices to fully cover the part to be digitized. Such an automatization of the entire acquisition process is of considerably interest in the context of Industry 4.0. The aim of this paper is to review the state of the art works addressing the view planning problem and to identify the future challenges and possible research directions. First, the paper introduces a set of criteria to analyze the available methods, grouped into several macrocategories. The categories are presented and formalized to clearly understand the backbone and similarities of the grouped methods. Second, the paper describes and characterizes the available methods, based on their analysis according to the adopted criteria. The results of this extensive analysis clearly highlight the open issues and future challenges.}, bibtype = {article}, author = {Peuzin-Jubert, Manon and Polette, Arnaud and Nozais, Dominique and Mari, Jean Luc and Pernot, Jean Philippe}, doi = {10.1016/j.cad.2021.103094}, journal = {CAD Computer Aided Design} }
@misc{ title = {Low Power Processors and Image Sensors for Vision-Based IoT Devices: A Review}, type = {misc}, year = {2021}, source = {IEEE Sensors Journal}, keywords = {Internet of Things,image sensor,low power,machine vision,processor}, pages = {1172-1186}, volume = {21}, issue = {2}, month = {1}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, day = {15}, id = {372104ec-fb2d-381f-987a-e34f771ab0ae}, created = {2023-11-07T09:49:41.472Z}, file_attached = {true}, profile_id = {78e67dcc-28e6-3300-a4ed-85434b13f01f}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-12-06T13:12:34.437Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {1bffc8fa-4e6e-48c8-b694-323af7fbd0e8}, private_publication = {false}, abstract = {With the advancements of the Internet of Things (IoT) technology, applications of battery powered machine vision based IoT devices is rapidly growing. While numerous research works are being conducted to develop low power hardware solutions for IoT devices, image capture and image processing remain high power demanding processes leading to a short battery life. However, the power consumption of the machine vision based IoT devices can be minimized by the careful optimization of the hardware components that are used is these devices. In this article, we present a review of low power machine vision hardware components for the IoT applications. A guide to selecting the optimum processors and image sensors for a given battery powered machine vision based IoT device is presented. Next, the factors that must be considered when selecting processors and image sensors for a given IoT application are discussed, and selection criteria for the processors and image sensors are established. Then, the current commercially available hardware components are reviewed in accordance with the established selection criteria. Finally, the research trends in the field of battery powered machine vision based IoT devices are discussed, and the potential future research directions in the field are presented.}, bibtype = {misc}, author = {Maheepala, Malith and Joordens, Matthew A. and Kouzani, Abbas Z.}, doi = {10.1109/JSEN.2020.3015932} }
@article{ title = {Embedded Intelligence on FPGA: Survey, Applications and Challenges}, type = {article}, year = {2021}, keywords = {FPGA,artificial intelligence,deep learning,embedded intelligence,embedded systems,neural networks}, pages = {895}, volume = {10}, websites = {https://doi.org/10.3390/electronics}, id = {e554b3d6-6193-346e-9b35-d72b40f1b16c}, created = {2023-11-07T09:56:35.794Z}, file_attached = {true}, profile_id = {78e67dcc-28e6-3300-a4ed-85434b13f01f}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-12-06T13:13:08.752Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {1bffc8fa-4e6e-48c8-b694-323af7fbd0e8}, private_publication = {false}, abstract = {Embedded intelligence (EI) is an emerging research field and has the objective to incorporate machine learning algorithms and intelligent decision-making capabilities into mobile and embedded devices or systems. There are several challenges to be addressed to realize efficient EI implementations in hardware such as the need for: (1) high computational processing; (2) low power consumption (or high energy efficiency); and (3) scalability to accommodate different network sizes and topologies. In recent years, an emerging hardware technology which has demonstrated strong potential and capabilities for EI implementations is the FPGA (field programmable gate array) technology. This paper presents an overview and review of embedded intelligence on FPGA with a focus on applications, platforms and challenges. There are four main classification and thematic descriptors which are reviewed and discussed in this paper for EI: (1) EI techniques including machine learning and neural networks, deep learning, expert systems, fuzzy intelligence, swarm intelligence, self-organizing map (SOM) and extreme learning; (2) applications for EI including object detection and recognition, indoor localization and surveillance monitoring, and other EI applications; (3) hardware and platforms for EI; and (4) challenges for EI. The paper aims to introduce interested researchers to this area and motivate the development of practical FPGA solutions for EI deployment.}, bibtype = {article}, author = {Seng, K P and Lee, P J}, doi = {10.3390/electronics}, journal = {Electronics} }
@article{ title = {Revisiting Point Cloud Shape Classification with a Simple and Effective Baseline}, type = {article}, year = {2021}, websites = {https://github.com/}, id = {591937b5-b69d-3fa4-8e4a-52e70623987f}, created = {2024-08-29T12:58:09.643Z}, accessed = {2024-08-29}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2024-08-29T12:58:38.078Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8b2deb8c-cbcd-4d63-ade5-3dac1fc50168}, private_publication = {false}, abstract = {Processing point cloud data is an important component of many real-world systems. As such, a wide variety of point-based approaches have been proposed, reporting steady benchmark improvements over time. We study the key ingredients of this progress and uncover two critical results. First, we find that auxiliary factors like different evaluation schemes, data augmentation strategies, and loss functions, which are independent of the model architecture, make a large difference in performance. The differences are large enough that they obscure the effect of architecture. When these factors are controlled for, PointNet++, a relatively older network, performs competitively with recent methods. Second, a very simple projection-based method, which we refer to as SimpleView, performs surprisingly well. It achieves on par or better results than sophisticated state-of-the-art methods on ModelNet40 while being half the size of PointNet++. It also outperforms state-of-the-art methods on ScanObjectNN, a real-world point cloud benchmark, and demonstrates better cross-dataset generalization. Code is available at https://github.com/princeton-vl/SimpleView.}, bibtype = {article}, author = {Goyal, Ankit and Law, Hei and Liu, Bowei and Newell, Alejandro and Deng, Jia} }
@article{ title = {From planes to corners: Multi-purpose primitive detection in unorganized 3D point clouds}, type = {article}, year = {2020}, keywords = {Object detection,computational geometry,range sensing,segmentation and categorization}, pages = {1764-1771}, volume = {5}, id = {77cff1a2-473c-3179-8b7a-8ece82645688}, created = {2020-09-14T08:14:53.841Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-26T12:19:40.020Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Sommer2020}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f,16688d52-1757-4ef4-badb-f53b700252a9,591145a4-49d3-4baf-a2cc-a1f3832f0e3e,beecb55d-84d0-48a2-a344-e50cfe559467,4f36a0a5-b08a-4f70-b020-4daf83cb0507}, private_publication = {false}, abstract = {We propose a new method for segmentation-free joint estimation of orthogonal planes, their intersection lines, relationship graph and corners lying at the intersection of three orthogonal planes. Such unified scene exploration under orthogonality allows for multitudes of applications such as semantic plane detection or local and global scan alignment, which in turn can aid robot localization or grasping tasks. Our two-stage pipeline involves a rough yet joint estimation of orthogonal planes followed by a subsequent joint refinement of plane parameters respecting their orthogonality relations. We form a graph of these primitives, paving the way to the extraction of further reliable features: lines and corners. Our experiments demonstrate the validity of our approach in numerous scenarios from wall detection to 6D tracking, both on synthetic and real data.}, bibtype = {article}, author = {Sommer, Christiane and Sun, Yumin and Guibas, Leonidas and Cremers, Daniel and Birdal, Tolga}, doi = {10.1109/LRA.2020.2969936}, journal = {IEEE Robotics and Automation Letters}, number = {2} }
@article{ title = {Geometric Primitives in LiDAR Point Clouds: A Review}, type = {article}, year = {2020}, keywords = {Edges,geometric primitives,light detection and ranging (lidar),lines,planes,point clouds,regularization,skeletons,volumetric shapes}, pages = {685-707}, volume = {13}, publisher = {IEEE}, id = {29bc7c93-0a3a-3cb5-8f58-d7368cfa1cc7}, created = {2020-09-14T08:14:53.877Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-25T14:53:38.134Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f,16688d52-1757-4ef4-badb-f53b700252a9,591145a4-49d3-4baf-a2cc-a1f3832f0e3e,07e07de9-bcac-4934-a82b-d0aff540e56d}, private_publication = {false}, abstract = {To the best of our knowledge, the most recent light detection and ranging (lidar)-based surveys have been focused only on specific applications such as reconstruction and segmentation, as well as data processing techniques based on a specific platform, e.g., mobile laser. However, in this article, lidar point clouds are understood from a new and universal perspective, i.e., geometric primitives embedded in versatile objects in the physical world. In lidar point clouds, the basic unit is the point coordinate. Geometric primitives that consist of a group of discrete points may be viewed as one kind of abstraction and representation of lidar data at the entity level. We categorize geometric primitives into two classes: Shape primitives, e.g., lines, surfaces, and volumetric shapes, and structure primitives, represented by skeletons and edges. In recent years, many efforts from different communities, such as photogrammetry, computer vision, and computer graphics, have been made to finalize geometric primitive detection, regularization, and in-depth applications. Interpretations of geometric primitives from multiple disciplines try to convey the significance of geometric primitives, the latest processing techniques regarding geometric primitives, and their potential possibilities in the context of lidar point clouds. To this end, primitive-based applications are reviewed with an emphasis on object extraction and reconstruction to clearly show the significances of this article. Next, we survey and compare methods for geometric primitive extraction and then review primitive regularization methods that add real-world constrains to initial primitives. Finally, we summarize the challenges, expected applications, and describe possible future for primitive extraction methods that can achieve globally optimal results efficiently, even with disorganized, uneven, noisy, incomplete, and large-scale lidar point clouds.}, bibtype = {article}, author = {Xia, Shaobo and Chen, Dong and Wang, Ruisheng and Li, Jonathan and Zhang, Xinchang}, doi = {10.1109/JSTARS.2020.2969119}, journal = {IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing} }
@article{ title = {A review on deep learning approaches for 3d data representations in retrieval and classifications}, type = {article}, year = {2020}, keywords = {3D data representation,3D deep learning,3D models dataset,Classification,Computer vision,Retrieval}, pages = {57566-57593}, volume = {8}, id = {666cc992-7b98-3881-842a-bab78b9176c6}, created = {2020-09-14T08:14:53.886Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-03-31T07:21:16.792Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Gezawa2020}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c,2a0475f2-facb-4360-917f-00c5f8541f47}, private_publication = {false}, abstract = {Deep learning approach has been used extensively in image analysis tasks. However, implementing the methods in 3D data is a bit complex because most of the previously designed deep learning architectures used 1D or 2D as input. In this work, the performance of deep learning methods on different 3D data representations has been reviewed. Based on the categorization of the different 3D data representations proposed in this paper, the importance of choosing a suitable 3D data representation which depends on simplicity, usability, and efficiency has been highlighted. Furthermore, the origin and contents of the major 3D datasets were discussed in detail. Due to growing interest in 3D object retrieval and classification tasks, the performance of different 3D object retrieval and classification on ModelNet40 dataset were compared. According to the findings in this work, multi views methods surpass voxel-based methods and with increased layers and enough data augmentation the performance can still be increased. Therefore, it can be concluded that deep learning together with a suitable 3D data representation gives an effective approach for improving the performance of 3D shape analysis. Finally, some possible directions for future researches were suggested.}, bibtype = {article}, author = {Gezawa, Abubakar Sulaiman and Zhang, Yan and Wang, Qicong and Yunqi, Lei}, doi = {10.1109/ACCESS.2020.2982196}, journal = {IEEE Access} }
@article{ title = {CNN-Based Lidar Point Cloud De-Noising in Adverse Weather}, type = {article}, year = {2020}, keywords = {Semantic scene understanding,computer vision for transportation,visual learning}, pages = {2514-2521}, volume = {5}, id = {4eb6e6f2-883b-3eaa-8d2c-3bfb29487798}, created = {2020-10-01T13:48:35.420Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-10-01T13:48:55.548Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {6533efe4-7189-42a2-b4b6-a9f175595b19}, private_publication = {false}, abstract = {Lidar sensors are frequently used in environment perception for autonomous vehicles and mobile robotics to complement camera, radar, and ultrasonic sensors. Adverse weather conditions are significantly impacting the performance of lidar-based scene understanding by causing undesired measurement points that in turn effect missing detections and false positives. In heavy rain or dense fog, water drops could be misinterpreted as objects in front of the vehicle which brings a mobile robot to a full stop. In this letter, we present the first CNN-based approach to understand and filter out such adverse weather effects in point cloud data. Using a large data set obtained in controlled weather environments, we demonstrate a significant performance improvement of our method over state-of-the-art involving geometric filtering. Data is available at https://github.com/rheinzler/PointCloudDeNoising.}, bibtype = {article}, author = {Heinzler, Robin and Piewak, Florian and Schindler, Philipp and Stork, Wilhelm}, doi = {10.1109/LRA.2020.2972865}, journal = {IEEE Robotics and Automation Letters}, number = {2} }
@article{ title = {Supervised learning of the next-best-view for 3d object reconstruction}, type = {article}, year = {2020}, keywords = {3D reconstruction,3D-CNN,Next-best-view}, pages = {224-231}, volume = {133}, websites = {https://doi.org/10.1016/j.patrec.2020.02.024}, publisher = {Elsevier B.V.}, id = {9ce275d2-18aa-3157-994c-d5dc9e2a7ab2}, created = {2020-10-05T10:26:00.758Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-26T12:19:39.858Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Mendoza2020}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f,bc26f4dd-ccfc-4a52-b602-2ceb657d0906,07e07de9-bcac-4934-a82b-d0aff540e56d,4f36a0a5-b08a-4f70-b020-4daf83cb0507}, private_publication = {false}, abstract = {Motivated by the advances in 3D sensing technology and the spreading of low-cost robotic platforms, 3D object reconstruction has become a common task in many areas. Nevertheless, the selection of the optimal sensor pose that maximizes the reconstructed surface is a problem that remains open. It is known in the literature as the next-best-view planning problem. In this paper, we propose a novel next-best-view planning scheme based on supervised deep learning. The scheme contains an algorithm for automatic generation of datasets and an original three-dimensional convolutional neural network (3D-CNN) used to learn the next-best-view. Unlike previous work where the problem is addressed as a search, the trained 3D-CNN directly predicts the sensor pose. We present an experimental comparison of the proposed architecture against two alternative networks; we also compare it with state-of-the-art next-best-view methods in the reconstruction of several unknown objects. Our method is faster and reaches high coverage.}, bibtype = {article}, author = {Mendoza, Miguel and Vasquez-Gomez, J. Irving and Taud, Hind and Sucar, L. Enrique and Reta, Carolina}, doi = {10.1016/j.patrec.2020.02.024}, journal = {Pattern Recognition Letters} }
@article{ title = {PointContrast: Unsupervised Pre-training for 3D Point Cloud Understanding}, type = {article}, year = {2020}, keywords = {3d scene understanding,point cloud recognition,representation learning,unsupervised learning}, websites = {http://arxiv.org/abs/2007.10985}, id = {d77a7559-98f3-3789-b3ac-6e00a4f76049}, created = {2020-10-15T09:39:12.560Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-28T08:25:31.061Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {Arguably one of the top success stories of deep learning is transfer learning. The finding that pre-training a network on a rich source set (e.g., ImageNet) can help boost performance once fine-tuned on a usually much smaller target set, has been instrumental to many applications in language and vision. Yet, very little is known about its usefulness in 3D point cloud understanding. We see this as an opportunity considering the effort required for annotating data in 3D. In this work, we aim at facilitating research on 3D representation learning. Different from previous works, we focus on high-level scene understanding tasks. To this end, we select a suite of diverse datasets and tasks to measure the effect of unsupervised pre-training on a large source set of 3D scenes. Our findings are extremely encouraging: using a unified triplet of architecture, source dataset, and contrastive loss for pre-training, we achieve improvement over recent best results in segmentation and detection across 6 different benchmarks for indoor and outdoor, real and synthetic datasets -- demonstrating that the learned representation can generalize across domains. Furthermore, the improvement was similar to supervised pre-training, suggesting that future efforts should favor scaling data collection over more detailed annotation. We hope these findings will encourage more research on unsupervised pretext task design for 3D deep learning.}, bibtype = {article}, author = {Xie, Saining and Gu, Jiatao and Guo, Demi and Qi, Charles R. and Guibas, Leonidas J. and Litany, Or} }
@article{ title = {A survey on indoor RGB-D semantic segmentation: from hand-crafted features to deep convolutional neural networks}, type = {article}, year = {2020}, keywords = {RGB-Depth images,Semantic segmentation,Deep learning,Hand-crafted features}, pages = {4499-4524}, journal = {Multimedia Tools and Applications}, id = {d1ad6612-e4fa-3f0d-b57a-ae0ff4641411}, created = {2020-10-20T09:48:06.186Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-20T14:11:56.184Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Fooladgar2020}, folder_uuids = {dc009c1c-5c21-43bd-9c8a-d37db3983b2e,a89f4866-a7e8-4ea9-aa98-e3f470892f7c,5cd4d7ce-c2fb-4e91-ab80-35deeb123df5}, private_publication = {false}, bibtype = {article}, author = {Fooladgar, Fahimeh and Kasaei, Shohreh} }
@article{ title = {Uncertainty-Aware CNNs for Depth Completion: Uncertainty from Beginning to End}, type = {article}, year = {2020}, pages = {12011-12020}, id = {acce76bf-829c-3c11-8681-ca1bdacb39ef}, created = {2020-10-22T05:44:10.553Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-10-22T05:44:15.535Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47}, private_publication = {false}, abstract = {The focus in deep learning research has been mostly to push the limits of prediction accuracy. However, this was often achieved at the cost of increased complexity, raising concerns about the interpretability and the reliability of deep networks. Recently, an increasing attention has been given to untangling the complexity of deep networks and quantifying their uncertainty for different computer vision tasks. Differently, the task of depth completion has not received enough attention despite the inherent noisy nature of depth sensors. In this work, we thus focus on modeling the uncertainty of depth data in depth completion starting from the sparse noisy input all the way to the final prediction. We propose a novel approach to identify disturbed measurements in the input by learning an input confidence estimator in a self-supervised manner based on the normalized convolutional neural networks (NCNNs). Further, we propose a probabilistic version of NCNNs that produces a statistically meaningful uncertainty measure for the final prediction. When we evaluate our approach on the KITTI dataset for depth completion, we outperform all the existing Bayesian Deep Learning approaches in terms of prediction accuracy, quality of the uncertainty measure, and the computational efficiency. Moreover, our small network with 670k parameters performs on-par with conventional approaches with millions of parameters. These results give strong evidence that separating the network into parallel uncertainty and prediction streams leads to state-of-the-art performance with accurate uncertainty estimates.}, bibtype = {article}, author = {Eldesokey, Abdelrahman and Felsberg, Michael and Holmquist, Karl and Persson, Michael}, doi = {10.1109/cvpr42600.2020.01203} }
@article{ title = {Normal Estimation for 3D Point Clouds via Local Plane Constraint and Multi-scale Selection}, type = {article}, year = {2020}, keywords = {LPFC,Multi-scale selection,Normal estimation,Plane-aware features,Point cloud processing,Robust to noise}, volume = {129}, id = {f2300a44-7d4e-33ca-b450-9035b4872354}, created = {2020-11-03T13:16:20.066Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-24T14:09:30.067Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Zhou2020}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {In this paper, we propose a normal estimation method for unstructured 3D point clouds. In this method, a feature constraint mechanism called Local Plane Features Constraint (LPFC) is used and then a multi-scale selection strategy is introduced. The LPFC can be used in a single-scale point network architecture for a more stable normal estimation of the unstructured 3D point clouds. In particular, it can partly overcome the influence of noise on a large sampling scale compared to the other methods which only use regression loss for normal estimation. For more details, a subnetwork is built after point-wise features extracted layers of the network and it gives more constraints to each point of the local patch via a binary classifier in the end. Then we use multi-task optimization to train the normal estimation and local plane classification tasks simultaneously. Via LPFC, the normal estimation network could obtain more distinguish point-wise plane-aware features that can describe the differences of each point on the local patch. Finally, thanks to the distinguish features constraint, we can obtain a more robust and meaningful global feature that can be used to regress the normal of the local patch. Also, to integrate the advantages of multi-scale results, a scale selection strategy is adopted, which is a data-driven approach for selecting the optimal scale around each point and encourages subnetwork specialization. Specifically, we employed a subnetwork called Scale Estimation Network to extract scale weight information from multi-scale features. The multi-scale method can well reduce the cost time while persevere the estimation accuracy. More analysis is given about the relations between noise levels, local boundary, and scales in the experiment. These relationships can be a better guide to choosing particular scales for a particular model. Besides, the experimental result shows that our network can distinguish the points on the fitting plane accurately and this can be used to guide the normal estimation and our multi-scale method can improve the results well. Compared to some state-of-the-art surface normal estimators, our method is robust to noise and can achieve competitive results.}, bibtype = {article}, author = {Zhou, Jun and Huang, Hua and Liu, Bin and Liu, Xiuping}, doi = {10.1016/j.cad.2020.102916}, journal = {CAD Computer Aided Design} }
@article{ title = {PointCleanNet: Learning to Denoise and Remove Outliers from Dense Point Clouds}, type = {article}, year = {2020}, keywords = {Neural networks,Shape analysis,[Computing Methodologies]: Point-based models,methods and applications,modeling,point-based graphics,point-based methods,signal processing}, pages = {185-203}, volume = {39}, id = {8d867188-3ef7-3b02-b746-ef3fa43dbd45}, created = {2020-11-03T13:16:20.067Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-03-19T07:55:50.086Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Rakotosaona2020}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {Point clouds obtained with 3D scanners or by image-based reconstruction techniques are often corrupted with significant amount of noise and outliers. Traditional methods for point cloud denoising largely rely on local surface fitting (e.g. jets or MLS surfaces), local or non-local averaging or on statistical assumptions about the underlying noise model. In contrast, we develop a simple data-driven method for removing outliers and reducing noise in unordered point clouds. We base our approach on a deep learning architecture adapted from PCPNet, which was recently proposed for estimating local 3D shape properties in point clouds. Our method first classifies and discards outlier samples, and then estimates correction vectors that project noisy points onto the original clean surfaces. The approach is efficient and robust to varying amounts of noise and outliers, while being able to handle large densely sampled point clouds. In our extensive evaluation, both on synthetic and real data, we show an increased robustness to strong noise levels compared to various state-of-the-art methods, enabling accurate surface reconstruction from extremely noisy real data obtained by range scans. Finally, the simplicity and universality of our approach makes it very easy to integrate in any existing geometry processing pipeline. Both the code and pre-trained networks can be found on the project page (https://github.com/mrakotosaon/pointcleannet).}, bibtype = {article}, author = {Rakotosaona, Marie Julie and La Barbera, Vittorio and Guerrero, Paul and Mitra, Niloy J. and Ovsjanikov, Maks}, doi = {10.1111/cgf.13753}, journal = {Computer Graphics Forum}, number = {1} }
@article{ title = {Point Cloud Normal Estimation by Fast Guided Least Squares Representation}, type = {article}, year = {2020}, keywords = {Normal estimation,fast algorithm,feature preserving,least squares representation}, pages = {101580-101590}, volume = {8}, id = {231a8f25-4b70-3993-8913-2848f1456fcf}, created = {2020-11-03T13:16:20.072Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-03T13:17:39.505Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {Normal estimation is an essential task for scanned point clouds in various CAD/CAM applications. The method (GLSRNE) based on guided least squares representation (GLSR) balances speed with quality well among state-of-the-art methods. First, it segments each neighborhood into multiple sub-neighborhoods. For some neighborhoods, the segmentation is obtained by GLSR which is an efficient subspace segmentation model and widely applied in other applications. The segmentation of the rest neighborhoods is inferred via the subspace structure propagation (SSP) algorithm. Then, each sub-neighborhood is fitted by a plane. The plane achieving the minimum distance with the current point is selected for the final normal estimation. We make improvements for effectiveness and efficiency in the following three aspects. First, to improve the speed of GLSR, we propose a novel iterative algorithm to reduce the computation complexity from $O(n^3)$ to $O(n^2)$ with its convergence guaranteed theoretically, where $n$ represents the number of the data points. Moreover, this proposed algorithm will also be useful for other applications. Second, we add a normal constraint for SSP to improve accuracy. Third, when selecting one plane to estimate the final normal, we consider the match between the plane and all neighbors, whereas GLSRNE only considers the match between the plane and the current point. The experiments exhibit that our method is faster than GLSRNE and more effective than GLSRNE and other state-of-the-art methods.}, bibtype = {article}, author = {Zhang, Jie and Duan, Jiahui and Tang, Kewei and Cao, Junjie and Liu, Xiuping}, doi = {10.1109/ACCESS.2020.2998468}, journal = {IEEE Access} }
@article{ title = {Confidence Estimation for ToF and Stereo Sensors and Its Application to Depth Data Fusion}, type = {article}, year = {2020}, keywords = {Time-of-flight,confidence information,data fusion,deep learning,stereo vision}, pages = {1411-1421}, volume = {20}, id = {7c7ca9cd-dcc3-3c52-aff0-57a6b4c53bf6}, created = {2020-11-05T09:10:48.240Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-11T06:38:13.097Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47}, private_publication = {false}, abstract = {Time-of-Flight (ToF) sensors and stereo vision systems are two widely used technologies for depth estimation. Due to their rather complementary strengths and limitations, the two sensors are often combined to infer more accurate depth maps. A key research issue in this field is how to estimate the reliability of the sensed depth data. While this problem has been widely studied for stereo systems, it has been seldom considered for ToF sensors. Therefore, starting from the work done for stereo data, in this paper, we firstly introduce novel confidence estimation techniques for ToF data. Moreover, we also show how by using learning-based confidence metrics jointly trained on the two sensors yields better performance. Finally, deploying different fusion frameworks, we show how confidence estimation can be exploited in order to guide the fusion of depth data from the two sensors. Experimental results show how accurate confidence cues allow outperforming state-of-the-art data fusion schemes even with the simplest fusion strategies known in the literature.}, bibtype = {article}, author = {Poggi, Matteo and Agresti, Gianluca and Tosi, Fabio and Zanuttigh, Pietro and Mattoccia, Stefano}, doi = {10.1109/JSEN.2019.2946591}, journal = {IEEE Sensors Journal}, number = {3} }
@article{ title = {Learning to segment 3D point clouds in 2D image space}, type = {article}, year = {2020}, pages = {12252-12261}, id = {11eaa7ce-3478-3eb6-a788-ce0ed8f23361}, created = {2020-11-13T11:34:36.717Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-13T11:36:34.913Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {In contrast to the literature where local patterns in 3D point clouds are captured by customized convolutional operators, in this paper we study the problem of how to effectively and efficiently project such point clouds into a 2D image space so that traditional 2D convolutional neural networks (CNNs) such as U-Net can be applied for segmentation. To this end, we are motivated by graph drawing and reformulate it as an integer programming problem to learn the topology-preserving graph-to-grid mapping for each individual point cloud. To accelerate the computation in practice, we further propose a novel hierarchical approximate algorithm. With the help of the Delaunay triangulation for graph construction from point clouds and a multi-scale U-Net for segmentation, we manage to demonstrate the state-of-the-art performance on ShapeNet and PartNet, respectively, with significant improvement over the literature. Code is available at https://github.com/Zhang-VISLab.}, bibtype = {article}, author = {Lyu, Yecheng and Huang, Xinming and Zhang, Ziming}, doi = {10.1109/CVPR42600.2020.01227}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Normal assisted stereo depth estimation}, type = {article}, year = {2020}, pages = {2186-2196}, id = {c525dc05-ec38-3875-913d-c1480d7676e5}, created = {2020-11-13T11:34:36.721Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-13T11:36:04.070Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {Accurate stereo depth estimation plays a critical role in various 3D tasks in both indoor and outdoor environments. Recently, learning-based multi-view stereo methods have demonstrated competitive performance with limited number of views. However, in challenging scenarios, especially when building cross-view correspondences is hard, these methods still cannot produce satisfying results. In this paper, we study how to leverage a normal estimation model and the predicted normal maps to improve the depth quality. We couple the learning of a multi-view normal estimation module and a multi-view depth estimation module. In addition, we propose a novel consistency loss to train an independent consistency module that refines the depths from depth/normal pairs. We find that the joint learning can improve both the prediction of normal and depth, and the accuracy & smoothness can be further improved by enforcing the consistency. Experiments on MVS, SUN3D, RGBD and Scenes11 demonstrate the effectiveness of our method and state-of-the-art performance.}, bibtype = {article}, author = {Kusupati, Uday and Cheng, Shuo and Chen, Rui and Su, Hao}, doi = {10.1109/CVPR42600.2020.00226}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Neighbourhood-Insensitive Point Cloud Normal Estimation Network}, type = {article}, year = {2020}, websites = {http://arxiv.org/abs/2008.09965}, id = {242d02a0-b8f0-37df-a222-caad9bb43942}, created = {2020-11-13T11:34:37.182Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-13T11:34:58.643Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {We introduce a novel self-attention-based normal estimation network that is able to focus softly on relevant points and adjust the softness by learning a temperature parameter, making it able to work naturally and effectively within a large neighbourhood range. As a result, our model outperforms all existing normal estimation algorithms by a large margin, achieving 94.1% accuracy in comparison with the previous state of the art of 91.2%, with a 25x smaller model and 12x faster inference time. We also use point-to-plane Iterative Closest Point (ICP) as an application case to show that our normal estimations lead to faster convergence than normal estimations from other methods, without manually fine-tuning neighbourhood range parameters. Code available at https://code.active.vision.}, bibtype = {article}, author = {Wang, Zirui and Prisacariu, Victor Adrian} }
@article{ title = {Deep learning based multi-modal fusion architectures for maritime vessel detection}, type = {article}, year = {2020}, keywords = {Autonomous vehicles,Convolutional neural networks,Deep learning,Marine environment,Multi-sensor fusion,Object detection}, volume = {12}, id = {124db671-ca53-3b85-9ed8-6450ffb6eac2}, created = {2020-11-16T11:56:20.677Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-24T11:29:17.832Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {f3937f49-b3bb-4fad-a335-4fb2125beca8,71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {Object detection is a fundamental computer vision task for many real-world applications. In the maritime environment, this task is challenging due to varying light, view distances, weather conditions, and sea waves. In addition, light reflection, camera motion and illumination changes may cause to false detections. To address this challenge, we present three fusion architectures to fuse two imaging modalities: visible and infrared. These architectures can provide complementary information from two modalities in different levels: pixel-level, feature-level, and decision-level. They employed deep learning for performing fusion and detection. We investigate the performance of the proposed architectures conducting a real marine image dataset, which is captured by color and infrared cameras on-board a vessel in the Finnish archipelago. The cameras are employed for developing autonomous ships, and collect data in a range of operation and climatic conditions. Experiments show that feature-level fusion architecture outperforms the state-of-the-art other fusion level architectures.}, bibtype = {article}, author = {Farahnakian, Fahimeh and Heikkonen, Jukka}, doi = {10.3390/RS12162509}, journal = {Remote Sensing}, number = {16} }
@article{ title = {CNN based Color and Thermal Image Fusion for Object Detection in Automated Driving}, type = {article}, year = {2020}, id = {03d96ec2-97ae-3b73-808f-d42784f579f5}, created = {2020-11-16T11:56:20.702Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-24T11:29:17.247Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {f3937f49-b3bb-4fad-a335-4fb2125beca8,71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, bibtype = {article}, author = {Yadav, Ravi and Samir, Ahmed and Rashed, Hazem and Yogamani, Senthil and Dahyot, Rozenn}, number = {July} }
@article{ title = {PrimiTect: Fast Continuous Hough Voting for Primitive Detection}, type = {article}, year = {2020}, pages = {8404-8410}, id = {1a5c069d-f0f3-3c09-9103-f8fe7865df16}, created = {2021-01-25T14:53:33.924Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T17:05:55.079Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f,beecb55d-84d0-48a2-a344-e50cfe559467}, private_publication = {false}, abstract = {This paper tackles the problem of data abstraction in the context of 3D point sets. Our method classifies points into different geometric primitives, such as planes and cones, leading to a compact representation of the data. Being based on a semi-global Hough voting scheme, the method does not need initialization and is robust, accurate, and efficient. We use a local, low-dimensional parameterization of primitives to determine type, shape and pose of the object that a point belongs to. This makes our algorithm suitable to run on devices with low computational power, as often required in robotics applications. The evaluation shows that our method outperforms state-of-the-art methods both in terms of accuracy and robustness.}, bibtype = {article}, author = {Sommer, Christiane and Sun, Yumin and Bylow, Erik and Cremers, Daniel}, doi = {10.1109/ICRA40945.2020.9196988}, journal = {Proceedings - IEEE International Conference on Robotics and Automation} }
@article{ title = {View planning in robot active vision: A survey of systems, algorithms, and applications}, type = {article}, year = {2020}, keywords = {active vision,next-best view,robotic,sensor planning,view planning}, pages = {225-245}, volume = {6}, id = {a9005365-b168-3b3c-8a9a-63b1f0f699bc}, created = {2021-01-26T12:37:11.551Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-08-08T11:39:19.280Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Zeng2020a}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f,5439d198-93d5-4603-a7ce-201d423f231e,bc26f4dd-ccfc-4a52-b602-2ceb657d0906,13d43b82-d9b4-40a8-9031-8e926a718ef0,4f36a0a5-b08a-4f70-b020-4daf83cb0507}, private_publication = {false}, abstract = {Rapid development of artificial intelligence motivates researchers to expand the capabilities of intelligent and autonomous robots. In many robotic applications, robots are required to make planning decisions based on perceptual information to achieve diverse goals in an efficient and effective way. The planning problem has been investigated in active robot vision, in which a robot analyzes its environment and its own state in order to move sensors to obtain more useful information under certain constraints. View planning, which aims to find the best view sequence for a sensor, is one of the most challenging issues in active robot vision. The quality and efficiency of view planning are critical for many robot systems and are influenced by the nature of their tasks, hardware conditions, scanning states, and planning strategies. In this paper, we first summarize some basic concepts of active robot vision, and then review representative work on systems, algorithms and applications from four perspectives: object reconstruction, scene reconstruction, object recognition, and pose estimation. Finally, some potential directions are outlined for future work.}, bibtype = {article}, author = {Zeng, Rui and Wen, Yuhui and Zhao, Wang and Liu, Yong Jin}, doi = {10.1007/s41095-020-0179-3}, journal = {Computational Visual Media}, number = {3} }
@article{ title = {Linking Points with Labels in 3D: A Review of Point Cloud Semantic Segmentation}, type = {article}, year = {2020}, pages = {38-59}, volume = {8}, id = {b585ee86-54de-36e3-a316-ce6b923457e2}, created = {2021-01-27T10:14:14.218Z}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T08:00:50.846Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {dc009c1c-5c21-43bd-9c8a-d37db3983b2e,a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {Ripe with possibilities offered by deep-learning techniques and useful in applications related to remote sensing, computer vision, and robotics, 3D point cloud semantic segmentation (PCSS) and point cloud segmentation (PCS) are attracting increasing interest. This article summarizes available data sets and relevant studies on recent developments in PCSS and PCS.}, bibtype = {article}, author = {Xie, Yuxing and Tian, Jiaojiao and Zhu, Xiao Xiang}, doi = {10.1109/MGRS.2019.2937630}, journal = {IEEE Geoscience and Remote Sensing Magazine}, number = {4} }
@article{ title = {3D Object Recognition and Pose Estimation from Point Cloud Using Stably Observed Point Pair Feature}, type = {article}, year = {2020}, keywords = {3D object recognition,3D pose estimation,Point cloud,Point pair feature}, volume = {8}, id = {6435ba87-6d9a-301d-8223-844a9d586866}, created = {2021-01-28T07:55:30.109Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-29T05:05:54.879Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f,471f331d-8560-4b9e-b910-e5f849b6fcfd}, private_publication = {false}, abstract = {Recognition and pose estimation from 3D free-form objects is a key step for autonomous robotic manipulation. Recently, the point pair features (PPF) voting approach has been shown to be effective for simultaneous object recognition and pose estimation. However, the global model descriptor (e.g., PPF and its variants) that contained some unnecessary point pair features decreases the recognition performance and increases computational efficiency. To address this issue, in this paper, we introduce a novel strategy for building a global model descriptor using stably observed point pairs. The stably observed point pairs are calculated from the partial view point clouds which are rendered by the virtual camera from various viewpoints. The global model descriptor is extracted from the stably observed point pairs and then stored in a hash table. Experiments on several datasets show that our proposed method reduces redundant point pair features and achieves better compromise of speed vs accuracy.}, bibtype = {article}, author = {Li, Deping and Wang, Hanyun and Liu, Ning and Wang, Xiaoming and Xu, Jin}, doi = {10.1109/ACCESS.2020.2978255}, journal = {IEEE Access} }
@article{ title = {Next-best-view Regression using a 3D Convolutional Neural Network}, type = {article}, year = {2020}, id = {5e6593fa-f28d-3733-817e-09077c38c395}, created = {2021-01-29T13:17:54.316Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-26T12:19:40.142Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {David2020}, folder_uuids = {bc26f4dd-ccfc-4a52-b602-2ceb657d0906,4f36a0a5-b08a-4f70-b020-4daf83cb0507,471f331d-8560-4b9e-b910-e5f849b6fcfd}, private_publication = {false}, bibtype = {article}, author = {David, J Irving Vasquez-gomez and Israel, Troncoso and Enrique, Becerra and Jan, C V}, journal = {Arxiv} }
@article{ title = {Multi-Sensor Next-Best-View Planning as Matroid-Constrained Submodular Maximization}, type = {article}, year = {2020}, pages = {5323-5330}, volume = {5}, id = {ec39c9cc-ecac-3f1b-a20b-311ce2b7ea68}, created = {2021-02-09T08:36:10.688Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-10T06:55:52.228Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {bc26f4dd-ccfc-4a52-b602-2ceb657d0906}, private_publication = {false}, bibtype = {article}, author = {Lauri, Mikko and Pajarinen, Joni and Peters, Jan and Frintrop, Simone}, number = {4} }
@article{ title = {PC-NBV: A Point Cloud Based Deep Network for Efficient Next Best View Planning}, type = {article}, year = {2020}, pages = {7050-7057}, id = {8528bcb5-36e9-3a06-9bfa-51073f5ac245}, created = {2021-02-09T08:36:10.707Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-26T12:19:39.876Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Zeng2020}, folder_uuids = {bc26f4dd-ccfc-4a52-b602-2ceb657d0906,4f36a0a5-b08a-4f70-b020-4daf83cb0507}, private_publication = {false}, bibtype = {article}, author = {Zeng, Rui and Zhao, Wang and Liu, Yong-jin}, journal = {IEEE/RSJ International Conference on Intelligent Robots and Systems} }
@article{ title = {ActiveMoCap: Optimized viewpoint selection for active human motion capture}, type = {article}, year = {2020}, pages = {100-109}, id = {369b152f-5b77-3fbe-8ad2-032c240c5edc}, created = {2021-02-09T08:36:10.712Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T08:36:23.322Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {bc26f4dd-ccfc-4a52-b602-2ceb657d0906}, private_publication = {false}, abstract = {The accuracy of monocular 3D human pose estimation depends on the viewpoint from which the image is captured. While freely moving cameras, such as on drones, provide control over this viewpoint, automatically positioning them at the location which will yield the highest accuracy remains an open problem. This is the problem that we address in this paper. Specifically, given a short video sequence, we introduce an algorithm that predicts which viewpoints should be chosen to capture future frames so as to maximize 3D human pose estimation accuracy. The key idea underlying our approach is a method to estimate the uncertainty of the 3D body pose estimates. We integrate several sources of uncertainty, originating from deep learning based regressors and temporal smoothness. Our motion planner yields improved 3D body pose estimates and outperforms or matches existing ones that are based on person following and orbiting.}, bibtype = {article}, author = {Kiciroglu, Sena and Rhodin, Helge and Sinha, Sudipta N. and Salzmann, Mathieu and Fua, Pascal}, doi = {10.1109/CVPR42600.2020.00018}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Plausible Reconstruction of an Approximated Mesh Model for Next-Best View Planning of SfM-MVS}, type = {article}, year = {2020}, keywords = {Multi-View Stereo,Next-best view,Quality prediction,Structure from Motion,Surface reconstruction,View planning}, pages = {465-471}, volume = {43}, id = {29a9e041-f09b-383b-acd1-97da600f0dcb}, created = {2021-02-09T08:36:10.727Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T08:36:31.957Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {bc26f4dd-ccfc-4a52-b602-2ceb657d0906}, private_publication = {false}, abstract = {Structure-from-Motion (SfM) and Multi-View Stereo (MVS) are widely used methods in three dimensional (3D) model reconstruction for an infrastructure maintenance purpose. However, if a set of images is not captured from well-placed positions, the final dense model can contain low-quality regions. Since MVS requires a much longer processing time than SfM as larger amounts of images are provided, it is impossible for surveyors to wait for the SfM-MVS process to complete and evaluate the geometric quality of a final dense model on-site. This challenge results in response inefficiency and the deterioration of dense models in 3D model reconstruction. If the quality of the final dense model can be predicted immediately after SfM, it will be possible to revalidate the images much earlier and to obtain the dense model with better quality than the existing SfM-MVS process. Therefore, we propose a method for reconstructing a more plausible 3D mesh model that accurately approximates the geometry of the final dense model only from sparse point clouds generated from SfM. This approximated mesh model can be generated using Delaunay triangulation for the sparse point clouds and triangle as well as tetrahedron filtering. The approximated model can be used to predict the geometric quality of the final dense model and for an optimization-based view planning. Some experimental results showed that our method is effective in predicting the quality of the final dense model and finding the potentially degraded regions. Moreover, it was confirmed that the average reconstruction errors of the dense model generated by the optimization-based view planning went below tens of millimeters and falls within an acceptable range for an infrastructure maintenance purpose.}, bibtype = {article}, author = {Moritani, R. and Kanai, S. and Date, H. and Niina, Y. and Honma, R.}, doi = {10.5194/isprs-archives-XLIII-B2-2020-465-2020}, journal = {International Archives of the Photogrammetry, Remote Sensing and Spatial Information Sciences - ISPRS Archives}, number = {B2} }
@article{ title = {Recognition and grasping of disorderly stacked wood planks using a local image patch and point pair feature method}, type = {article}, year = {2020}, keywords = {Convolutional auto-encoders,Local image patch,Plank recognition,Point pair feature,Robotic grasping}, pages = {1-18}, volume = {20}, id = {e7fc3d22-f8ce-3cfe-b786-19452a00b328}, created = {2021-02-09T17:05:46.735Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T17:06:37.625Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {beecb55d-84d0-48a2-a344-e50cfe559467}, private_publication = {false}, abstract = {Considering the difficult problem of robot recognition and grasping in the scenario of disorderly stacked wooden planks, a recognition and positioning method based on local image features and point pair geometric features is proposed here and we define a local patch point pair feature. First, we used self-developed scanning equipment to collect images of wood boards and a robot to drive a RGB-D camera to collect images of disorderly stacked wooden planks. The image patches cut from these images were input to a convolutional autoencoder to train and obtain a local texture feature descriptor that is robust to changes in perspective. Then, the small image patches around the point pairs of the plank model are extracted, and input into the trained encoder to obtain the feature vector of the image patch, combining the point pair geometric feature information to form a feature description code expressing the characteristics of the plank. After that, the robot drives the RGB-D camera to collect the local image patches of the point pairs in the area to be grasped in the scene of the stacked wooden planks, also obtaining the feature description code of the wooden planks to be grasped. Finally, through the process of point pair feature matching, pose voting and clustering, the pose of the plank to be grasped is determined. The robot grasping experiment here shows that both the recognition rate and grasping success rate of planks are high, reaching 95.3% and 93.8%, respectively. Compared with the traditional point pair feature method (PPF) and other methods, the method present here has obvious advantages and can be applied to stacked wood plank grasping environments.}, bibtype = {article}, author = {Xu, Chengyi and Liu, Ying and Ding, Fenglong and Zhuang, Zilong}, doi = {10.3390/s20216235}, journal = {Sensors (Switzerland)}, number = {21} }
@article{ title = {Geometric modelling for 3d point clouds of elbow joints in piping systems}, type = {article}, year = {2020}, keywords = {Elbow joints,Geometric model,Laser scanning,Point cloud registration}, pages = {1-18}, volume = {20}, id = {c5234e5a-ffa0-30a7-ab04-4dc6402523e5}, created = {2021-02-09T17:05:47.283Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T17:06:25.664Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {beecb55d-84d0-48a2-a344-e50cfe559467}, private_publication = {false}, abstract = {Pipe elbow joints exist in almost every piping system supporting many important applications such as clean water supply. However, spatial information of the elbow joints is rarely extracted and analyzed from observations such as point cloud data obtained from laser scanning due to lack of a complete geometric model that can be applied to different types of joints. In this paper, we proposed a novel geometric model and several model adaptions for typical elbow joints including the 90◦ and 45◦ types, which facilitates the use of 3D point clouds of the elbow joints collected from laser scanning. The model comprises translational, rotational, and dimensional parameters, which can be used not only for monitoring the joints’ geometry but also other applications such as point cloud registrations. Both simulated and real datasets were used to verify the model, and two applications derived from the proposed model (point cloud registration and mounting bracket detection) were shown. The results of the geometric fitting of the simulated datasets suggest that the model can accurately recover the geometry of the joint with very low translational (0.3 mm) and rotational (0.064◦) errors when ±0.02 m random errors were introduced to coordinates of a simulated 90◦ joint (with diameter equal to 0.2 m). The fitting of the real datasets suggests that the accuracy of the diameter estimate reaches 97.2%. The joint-based registration accuracy reaches sub-decimeter and sub-degree levels for the translational and rotational parameters, respectively.}, bibtype = {article}, author = {Chan, Ting On and Xia, Linyuan and Lichti, Derek D. and Sun, Yeran and Wang, Jun and Jiang, Tao and Li, Qianxia}, doi = {10.3390/s20164594}, journal = {Sensors (Switzerland)}, number = {16} }
@article{ title = {PC-NBV: A Point Cloud Based Deep Network for Efficient Next Best View Planning}, type = {article}, year = {2020}, pages = {7050-7057}, id = {36ef1b3d-b7e2-37ed-ac16-37b40d9e9be3}, created = {2021-02-15T14:14:03.681Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-15T11:20:02.365Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Zeng2020}, folder_uuids = {2f2b519d-56f0-4e04-b335-d8e25f087073}, private_publication = {false}, bibtype = {article}, author = {Zeng, Rui and Zhao, Wang and Liu, Yong-Jin}, journal = {IEEE/RSJ International Conference on Intelligent Robots and Systems} }
@article{ title = {Geometry and learning co-supported normal estimation for unstructured point cloud}, type = {article}, year = {2020}, pages = {13235-13244}, id = {9c80914f-5f81-36a9-b4a2-b3d1f2493695}, created = {2021-02-24T11:29:14.411Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-24T11:29:24.441Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {In this paper, we propose a normal estimation method for unstructured point cloud. We observe that geometric estimators commonly focus more on feature preservation but are hard to tune parameters and sensitive to noise, while learning-based approaches pursue an overall normal estimation accuracy but cannot well handle challenging regions such as surface edges. This paper presents a novel normal estimation method, under the co-support of geometric estimator and deep learning. To lowering the learning difficulty, we first propose to compute a suboptimal initial normal at each point by searching for a best fitting patch. Based on the computed normal field, we design a normal-based height map network (NH-Net) to fine-tune the suboptimal normals. Qualitative and quantitative evaluations demonstrate the clear improvements of our results over both traditional methods and learning-based methods, in terms of estimation accuracy and feature recovery.}, bibtype = {article}, author = {Zhou, Haoran and Chen, Honghua and Feng, Yidan and Wang, Qiong and Qin, Jing and Xie, Haoran and Wang, Fu Lee and Wei, Mingqiang and Wang, Jun}, doi = {10.1109/CVPR42600.2020.01325}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Deep feature-preserving normal estimation for point cloud filtering}, type = {article}, year = {2020}, keywords = {Deep learning,Feature preserving,Normal estimation,Point cloud filtering}, id = {167bcfc3-ffa3-363e-a325-769a144232ee}, created = {2021-02-24T11:29:14.416Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-24T11:29:22.775Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {Point cloud filtering, the main bottleneck of which is removing noise (outliers) while preserving geometric features, is a fundamental problem in 3D field. The two-step schemes involving normal estimation and position update have been shown to produce promising results. Nevertheless, the current normal estimation methods including optimization ones and deep learning ones, often either have limited automation or cannot preserve sharp features. In this paper, we propose a novel feature-preserving normal estimation method for point cloud filtering with preserving geometric features. It is a learning method and thus achieves automatic prediction for normals. For training phase, we first generate patch based samples which are then fed to a classification network to classify feature and nonfeature points. We finally train the samples of feature and non-feature points separately, to achieve decent results. Regarding testing, given a noisy point cloud, its normals can be automatically estimated. For further point cloud filtering, we iterate the above normal estimation and a current position update algorithm for a few times. Various experiments demonstrate that our method outperforms state-of-the-art normal estimation methods and point cloud filtering techniques, in terms of both quality and quantity.}, bibtype = {article}, author = {Lu, Dening and Lu, Xuequan and Sun, Yangxing and Wang, Jun}, journal = {arXiv} }
@article{ title = {DeepURL: Deep pose estimation framework for underwater relative localization}, type = {article}, year = {2020}, id = {b7d3b85e-3da4-3521-a4a2-94da3fe5684b}, created = {2021-02-24T11:29:14.436Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-24T14:09:38.453Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {In this paper, we propose a real-time deep-learning approach for determining the 6D relative pose of Autonomous Underwater Vehicles (AUV) from a single image. A team of autonomous robots localizing themselves, in a communication-constrained underwater environment, is essential for many applications such as underwater exploration, mapping, multi-robot convoying, and other multi-robot tasks. Due to the profound difficulty of collecting ground truth images with accurate 6D poses underwater, this work utilizes rendered images from the Unreal Game Engine simulation for training. An image translation network is employed to bridge the gap between the rendered and the real images producing synthetic images for training. The proposed method predicts the 6D pose of an AUV from a single image as 2D image keypoints representing 8 corners of the 3D model of the AUV, and then the 6D pose in the camera coordinates is determined using RANSAC-based PnP. Experimental results in underwater environments (swimming pool and ocean) with different cameras demonstrate the robustness of the proposed technique, where the trained system decreased translation error by 75.5% and orientation error by 64.6% over the state-of-the-art methods.}, bibtype = {article}, author = {Joshi, Bharat and Modasshir, Md and Manderson, Travis and Damron, Hunter and Xanthidis, Marios and Li, Alberto Quattrini and Rekleitis, Ioannis and Dudek, Gregory}, journal = {arXiv} }
@article{ title = {PointRCNN: 3D Object Proposal Generation and Detection From Point Cloud}, type = {article}, year = {2019}, pages = {770-779}, id = {406be931-ad43-3a97-ae74-b5583f293ff1}, created = {2021-03-04T15:41:23.800Z}, file_attached = {false}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-17T08:32:39.139Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Shi2020}, folder_uuids = {bc1835e2-32e3-4f2a-b03c-9540bbbd02e0}, private_publication = {false}, abstract = {In this paper, we propose PointRCNN for 3D object detection from raw point cloud. The whole framework is composed of two stages: stage-1 for the bottom-up 3D proposal generation and stage-2 for refining proposals in the canonical coordinates to obtain the final detection results. Instead of generating proposals from RGB image or projecting point cloud to bird's view or voxels as previous methods do, our stage-1 sub-network directly generates a small number of high-quality 3D proposals from point cloud in a bottom-up manner via segmenting the point cloud of the whole scene into foreground points and background. The stage-2 sub-network transforms the pooled points of each proposal to canonical coordinates to learn better local spatial features, which is combined with global semantic features of each point learned in stage-1 for accurate box refinement and confidence prediction. Extensive experiments on the 3D detection benchmark of KITTI dataset show that our proposed architecture outperforms state-of-the-art methods with remarkable margins by using only point cloud as input. The code is available at https://github.com/sshaoshuai/PointRCNN.}, bibtype = {article}, author = {Shi, Shaoshuai and Wang, Xiaogang and Li, Hongsheng}, doi = {10.1109/cvpr.2019.00086}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {A two-phase cross-modality fusion network for robust 3D object detection}, type = {article}, year = {2020}, keywords = {3D object detection,Cross-modality fusion,Deep convolutional neural networks}, pages = {1-14}, volume = {20}, id = {b252eb7a-2c0b-36c1-82ac-cf06a7e16369}, created = {2021-03-04T15:41:24.007Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-17T08:32:39.173Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Jiao2020}, folder_uuids = {bc1835e2-32e3-4f2a-b03c-9540bbbd02e0}, private_publication = {false}, abstract = {A two-phase cross-modality fusion detector is proposed in this study for robust and high-precision 3D object detection with RGB images and LiDAR point clouds. First, a two-stream fusion network is built into the framework of Faster RCNN to perform accurate and robust 2D detection. The visible stream takes the RGB images as inputs, while the intensity stream is fed with the intensity maps which are generated by projecting the reflection intensity of point clouds to the front view. A multi-layer feature-level fusion scheme is designed to merge multi-modal features across multiple layers in order to enhance the expressiveness and robustness of the produced features upon which region proposals are generated. Second, a decision-level fusion is implemented by projecting 2D proposals to the space of the point cloud to generate 3D frustums, on the basis of which the second-phase 3D detector is built to accomplish instance segmentation and 3D-box regression on the filtered point cloud. The results on the KITTI benchmark show that features extracted from RGB images and intensity maps complement each other, and our proposed detector achieves state-of-the-art performance on 3D object detection with a substantially lower running time as compared to available competitors.}, bibtype = {article}, author = {Jiao, Yujun and Yin, Zhishuai}, doi = {10.3390/s20216043}, journal = {Sensors (Switzerland)}, number = {21} }
@article{ title = {CNN-Based Lidar Point Cloud De-Noising in Adverse Weather}, type = {article}, year = {2020}, keywords = {Semantic scene understanding,computer vision for transportation,visual learning}, pages = {2514-2521}, volume = {5}, id = {81fc6d1c-5884-36a9-8267-abfac5135813}, created = {2021-03-08T09:43:03.984Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-03-19T07:57:50.848Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Heinzler2020}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {Lidar sensors are frequently used in environment perception for autonomous vehicles and mobile robotics to complement camera, radar, and ultrasonic sensors. Adverse weather conditions are significantly impacting the performance of lidar-based scene understanding by causing undesired measurement points that in turn effect missing detections and false positives. In heavy rain or dense fog, water drops could be misinterpreted as objects in front of the vehicle which brings a mobile robot to a full stop. In this letter, we present the first CNN-based approach to understand and filter out such adverse weather effects in point cloud data. Using a large data set obtained in controlled weather environments, we demonstrate a significant performance improvement of our method over state-of-the-art involving geometric filtering. Data is available at https://github.com/rheinzler/PointCloudDeNoising.}, bibtype = {article}, author = {Heinzler, Robin and Piewak, Florian and Schindler, Philipp and Stork, Wilhelm}, doi = {10.1109/LRA.2020.2972865}, journal = {IEEE Robotics and Automation Letters}, number = {2} }
@article{ title = {Fast and Accurate Desnowing Algorithm for LiDAR Point Clouds}, type = {article}, year = {2020}, keywords = {LiDAR point cloud filtering,Snow noise removal,autonomous vehicle,desnowing}, pages = {160202-160212}, volume = {8}, id = {1d300016-d84f-34de-8656-a3b632796484}, created = {2021-03-08T09:43:04.293Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-03-19T07:57:50.869Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Park2020}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {LiDAR sensors have the advantage of being able to generate high-resolution imaging quickly during both day and night; however, their performance is severely limited in adverse weather conditions such as snow, rain, and dense fog. Consequently, many researchers are actively working to overcome these limitations by applying sensor fusion with radar and optical cameras to LiDAR. While studies on the denoising of point clouds acquired by LiDAR in adverse weather have been conducted recently, the results are still insufficient for application to autonomous vehicles because of speed and accuracy performance limitations. Therefore, we propose a new intensity-based filter that differs from the existing distance-based filter, which limits the speed. The proposed method showed overwhelming performance advantages in terms of both speed and accuracy by removing only snow particles while leaving important environmental features. The intensity criteria for snow removal were derived based on an analysis of the properties of laser light and snow particles.}, bibtype = {article}, author = {Park, Ji Il and Park, Jihyuk and Kim, Kyung Soo}, doi = {10.1109/ACCESS.2020.3020266}, journal = {IEEE Access} }
@article{ title = {Learning Graph-Convolutional Representations for Point Cloud Denoising}, type = {article}, year = {2020}, keywords = {Denoising,Graph neural network,Point cloud}, pages = {103-118}, volume = {12365 LNCS}, id = {8de11456-63d7-366d-a1b1-8c955e286066}, created = {2021-03-21T16:37:52.116Z}, file_attached = {false}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-03-21T16:37:52.531Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {Point clouds are an increasingly relevant data type but they are often corrupted by noise. We propose a deep neural network based on graph-convolutional layers that can elegantly deal with the permutation-invariance problem encountered by learning-based point cloud processing methods. The network is fully-convolutional and can build complex hierarchies of features by dynamically constructing neighborhood graphs from similarity among the high-dimensional feature representations of the points. When coupled with a loss promoting proximity to the ideal surface, the proposed approach significantly outperforms state-of-the-art methods on a variety of metrics. In particular, it is able to improve in terms of Chamfer measure and of quality of the surface normals that can be estimated from the denoised data. We also show that it is especially robust both at high noise levels and in presence of structured noise such as the one encountered in real LiDAR scans.}, bibtype = {article}, author = {Pistilli, Francesca and Fracastoro, Giulia and Valsesia, Diego and Magli, Enrico}, doi = {10.1007/978-3-030-58565-5_7}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {Differentiable Manifold Reconstruction for Point Cloud Denoising}, type = {article}, year = {2020}, keywords = {Denoising,Differentiable pooling,Manifold,Point clouds}, id = {d7b634c9-4b02-378c-86f3-40a01bc6d797}, created = {2021-03-21T16:37:52.195Z}, file_attached = {false}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-03-21T16:37:52.942Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {3D point clouds are often perturbed by noise due to the inherent limitation of acquisition equipments, which obstructs downstream tasks such as surface reconstruction, rendering and so on. Previous works mostly infer the displacement of noisy points from the underlying surface, which however are not designated to recover the surface explicitly and may lead to sub-optimal denoising results. To this end, we propose to learn the underlying manifold of a noisy point cloud from differentiably subsampled points with trivial noise perturbation and their embedded neighborhood feature, aiming to capture intrinsic structures in point clouds. Specifically, we present an autoencoder-like neural network. The encoder learns both local and non-local feature representations of each point, and then samples points with low noise via an adaptive differentiable pooling operation. Afterwards, the decoder infers the underlying manifold by transforming each sampled point along with the embedded feature of its neighborhood to a local surface centered around the point. By resampling on the reconstructed manifold, we obtain a denoised point cloud. Further, we design an unsupervised training loss, so that our network can be trained in either an unsupervised or supervised fashion. Experiments show that our method significantly outperforms state-of-the-art denoising methods under both synthetic noise and real world noise. The code and data are available at https://github.com/luost26/DMRDenoise.}, bibtype = {article}, author = {Luo, Shitong and Hu, Wei}, doi = {10.1145/3394171.3413727}, journal = {arXiv} }
@article{ title = {Geometric adversarial attacks and defenses on 3D point clouds}, type = {article}, year = {2020}, id = {d8855eb5-7220-36a5-8424-5247bbb6aaf7}, created = {2021-04-15T14:18:52.482Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-15T14:19:02.556Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {Deep neural networks are prone to adversarial examples that maliciously alter the network’s outcome. Due to the increasing popularity of 3D sensors in safety-critical systems and the vast deployment of deep learning models for 3D point sets, there is a growing interest in adversarial attacks and defenses for such models. So far, the research has focused on the semantic level, namely, deep point cloud classifiers. However, point clouds are also widely used in a geometric-related form that includes encoding and reconstructing the geometry. In this work, we explore adversarial examples at a geometric level. That is, a small change to a clean source point cloud leads, after passing through an autoencoder model, to a shape from a different target class. On the defense side, we show that remnants of the attack’s target shape are still present at the reconstructed output after applying the defense to the adversarial input. Our code is publicly available.}, bibtype = {article}, author = {Lang, Itai and Kotlicki, Uriel and Avidan, Shai}, journal = {arXiv} }
@article{ title = {MaskNet: A Fully-Convolutional Network to Estimate Inlier Points}, type = {article}, year = {2020}, keywords = {3D Perception,Deep Learning,Inlier Estimation,Occlusions,Outlier Estimation,Partial Point Cloud Registration,Partial Point Clouds,Point Cloud Denoising,Point Cloud Registration,PointNet}, pages = {1029-1038}, id = {c3071854-e950-31e7-a777-4aee77946517}, created = {2021-04-15T14:18:52.518Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-26T07:46:58.667Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {Point clouds have grown in importance in the way computers perceive the world. From LIDAR sensors in autonomous cars and drones to the time of flight and stereo vision systems in our phones, point clouds are everywhere. Despite their ubiquity, point clouds in the real world are often missing points because of sensor limitations or occlusions, or contain extraneous points from sensor noise or artifacts. These problems challenge algorithms that require computing correspondences between a pair of point clouds. Therefore, this paper presents a fully-convolutional neural network that identifies which points in one point cloud are most similar (inliers) to the points in another. We show improvements in learning-based and classical point cloud registration approaches when retrofitted with our network. We demonstrate these improvements on synthetic and real-world datasets. Finally, our network produces impressive results on test datasets that were unseen during training, thus exhibiting generalizability. Code and videos are available at https://github.com/vinits5/masknet.}, bibtype = {article}, author = {Sarode, Vinit and Dhagat, Animesh and Srivatsan, Rangaprasad Arun and Zevallos, Nicolas and Lucey, Simon and Choset, Howie}, doi = {10.1109/3DV50981.2020.00113}, journal = {Proceedings - 2020 International Conference on 3D Vision, 3DV 2020} }
@article{ title = {Next Best View Planning via Reinforcement Learning for Scanning of Arbitrary 3D Shapes}, type = {article}, year = {2020}, keywords = {3D model,CAD model,depth map,mesh,next best view,reinforcement learning}, pages = {1484-1490}, volume = {65}, id = {e043226a-2546-349f-96b0-7f24c29b2b6f}, created = {2021-05-17T06:52:28.254Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-03T10:14:32.156Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {bc26f4dd-ccfc-4a52-b602-2ceb657d0906}, private_publication = {false}, abstract = {Abstract: Reconstructing 3D objects from scanned measurements is a fundamental task in computer vision. A central factor for the effectiveness of 3D reconstruction is the selection of sensor views for scanning. The latter remains an open problem in the 3D geometry processing area, known as the next-best-view planning problem, and is commonly approached by combinatorial or greedy methods. In this work, we propose a reinforcement learning-based approach to sequential next-best-view planning. The method is implemented based on the gym environment including 3D reconstruction, next-best-scan planning, and image acquisition features. We demonstrate this method to outperform the baselines in terms of the number of required scans and the obtained 3D mesh reconstruction accuracy.}, bibtype = {article}, author = {Potapova, S. G. and Artemov, A. V. and Sviridov, S. V. and Musatkina, D. A. and Zorin, D. N. and Burnaev, E. V.}, doi = {10.1134/S1064226920120141}, journal = {Journal of Communications Technology and Electronics}, number = {12} }
@article{ title = {Very power efficient neural time-of-flight}, type = {article}, year = {2020}, pages = {2246-2255}, id = {805cf671-b58b-39ff-9d3c-0fa1ed500dfb}, created = {2021-06-21T08:44:26.193Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-06-21T08:44:50.049Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Time-of-Flight (ToF) cameras require active illumination to obtain depth information thus the power of illumination directly affects the performance of ToF cameras. Traditional ToF imaging algorithms are very sensitive to illumination and the depth accuracy degenerates rapidly with the power of it. Therefore, the design of a power efficient ToF camera always creates a painful dilemma for the illumination and the performance trade-off. In this paper, we show that despite the weak signals in many areas under extreme short exposure setting, these signals as a whole can be well utilized through a learning process which directly translates the weak and noisy ToF camera raw to depth map. This creates an opportunity to tackle the aforementioned dilemma and make a very power efficient ToF camera possible. To enable the learning, we collect a comprehensive dataset under a variety of scenes and photographic conditions by a specialized ToF camera. Experiments show that our method is able to robustly process ToF camera raw with the exposure time of one order of magnitude shorter than that used in conventional ToF cameras. In addition to evaluating our approach both quantitatively and qualitatively, we also discuss its implication to designing the next generation power efficient ToF cameras.}, bibtype = {article}, author = {Chen, Yan and Ren, Jimmy and Cheng, Xuanye and Qian, Keyuan and Wang, Luyang and Gu, Jinwei}, doi = {10.1109/WACV45572.2020.9093594}, journal = {Proceedings - 2020 IEEE Winter Conference on Applications of Computer Vision, WACV 2020} }
@article{ title = {Deep Learning for Anomaly Detection}, type = {article}, year = {2020}, keywords = {anomaly detection,deep learning}, pages = {3569-3570}, id = {96e58fcc-8c8a-37d2-8696-f2184f69d866}, created = {2021-06-21T08:44:26.193Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-06-21T08:44:50.970Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Anomaly detection has been widely studied and used in diverse applications. Building an effective anomaly detection system requires researchers and developers to learn complex structure from noisy data, identify dynamic anomaly patterns, and detect anomalies with limited labels. Recent advancements in deep learning techniques have greatly improved anomaly detection performance, in comparison with classical approaches, and have extended anomaly detection to a wide variety of applications. This tutorial will help the audience gain a comprehensive understanding of deep learning based anomaly detection techniques in various application domains. First, we give an overview of the anomaly detection problem, introducing the approaches taken before the deep model era and listing out the challenges they faced. Then we survey the state-of-the-art deep learning models that range from building block neural network structures such as MLP, CNN, and LSTM, to more complex structures such as autoencoder, generative models (VAE, GAN, Flow-based models), to deep one-class detection models, etc. In addition, we illustrate how techniques such as transfer learning and reinforcement learning can help amend the label sparsity issue in anomaly detection problems and how to collect and make the best use of user labels in practice. Second to last, we discuss real world use cases coming from and outside LinkedIn. The tutorial concludes with a discussion of future trends.}, bibtype = {article}, author = {Wang, Ruoying and Nie, Kexin and Chang, Yen Jung and Gong, Xinwei and Wang, Tie and Yang, Yang and Long, Bo}, doi = {10.1145/3394486.3406481}, journal = {Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery and Data Mining}, number = {February} }
@article{ title = {Point Transformer}, type = {article}, year = {2020}, websites = {http://arxiv.org/abs/2012.09164}, id = {a946f993-291c-3eab-a62f-cb1b09101843}, created = {2021-06-21T08:44:26.199Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:18.546Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Zhao2020}, folder_uuids = {70eb910f-9399-46d8-a4d0-ade5435237b7,597192a3-7679-4832-a554-980990d8ac9b,d54ba66b-a8cf-41de-8e2d-c3256f322e07,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Self-attention networks have revolutionized natural language processing and are making impressive strides in image analysis tasks such as image classification and object detection. Inspired by this success, we investigate the application of self-attention networks to 3D point cloud processing. We design self-attention layers for point clouds and use these to construct self-attention networks for tasks such as semantic scene segmentation, object part segmentation, and object classification. Our Point Transformer design improves upon prior work across domains and tasks. For example, on the challenging S3DIS dataset for large-scale semantic scene segmentation, the Point Transformer attains an mIoU of 70.4% on Area 5, outperforming the strongest prior model by 3.3 absolute percentage points and crossing the 70% mIoU threshold for the first time.}, bibtype = {article}, author = {Zhao, Hengshuang and Jiang, Li and Jia, Jiaya and Torr, Philip and Koltun, Vladlen} }
@article{ title = {Occupancy Anticipation for Efficient Exploration and Navigation}, type = {article}, year = {2020}, pages = {400-418}, volume = {12350 LNCS}, id = {5b3f4631-1e58-3ef3-b821-2c1f1960f6ab}, created = {2021-06-21T08:44:26.369Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-06-21T08:44:48.440Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {State-of-the-art navigation methods leverage a spatial memory to generalize to new environments, but their occupancy maps are limited to capturing the geometric structures directly observed by the agent. We propose occupancy anticipation, where the agent uses its egocentric RGB-D observations to infer the occupancy state beyond the visible regions. In doing so, the agent builds its spatial awareness more rapidly, which facilitates efficient exploration and navigation in 3D environments. By exploiting context in both the egocentric views and top-down maps our model successfully anticipates a broader map of the environment, with performance significantly better than strong baselines. Furthermore, when deployed for the sequential decision-making tasks of exploration and navigation, our model outperforms state-of-the-art methods on the Gibson and Matterport3D datasets. Our approach is the winning entry in the 2020 Habitat PointNav Challenge. Project page: http://vision.cs.utexas.edu/projects/occupancy_anticipation/.}, bibtype = {article}, author = {Ramakrishnan, Santhosh K. and Al-Halah, Ziad and Grauman, Kristen}, doi = {10.1007/978-3-030-58558-7_24}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {CNN based Road User Detection using the 3D Radar Cube}, type = {article}, year = {2020}, keywords = {Object detection,deep learning in robotics and automation,segmentation and categorization,sensor fusion}, pages = {1263-1270}, volume = {5}, websites = {http://arxiv.org/abs/2004.12165,http://dx.doi.org/10.1109/LRA.2020.2967272}, month = {4}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, day = {25}, id = {0e9fe074-7859-3a9c-be3d-322e0c10eb9b}, created = {2021-06-22T09:46:12.423Z}, accessed = {2021-06-22}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-06-22T09:46:16.689Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {This letter presents a novel radar based, single-frame, multi-class detection method for moving road users (pedestrian, cyclist, car), which utilizes low-level radar cube data. The method provides class information both on the radar target- and object-level. Radar targets are classified individually after extending the target features with a cropped block of the 3D radar cube around their positions, thereby capturing the motion of moving parts in the local velocity distribution. A Convolutional Neural Network (CNN) is proposed for this classification step. Afterwards, object proposals are generated with a clustering step, which not only considers the radar targets' positions and velocities, but their calculated class scores as well. In experiments on a real-life dataset we demonstrate that our method outperforms the state-of-the-art methods both target- and object-wise by reaching an average of 0.70 (baseline: 0.68) target-wise and 0.56 (baseline: 0.48) object-wise F1 score. Furthermore, we examine the importance of the used features in an ablation study.}, bibtype = {article}, author = {Palffy, Andras and Dong, Jiaao and Kooij, Julian F. P. and Gavrila, Dariu M.}, doi = {10.1109/LRA.2020.2967272}, journal = {IEEE Robotics and Automation Letters}, number = {2} }
@article{ title = {DeepFit: 3D Surface Fitting via Neural Network Weighted Least Squares}, type = {article}, year = {2020}, keywords = {3D point cloud deep learning,Least squares,Normal estimation,Surface fitting,Unstructured 3D point clouds}, pages = {20-34}, volume = {12346 LNCS}, id = {30fb53ba-efc0-379e-af4c-0cc03429f270}, created = {2021-07-05T12:25:37.361Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T08:58:16.271Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Ben-Shabat2020}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {We propose a surface fitting method for unstructured 3D point clouds. This method, called DeepFit, incorporates a neural network to learn point-wise weights for weighted least squares polynomial surface fitting. The learned weights act as a soft selection for the neighborhood of surface points thus avoiding the scale selection required of previous methods. To train the network we propose a novel surface consistency loss that improves point weight estimation. The method enables extracting normal vectors and other geometrical properties, such as principal curvatures, the latter were not presented as ground truth during training. We achieve state-of-the-art results on a benchmark normal and curvature estimation dataset, demonstrate robustness to noise, outliers and density variations, and show its application on noise removal.}, bibtype = {article}, author = {Ben-Shabat, Yizhak and Gould, Stephen}, doi = {10.1007/978-3-030-58452-8_2}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {Deep Iterative Surface Normal Estimation}, type = {article}, year = {2020}, pages = {11244-11253}, id = {67101ee7-953f-3242-ad50-c4891e0d8ddb}, created = {2021-07-05T12:25:37.362Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T05:58:00.349Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Lenssen2020}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {This paper presents an end-to-end differentiable algorithm for robust and detail-preserving surface normal estimation on unstructured point-clouds. We utilize graph neural networks to iteratively parameterize an adaptive anisotropic kernel that produces point weights for weighted least-squares plane fitting in local neighborhoods. The approach retains the interpretability and efficiency of traditional sequential plane fitting while benefiting from adaptation to data set statistics through deep learning. This results in a state-of-the-art surface normal estimator that is robust to noise, outliers and point density variation, preserves sharp features through anisotropic kernels and equivariance through a local quaternion-based spatial transformer. Contrary to previous deep learning methods, the proposed approach does not require any hand-crafted features or preprocessing. It improves on the state-of-the-art results while being more than two orders of magnitude faster and more parameter efficient.}, bibtype = {article}, author = {Lenssen, Jan Eric and Osendorfer, Christian and Masci, Jonathan}, doi = {10.1109/CVPR42600.2020.01126}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Graph Networks with Spectral Message Passing}, type = {article}, year = {2020}, id = {be2e6317-9b5e-374e-bd66-8f22ed255afd}, created = {2021-07-12T09:25:31.914Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T09:25:52.378Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {70eb910f-9399-46d8-a4d0-ade5435237b7}, private_publication = {false}, bibtype = {article}, author = {Stachenfeld, Kimberly L and Godwin, Jonathan and Battaglia, Peter} }
@article{ title = {Principal Neighbourhood Aggregation for Graph Nets}, type = {article}, year = {2020}, websites = {http://arxiv.org/abs/2004.05718}, id = {f7c636c9-1fd8-3cdd-9ff7-bf8432c73e9b}, created = {2021-07-12T10:19:36.476Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T10:20:06.824Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {20ccb950-fef9-4ee1-800c-a60ba9f1df16}, private_publication = {false}, abstract = {Graph Neural Networks (GNNs) have been shown to be effective models for different predictive tasks on graph-structured data. Recent work on their expressive power has focused on isomorphism tasks and countable feature spaces. We extend this theoretical framework to include continuous features—which occur regularly in real-world input domains and within the hidden layers of GNNs—and we demonstrate the requirement for multiple aggregation functions in this context. Accordingly, we propose Principal Neighbourhood Aggregation (PNA), a novel architecture combining multiple aggregators with degree-scalers (which generalize the sum aggregator). Finally, we compare the capacity of different models to capture and exploit the graph structure via a novel benchmark containing multiple tasks taken from classical graph theory, alongside existing benchmarks from real-world domains, all of which demonstrate the strength of our model. With this work, we hope to steer some of the GNN research towards new aggregation methods which we believe are essential in the search for powerful and robust models.}, bibtype = {article}, author = {Corso, Gabriele and Cavalleri, Luca and Beaini, Dominique and Liò, Pietro and Veličković, Petar}, number = {NeurIPS} }
@article{ title = {Differentiable Graph Module (DGM) for Graph Convolutional Networks}, type = {article}, year = {2020}, websites = {http://arxiv.org/abs/2002.04999}, id = {a9ddc59c-8bf7-3541-adaf-a0ea5eaeac00}, created = {2021-07-12T10:19:36.611Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T10:19:54.045Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {20ccb950-fef9-4ee1-800c-a60ba9f1df16}, private_publication = {false}, abstract = {Graph deep learning has recently emerged as a powerful ML concept allowing to generalize successful deep neural architectures to non-Euclidean structured data. Such methods have shown promising results on a broad spectrum of applications ranging from social science, biomedicine, and particle physics to computer vision, graphics, and chemistry. One of the limitations of the majority of the current graph neural network architectures is that they are often restricted to the transductive setting and rely on the assumption that the underlying graph is known and fixed. In many settings, such as those arising in medical and healthcare applications, this assumption is not necessarily true since the graph may be noisy, partially- or even completely unknown, and one is thus interested in inferring it from the data. This is especially important in inductive settings when dealing with nodes not present in the graph at training time. Furthermore, sometimes such a graph itself may convey insights that are even more important than the downstream task. In this paper, we introduce Differentiable Graph Module (DGM), a learnable function predicting the edge probability in the graph relevant for the task, that can be combined with convolutional graph neural network layers and trained in an end-to-end fashion. We provide an extensive evaluation of applications from the domains of healthcare (disease prediction), brain imaging (gender and age prediction), computer graphics (3D point cloud segmentation), and computer vision (zero-shot learning). We show that our model provides a significant improvement over baselines both in transductive and inductive settings and achieves state-of-the-art results.}, bibtype = {article}, author = {Kazi, Anees and Cosmo, Luca and Navab, Nassir and Bronstein, Michael} }
@article{ title = {Pointer Graph Networks}, type = {article}, year = {2020}, websites = {http://arxiv.org/abs/2006.06380}, id = {fc0b796e-2ee2-3172-b28f-539f3d1a7722}, created = {2021-07-12T10:19:36.629Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T10:19:54.941Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {20ccb950-fef9-4ee1-800c-a60ba9f1df16}, private_publication = {false}, abstract = {Graph neural networks (GNNs) are typically applied to static graphs that are assumed to be known upfront. This static input structure is often informed purely by insight of the machine learning practitioner, and might not be optimal for the actual task the GNN is solving. In absence of reliable domain expertise, one might resort to inferring the latent graph structure, which is often difficult due to the vast search space of possible graphs. Here we introduce Pointer Graph Networks (PGNs) which augment sets or graphs with additional inferred edges for improved model generalisation ability. PGNs allow each node to dynamically point to another node, followed by message passing over these pointers. The sparsity of this adaptable graph structure makes learning tractable while still being sufficiently expressive to simulate complex algorithms. Critically, the pointing mechanism is directly supervised to model long-term sequences of operations on classical data structures, incorporating useful structural inductive biases from theoretical computer science. Qualitatively, we demonstrate that PGNs can learn parallelisable variants of pointer-based data structures, namely disjoint set unions and link/cut trees. PGNs generalise out-of-distribution to 5x larger test inputs on dynamic graph connectivity tasks, outperforming unrestricted GNNs and Deep Sets.}, bibtype = {article}, author = {Veličković, Petar and Buesing, Lars and Overlan, Matthew C. and Pascanu, Razvan and Vinyals, Oriol and Blundell, Charles}, number = {NeurIPS} }
@article{ title = {Improving Graph Neural Network Expressivity via Subgraph Isomorphism Counting}, type = {article}, year = {2020}, pages = {1-29}, websites = {http://arxiv.org/abs/2006.09252}, id = {de09a548-ec12-3b75-9eee-f57876203209}, created = {2021-07-12T10:19:36.710Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T10:20:01.629Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {20ccb950-fef9-4ee1-800c-a60ba9f1df16}, private_publication = {false}, abstract = {While Graph Neural Networks (GNNs) have achieved remarkable results in a variety of applications, recent studies exposed important shortcomings in their ability to capture the structure of the underlying graph. It has been shown that the expressive power of standard GNNs is bounded by the Weisfeiler-Leman (WL) graph isomorphism test, from which they inherit proven limitations such as the inability to detect and count graph substructures. On the other hand, there is significant empirical evidence, e.g. in network science and bioinformatics, that substructures are often informative for downstream tasks, suggesting that it is desirable to design GNNs capable of leveraging this important source of information. To this end, we propose a novel topologically-aware message passing scheme based on substructure encoding. We show that our architecture allows incorporating domain-specific inductive biases and that it is strictly more expressive than the WL test. Importantly, in contrast to recent works on the expressivity of GNNs, we do not attempt to adhere to the WL hierarchy; this allows us to retain multiple attractive properties of standard GNNs such as locality and linear network complexity, while being able to disambiguate even hard instances of graph isomorphism. We extensively evaluate our method on graph classification and regression tasks and show state-of-the-art results on multiple datasets including molecular graphs and social networks.}, bibtype = {article}, author = {Bouritsas, Giorgos and Frasca, Fabrizio and Zafeiriou, Stefanos and Bronstein, Michael M.} }
@article{ title = {Directional Graph Networks}, type = {article}, year = {2020}, websites = {http://arxiv.org/abs/2010.02863}, id = {119e96ec-e5a7-38f9-8b6a-1103862d53ff}, created = {2021-07-12T10:19:36.728Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T10:20:03.945Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {20ccb950-fef9-4ee1-800c-a60ba9f1df16}, private_publication = {false}, abstract = {The lack of anisotropic kernels in graph neural networks (GNNs) strongly limits their expressiveness, contributing to well-known issues such as over-smoothing. To overcome this limitation, we propose the first globally consistent anisotropic kernels for GNNs, allowing for graph convolutions that are defined according to topologicaly-derived directional flows. First, by defining a vector field in the graph, we develop a method of applying directional derivatives and smoothing by projecting node-specific messages into the field. Then, we propose the use of the Laplacian eigenvectors as such vector field. We show that the method generalizes CNNs on an $n$-dimensional grid and is provably more discriminative than standard GNNs regarding the Weisfeiler-Lehman 1-WL test. We evaluate our method on different standard benchmarks and see a relative error reduction of 8% on the CIFAR10 graph dataset and 11% to 32% on the molecular ZINC dataset, and a relative increase in precision of 1.6% on the MolPCBA dataset. An important outcome of this work is that it enables graph networks to embed directions in an unsupervised way, thus allowing a better representation of the anisotropic features in different physical or biological problems.}, bibtype = {article}, author = {Beaini, Dominique and Passaro, Saro and Létourneau, Vincent and Hamilton, William L. and Corso, Gabriele and Liò, Pietro} }
@article{ title = {Solving Mixed Integer Programs Using Neural Networks}, type = {article}, year = {2020}, keywords = {deep learning,discrete optimization,first version december 2020,graph networks,history,mixed integer programming}, pages = {1-57}, websites = {http://arxiv.org/abs/2012.13349}, id = {4363b46f-6a12-37f4-b0bd-5ca25d3e34b0}, created = {2021-07-12T10:19:36.836Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T10:19:45.945Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {20ccb950-fef9-4ee1-800c-a60ba9f1df16}, private_publication = {false}, abstract = {Mixed Integer Programming (MIP) solvers rely on an array of sophisticated heuristics developed with decades of research to solve large-scale MIP instances encountered in practice. Machine learning offers to automatically construct better heuristics from data by exploiting shared structure among instances in the data. This paper applies learning to the two key sub-tasks of a MIP solver, generating a high-quality joint variable assignment, and bounding the gap in objective value between that assignment and an optimal one. Our approach constructs two corresponding neural network-based components, Neural Diving and Neural Branching, to use in a base MIP solver such as SCIP. Neural Diving learns a deep neural network to generate multiple partial assignments for its integer variables, and the resulting smaller MIPs for un-assigned variables are solved with SCIP to construct high quality joint assignments. Neural Branching learns a deep neural network to make variable selection decisions in branch-and-bound to bound the objective value gap with a small tree. This is done by imitating a new variant of Full Strong Branching we propose that scales to large instances using GPUs. We evaluate our approach on six diverse real-world datasets, including two Google production datasets and MIPLIB, by training separate neural networks on each. Most instances in all the datasets combined have $10^3-10^6$ variables and constraints after presolve, which is significantly larger than previous learning approaches. Comparing solvers with respect to primal-dual gap averaged over a held-out set of instances, the learning-augmented SCIP is 2x to 10x better on all datasets except one on which it is $10^5$x better, at large time limits. To the best of our knowledge, ours is the first learning approach to demonstrate such large improvements over SCIP on both large-scale real-world application datasets and MIPLIB.}, bibtype = {article}, author = {Nair, Vinod and Bartunov, Sergey and Gimeno, Felix and von Glehn, Ingrid and Lichocki, Pawel and Lobov, Ivan and O'Donoghue, Brendan and Sonnerat, Nicolas and Tjandraatmadja, Christian and Wang, Pengming and Addanki, Ravichandra and Hapuarachchi, Tharindi and Keck, Thomas and Keeling, James and Kohli, Pushmeet and Ktena, Ira and Li, Yujia and Vinyals, Oriol and Zwols, Yori} }
@article{ title = {Batch Normalization Biases Residual Blocks Towards the Identity Function in Deep Networks}, type = {article}, year = {2020}, websites = {http://arxiv.org/abs/2002.10444}, id = {7de61b3e-1ab4-3ff0-938a-5cb9bf0fac79}, created = {2021-07-12T14:15:34.907Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:17:03.851Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {Batch normalization dramatically increases the largest trainable depth of residual networks, and this benefit has been crucial to the empirical success of deep residual networks on a wide range of benchmarks. We show that this key benefit arises because, at initialization, batch normalization downscales the residual branch relative to the skip connection, by a normalizing factor on the order of the square root of the network depth. This ensures that, early in training, the function computed by normalized residual blocks in deep networks is close to the identity function (on average). We use this insight to develop a simple initialization scheme that can train deep residual networks without normalization. We also provide a detailed empirical study of residual networks, which clarifies that, although batch normalized networks can be trained with larger learning rates, this effect is only beneficial in specific compute regimes, and has minimal benefits when the batch size is small.}, bibtype = {article}, author = {De, Soham and Smith, Samuel L.}, number = {NeurIPS} }
@article{ title = {Stabilizing transformers for reinforcement learning}, type = {article}, year = {2020}, pages = {7443-7454}, volume = {PartF16814}, id = {4814fa84-51b1-3e55-be5c-ecf1c98cc473}, created = {2021-07-12T14:15:35.053Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:17:04.747Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {Owing to their ability to both effectively integrate information over long time horizons and scale to massive amounts of data, self-attention architectures have recently shown breakthrough success in natural language processing (NLP). Harnessing the transformer’s ability to process long time horizons of information could provide a similar performance boost in partially observable reinforcement learning (RL) domains, but the large-scale transformers used in NLP have yet to be successfully applied to the RL setting. In this work we demonstrate that the standard transformer architecture is difficult to optimize, which was previously observed in the supervised learning setting but becomes especially pronounced with RL objectives. We propose architectural modifications that substantially improve the stability and learning speed of the original Transformer and XL variant. The proposed architecture, the Gated Transformer-XL (GTrXL), surpasses LSTMs on challenging memory environments and achieves state-of-the-art results on the multi-task DMLab-30 benchmark suite, exceeding the performance of an external memory architecture. We show that the GTrXL has stability and performance that consistently matches or exceeds a competitive LSTM baseline, including on more reactive tasks where memory is less critical.}, bibtype = {article}, author = {Parisotto, Emilio and Song, H. Francis and Rae, Jack W. and Pascanu, Razvan and Gulcehre, Caglar and Jayakumar, Siddhant M. and Jaderberg, Max and Kaufman, Raphaël Lopez and Clark, Aidan and Noury, Seb and Botvinick, Matthew M. and Heess, Nicolas and Hadsell, Raia}, journal = {37th International Conference on Machine Learning, ICML 2020} }
@article{ title = {Loss landscapes and optimization in over-parameterized non-linear systems and neural networks}, type = {article}, year = {2020}, pages = {1-31}, websites = {http://arxiv.org/abs/2003.00307}, id = {9ec73c3b-48ab-3851-ac46-854422477bba}, created = {2021-07-12T14:15:35.941Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:16:54.532Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {The success of deep learning is due, to a large extent, to the remarkable effectiveness of gradient-based optimization methods applied to large neural networks. The purpose of this work is to propose a modern view and a general mathematical framework for loss landscapes and efficient optimization in over-parameterized machine learning models and systems of non-linear equations, a setting that includes over-parameterized deep neural networks. Our starting observation is that optimization problems corresponding to such systems are generally not convex, even locally. We argue that instead they satisfy PL$^*$, a variant of the Polyak-Lojasiewicz condition on most (but not all) of the parameter space, which guarantees both the existence of solutions and efficient optimization by (stochastic) gradient descent (SGD/GD). The PL$^*$ condition of these systems is closely related to the condition number of the tangent kernel associated to a non-linear system showing how a PL$^*$-based non-linear theory parallels classical analyses of over-parameterized linear equations. We show that wide neural networks satisfy the PL$^*$ condition, which explains the (S)GD convergence to a global minimum. Finally we propose a relaxation of the PL$^*$ condition applicable to "almost" over-parameterized systems.}, bibtype = {article}, author = {Liu, Chaoyue and Zhu, Libin and Belkin, Mikhail} }
@article{ title = {Implicit Gradient Regularization}, type = {article}, year = {2020}, pages = {1-25}, websites = {http://arxiv.org/abs/2009.11162}, id = {a26612d5-45ce-335c-a07d-240eb2c50aa6}, created = {2021-07-12T14:15:36.073Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:16:59.452Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {Gradient descent can be surprisingly good at optimizing deep neural networks without overfitting and without explicit regularization. We find that the discrete steps of gradient descent implicitly regularize models by penalizing gradient descent trajectories that have large loss gradients. We call this Implicit Gradient Regularization (IGR) and we use backward error analysis to calculate the size of this regularization. We confirm empirically that implicit gradient regularization biases gradient descent toward flat minima, where test errors are small and solutions are robust to noisy parameter perturbations. Furthermore, we demonstrate that the implicit gradient regularization term can be used as an explicit regularizer, allowing us to control this gradient regularization directly. More broadly, our work indicates that backward error analysis is a useful theoretical approach to the perennial question of how learning rate, model size, and parameter regularization interact to determine the properties of overparameterized models optimized with gradient descent.}, bibtype = {article}, author = {Barrett, David G. T. and Dherin, Benoit} }
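The abstract above notes that the implicit gradient-norm term can also be applied as an explicit regularizer. A minimal numpy sketch of that idea follows, assuming a toy linear least-squares loss so the penalty gradient has a closed form; the data, learning rate, and lambda value are made-up choices for illustration, not the paper's setup.

import numpy as np

def loss(w, X, y):
    r = X @ w - y
    return 0.5 * np.mean(r ** 2)

def grad(w, X, y):
    # Gradient of the mean-squared-error loss.
    return X.T @ (X @ w - y) / len(y)

def grad_norm_penalty_grad(w, X, y):
    # d/dw ||grad||^2 = 2 * H @ grad for a quadratic loss with Hessian H = X^T X / n.
    H = X.T @ X / len(y)
    return 2.0 * H @ grad(w, X, y)

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 10))
y = X @ rng.standard_normal(10) + 0.1 * rng.standard_normal(200)

w, lr, lam = np.zeros(10), 0.1, 0.01
for _ in range(500):
    # Gradient descent on loss + lam * ||grad||^2 (the explicit form of the implicit term).
    w -= lr * (grad(w, X, y) + lam * grad_norm_penalty_grad(w, X, y))

print("final loss:", loss(w, X, y), "| final gradient norm:", np.linalg.norm(grad(w, X, y)))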
@article{ title = {Machine Learning for Healthcare}, type = {article}, year = {2020}, id = {b10004be-e3e6-374b-ae59-378b0b309fa5}, created = {2021-07-19T10:48:23.890Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-19T10:48:35.976Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {40817151-8323-4487-8c1e-fe067729f714}, private_publication = {false}, abstract = {Machine learning can be used to make sense of healthcare data. Probabilistic machine learning models help provide a complete picture of observed data in healthcare. In this review, we examine how probabilistic machine learning can advance healthcare. We consider challenges in the predictive model building pipeline where probabilistic models can be beneficial, including calibration and missing data. Beyond predictive models, we also investigate the utility of probabilistic machine learning models in phenotyping, in generative models for clinical use cases, and in reinforcement learning.}, bibtype = {article}, author = {Agrawal, Rashmi and Chatterjee, Jyotir Moy and Kumar, Abhishek and Rathore, Pramod Singh and Le, Dac-Nhuong}, doi = {10.1201/9780429330131}, journal = {Machine Learning for Healthcare} }
@article{ title = {Introduction to Deep Learning}, type = {article}, year = {2020}, pages = {1-11}, id = {95cd0b66-b2ce-3156-b338-145f7bf39e1d}, created = {2021-07-19T10:48:23.913Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-19T10:49:13.783Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {40817151-8323-4487-8c1e-fe067729f714}, private_publication = {false}, abstract = {Machine learning systems, with shallow or deep architectures, have ability to learn and improve with experience. The process of machine learning begins with the raw data which is used for extracting useful information that helps in decision-making. The primary aim is to allow a machine to learn useful information just like humans do. At abstract level, machine learning can be carried out using following approaches.}, bibtype = {article}, author = {Wani, M. Arif and Bhat, Farooq Ahmad and Afzal, Saduf and Khan, Asif Iqbal}, doi = {10.1007/978-981-13-6794-6_1} }
@article{ title = {Exploring Self-attention for Image Recognition}, type = {article}, year = {2020}, pages = {10073-10082}, websites = {https://arxiv.org/abs/2004.13621v1}, month = {4}, publisher = {IEEE Computer Society}, day = {28}, id = {f6ce209a-c54e-3c64-975a-863ea51241a2}, created = {2021-07-19T15:40:53.471Z}, accessed = {2021-07-19}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-19T15:40:56.315Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {5cd4d7ce-c2fb-4e91-ab80-35deeb123df5}, private_publication = {false}, abstract = {Recent work has shown that self-attention can serve as a basic building block for image recognition models. We explore variations of self-attention and assess their effectiveness for image recognition. We consider two forms of self-attention. One is pairwise self-attention, which generalizes standard dot-product attention and is fundamentally a set operator. The other is patchwise self-attention, which is strictly more powerful than convolution. Our pairwise self-attention networks match or outperform their convolutional counterparts, and the patchwise models substantially outperform the convolutional baselines. We also conduct experiments that probe the robustness of learned representations and conclude that self-attention networks may have significant benefits in terms of robustness and generalization.}, bibtype = {article}, author = {Zhao, Hengshuang and Jia, Jiaya and Koltun, Vladlen}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@book{ title = {Spectral Graph Attention Network}, type = {book}, year = {2020}, source = {DLG-KDD '21: The Sixth International Workshop on Deep Learning on Graphs: Methods and Applications, August 14-18, 2021, Virtual Conference}, keywords = {deep learning,graph neural networks,graph representation learning,graph spectral analysis}, volume = {1}, issue = {1}, websites = {http://arxiv.org/abs/2003.07450}, publisher = {Association for Computing Machinery}, id = {5417ac11-5ae6-388f-a421-d997346496bb}, created = {2021-08-04T09:51:19.886Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-04T09:51:28.937Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Variants of Graph Neural Networks (GNNs) for representation learning have been proposed recently and achieved fruitful results in various fields. Among them, graph attention networks (GATs) first employ a self-attention strategy to learn attention weights for each edge in the spatial domain. However, learning attentions over edges only captures the local information of graphs and greatly increases the number of parameters. In this paper, we first introduce attentions in the spectral domain of graphs. Accordingly, we present the Spectral Graph Attention Network (SpGAT), which learns representations for different frequency components using weighted filters and graph wavelet bases. In this way, SpGAT can better capture global patterns of graphs in an efficient manner with far fewer learned parameters than GAT. We thoroughly evaluate the performance of SpGAT in the semi-supervised node classification task and verify the effectiveness of the learned attentions in the spectral domain.}, bibtype = {book}, author = {Chang, Heng and Rong, Yu and Xu, Tingyang and Huang, Wenbing and Sojoudi, Somayeh and Huang, Junzhou and Zhu, Wenwu} }
@article{ title = {A Hierarchical Graph Network for 3D Object Detection on Point Clouds}, type = {article}, year = {2020}, pages = {389-398}, id = {0b000570-27ac-3757-937f-b7c4b9c42876}, created = {2021-08-04T13:05:07.916Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-21T11:27:41.730Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {3D object detection on point clouds finds many applications. However, most known point cloud object detection methods did not adequately accommodate the characteristics (e.g., sparsity) of point clouds, and thus some key semantic information (e.g., shape information) is not well captured. In this paper, we propose a new graph convolution (GConv) based hierarchical graph network (HGNet) for 3D object detection, which processes raw point clouds directly to predict 3D bounding boxes. HGNet effectively captures the relationship of the points and utilizes the multi-level semantics for object detection. Specially, we propose a novel shape-attentive GConv (SA-GConv) to capture the local shape features, by modelling the relative geometric positions of points to describe object shapes. An SA-GConv based U-shape network captures the multi-level features, which are mapped into an identical feature space by an improved voting module and then further utilized to generate proposals. Next, a new GConv based Proposal Reasoning Module reasons on the proposals considering the global scene semantics, and the bounding boxes are then predicted. Consequently, our new framework outperforms state-of-the-art methods on two large-scale point cloud datasets, by ~4% mean average precision (mAP) on SUN RGB-D and by ~3% mAP on ScanNet-V2.}, bibtype = {article}, author = {Chen, Jintai and Lei, Biwen and Song, Qingyu and Ying, Haochao and Chen, Danny Z. and Wu, Jian}, doi = {10.1109/CVPR42600.2020.00047}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Deep Learning for 3D Point Clouds: A Survey}, type = {article}, year = {2020}, pages = {1-1}, volume = {8828}, id = {abcfde07-c42b-3225-ba45-d1359adc7b51}, created = {2021-08-04T13:05:07.916Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-02T06:33:45.266Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Point cloud learning has lately attracted increasing attention due to its wide applications in many areas, such as computer vision, autonomous driving, and robotics. As a dominating technique in AI, deep learning has been successfully used to solve various 2D vision problems. However, deep learning on point clouds is still in its infancy due to the unique challenges faced by the processing of point clouds with deep neural networks. Recently, deep learning on point clouds has become even thriving, with numerous methods being proposed to address different problems in this area. To stimulate future research, this paper presents a comprehensive review of recent progress in deep learning methods for point clouds. It covers three major tasks, including 3D shape classification, 3D object detection and tracking, and 3D point cloud segmentation. It also presents comparative results on several publicly available datasets, together with insightful observations and inspiring future research directions.}, bibtype = {article}, author = {Guo, Yulan and Wang, Hanyun and Hu, Qingyong and Liu, Hao and Liu, Li and Bennamoun, Mohammed}, doi = {10.1109/tpami.2020.3005434}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, number = {c} }
@article{ title = {FGCN: Deep feature-based graph convolutional network for semantic segmentation of urban 3D point clouds}, type = {article}, year = {2020}, pages = {778-787}, volume = {2020-June}, id = {f3030c55-31c3-3871-ba40-b2d6b47475e6}, created = {2021-08-04T13:05:08.029Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-21T13:25:20.874Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {AliKhan2020}, private_publication = {false}, abstract = {Directly processing 3D point clouds using convolutional neural networks (CNNs) is a highly challenging task, primarily due to the lack of an explicit neighborhood relationship between points in 3D space. Several researchers have tried to cope with this problem using a preprocessing step of voxelization. Although this allows existing CNN architectures to be applied to 3D point clouds, in addition to computational and memory constraints it introduces quantization artifacts that limit accurate inference of the underlying object's structure in the illuminated scene. In this paper, we introduce a more stable and effective end-to-end architecture to classify raw 3D point clouds from indoor and outdoor scenes. In the proposed methodology, we encode the spatial arrangement of neighbouring 3D points inside an undirected symmetrical graph, which is passed along with features extracted from a 2D CNN to a Graph Convolutional Network (GCN) that contains three layers of localized graph convolutions to generate a complete segmentation map. The proposed network achieves results on par with or better than the state of the art on tasks like semantic scene parsing, part segmentation and urban classification on three standard benchmark datasets.}, bibtype = {article}, author = {Ali Khan, Saqib and Shi, Yilei and Shahzad, Muhammad and Xiang Zhu, Xiao}, doi = {10.1109/CVPRW50498.2020.00107}, journal = {IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops} }
@article{ title = {Directed Graph Convolutional Network}, type = {article}, year = {2020}, keywords = {graph neural networks,proximity,semi-supervised learning}, websites = {http://arxiv.org/abs/2004.13970}, id = {71ac82f9-ad57-324a-b97c-bdf7307e6beb}, created = {2021-08-04T13:05:08.044Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:20.823Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Graph Convolutional Networks (GCNs) have been widely used due to their outstanding performance in processing graph-structured data. However, the undirected graphs limit their application scope. In this paper, we extend spectral-based graph convolution to directed graphs by using first- and second-order proximity, which can not only retain the connection properties of the directed graph, but also expand the receptive field of the convolution operation. A new GCN model, called DGCN, is then designed to learn representations on the directed graph, leveraging both the first- and second-order proximity information. We empirically show the fact that GCNs working only with DGCNs can encode more useful information from graph and help achieve better performance when generalized to other models. Moreover, extensive experiments on citation networks and co-purchase datasets demonstrate the superiority of our model against the state-of-the-art methods.}, bibtype = {article}, author = {Tong, Zekun and Liang, Yuxuan and Sun, Changsheng and Rosenblum, David S. and Lim, Andrew} }
@article{ title = {Scalable Graph Convolutional Networks with Fast Localized Spectral Filter for Directed Graphs}, type = {article}, year = {2020}, keywords = {Graph convolutional networks,directed graph Laplacian,semi-supervised learning,spectral graph theory}, pages = {105634-105644}, volume = {8}, id = {a628a487-80af-3403-a898-a0c9993a6e70}, created = {2021-08-04T13:05:08.165Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-04T13:05:46.160Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Graph convolutional neural networks (GCNNs) have emerged in recent years to handle graph-structured data. Most existing GCNNs are either spatial approaches working on the neighborhood of each node, or spectral approaches based on the graph Laplacian. Compared with spatial-based GCNNs, spectral-based GCNNs are capable of highly exploiting graph structure information, but they typically treat graphs as undirected. In practice, there are many scenarios where the graph structures are directed, such as social networks and citation networks. Treating such graphs as undirected may lose important information that is helpful for graph learning tasks. This motivates us to construct a spectral-based GCNN for directed graphs. In this paper, we propose a scalable graph convolutional neural network with fast localized convolution operators derived from the directed graph Laplacian, called fast directed graph convolutional network (FDGCN). FDGCN can work directly on directed graphs and can scale to large graphs, as the convolution operation is linear in the number of edges. Furthermore, we find that FDGCN can unify the graph convolutional network (GCN), a classic spectral-based GCNN. The mechanism of FDGCN is thoroughly analyzed from a spatial aggregation point of view. Since previous work has confirmed that accounting for graph uncertainty can considerably improve GCN, the proposed FDGCN is further enhanced through extra training epochs on random graphs generated by the mixed membership stochastic block model (MMSBM). Experiments are conducted on semi-supervised node classification tasks to evaluate the performance of FDGCN. Results show that our model can outperform or match state-of-the-art models in most cases.}, bibtype = {article}, author = {Li, Chensheng and Qin, Xiaowei and Xu, Xiaodong and Yang, Dujia and Wei, Guo}, doi = {10.1109/ACCESS.2020.2999520}, journal = {IEEE Access} }
@article{ title = {Point clouds learning with attention-based graph convolution networks}, type = {article}, year = {2020}, keywords = {Attention mechanism,Graph network,Point clouds,Semantic segmentation}, pages = {245-255}, volume = {402}, websites = {https://doi.org/10.1016/j.neucom.2020.03.086}, publisher = {Elsevier B.V.}, id = {c0eb66ba-d187-33cb-95f4-e4a1af796f36}, created = {2021-08-04T13:05:08.170Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:23.820Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Point clouds, as a kind of 3D objects representation, are the most primitive outputs obtained by 3D sensors. Unlike 2D images, point clouds are disordered and unstructured. Hence the classification techniques such as the convolution neural network are not applicable to point cloud analysis directly. To solve this problem, we propose a novel network to extract point clouds feature, named attention-based graph convolutional network (AGCN). Taking the learning process as a message propagation between adjacent points, we specifically introduce attention mechanism to construct a point attention layer for analyzing the relationship between local points feature. The object classification is implemented by stacking multiple layers of point attention layer. In addition, the proposed network is extended to an attention-based encoder-decoder structure for segmentation tasks. We also introduce an additional global graph structure network to compensate for the relative location information of the individual points in the graph structure network. Experimental results show that our network has lower computational complexity and faster convergence speed. Compared with existing methods, the proposed network can achieve comparable performance in classification and segmentation tasks.}, bibtype = {article}, author = {Xie, Zhuyang and Chen, Junzhou and Peng, Bo}, doi = {10.1016/j.neucom.2020.03.086}, journal = {Neurocomputing} }
@article{ title = {Building extraction from airborne multi-spectral LiDAR point clouds based on graph geometric moments convolutional neural networks}, type = {article}, year = {2020}, keywords = {Airborne multi-spectral LiDAR point clouds,Building extraction,Convolutional neural networks,Graph geometric moments}, pages = {1-24}, volume = {12}, id = {668f9d51-87c7-322b-8692-2c38fd904a94}, created = {2021-08-04T13:05:08.277Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:22.431Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Building extraction has attracted much attentions for decades as a prerequisite for many applications and is still a challenging topic in the field of photogrammetry and remote sensing. Due to the lack of spectral information, massive data processing, and approach universality, building extraction from point clouds is still a thorny and challenging problem. In this paper, a novel deep-learning-based framework is proposed for building extraction from point cloud data. Specifically, first, a sample generation method is proposed to split the raw preprocessed multi-spectral light detection and ranging (LiDAR) data into numerous samples, which are directly fed into convolutional neural networks and completely cover the original inputs. Then, a graph geometric moments (GGM) convolution is proposed to encode the local geometric structure of point sets. In addition, a hierarchical architecture equipped with GGM convolution, called GGM convolutional neural networks, is proposed to train and recognize building points. Finally, the test scenes with varying sizes can be fed into the framework and obtain a point-wise extraction result. We evaluate the proposed framework and methods on the airborne multi-spectral LiDAR point clouds collected by an Optech Titan system. Compared with previous state-of-the-art networks, which are designed for point cloud segmentation, our method achieves the best performance with a correctness of 95.1%, a completeness of 93.7%, an F-measure of 94.4%, and an intersection over union (IoU) of 89.5% on two test areas. The experimental results confirm the effectiveness and efficiency of the proposed framework and methods.}, bibtype = {article}, author = {Li, Dilong and Shen, Xin and Yu, Yongtao and Guan, Haiyan and Li, Jonathan and Zhang, Guo and Li, Deren}, doi = {10.3390/rs12193186}, journal = {Remote Sensing}, number = {19} }
@article{ title = {Spherical Kernel for Efficient Graph Convolution on 3D Point Clouds}, type = {article}, year = {2020}, pages = {1-1}, volume = {X}, publisher = {IEEE}, id = {d12a4b52-c988-3014-ac0a-d26351b53b61}, created = {2021-08-04T13:05:08.282Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-21T11:27:41.874Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {We propose a spherical kernel for efficient graph convolution of 3D point clouds. Our metric-based kernels systematically quantize the local 3D space to identify distinctive geometric relationships in the data. Similar to the regular grid CNN kernels, the spherical kernel maintains translation-invariance and asymmetry properties, where the former guarantees weight sharing among similar local structures in the data and the latter facilitates fine geometric learning. The proposed kernel is applied to graph neural networks without edge-dependent filter generation, making it computationally attractive for large point clouds. In our graph networks, each vertex is associated with a single point location and edges connect the neighborhood points within a defined range. The graph gets coarsened in the network with farthest point sampling. Analogous to the standard CNNs, we define pooling and unpooling operations for our network. We demonstrate the effectiveness of the proposed spherical kernel with graph neural networks for point cloud classification and semantic segmentation using ModelNet, ShapeNet, RueMonge2014, ScanNet and S3DIS datasets. The source code and the trained models can be downloaded from https://github.com/hlei-ziyan/SPH3D-GCN.}, bibtype = {article}, author = {Lei, Huan and Akhtar, Naveed and Mian, Ajmal}, doi = {10.1109/tpami.2020.2983410}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, number = {X} }
@article{ title = {Graph neural networks: A review of methods and applications}, type = {article}, year = {2020}, pages = {57-81}, volume = {1}, websites = {https://doi.org/10.1016/j.aiopen.2021.01.001}, publisher = {Elsevier Ltd}, id = {39388a6c-0465-3da0-92e1-c6fa1b38208f}, created = {2021-08-17T08:06:02.964Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-01T15:59:25.056Z}, read = {true}, starred = {true}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Zhou2020}, folder_uuids = {dbd9a6d6-88f6-4a62-9acd-402fb473145a}, private_publication = {false}, abstract = {Lots of learning tasks require dealing with graph data which contains rich relation information among elements. Modeling physics systems, learning molecular fingerprints, predicting protein interface, and classifying diseases demand a model to learn from graph inputs. In other domains such as learning from non-structural data like texts and images, reasoning on extracted structures (like the dependency trees of sentences and the scene graphs of images) is an important research topic which also needs graph reasoning models. Graph neural networks (GNNs) are neural models that capture the dependence of graphs via message passing between the nodes of graphs. In recent years, variants of GNNs such as graph convolutional network (GCN), graph attention network (GAT), graph recurrent network (GRN) have demonstrated ground-breaking performances on many deep learning tasks. In this survey, we propose a general design pipeline for GNN models and discuss the variants of each component, systematically categorize the applications, and propose four open problems for future research.}, bibtype = {article}, author = {Zhou, Jie and Cui, Ganqu and Hu, Shengding and Zhang, Zhengyan and Yang, Cheng and Liu, Zhiyuan and Wang, Lifeng and Li, Changcheng and Sun, Maosong}, doi = {10.1016/j.aiopen.2021.01.001}, journal = {AI Open}, number = {September 2020} }
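For readers new to the message-passing formulation these surveys cover, here is a minimal numpy sketch of a single graph-convolution step (add self-loops, symmetrically normalize the adjacency, apply a shared linear map, then a ReLU). The four-node graph and random weights are invented for the example and are not taken from any of the surveyed models.

import numpy as np

A = np.array([[0, 1, 1, 0],
              [1, 0, 1, 0],
              [1, 1, 0, 1],
              [0, 0, 1, 0]], dtype=float)   # adjacency matrix of a small graph
X = np.random.rand(4, 8)                    # one 8-dimensional feature vector per node
W = np.random.randn(8, 16) / np.sqrt(8)     # shared weight matrix

A_hat = A + np.eye(4)                            # add self-loops
D_inv_sqrt = np.diag(A_hat.sum(axis=1) ** -0.5)
A_norm = D_inv_sqrt @ A_hat @ D_inv_sqrt         # symmetric normalization

H = np.maximum(A_norm @ X @ W, 0.0)              # aggregate neighbors, transform, ReLU
print(H.shape)                                   # (4, 16): updated node embeddings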
@article{ title = {An Experimental Study of the Transferability of Spectral Graph Networks}, type = {article}, year = {2020}, keywords = {benchmarking,graph networks,spectral convolution,transferability}, websites = {http://arxiv.org/abs/2012.10258}, id = {8e46a5b6-852f-36c1-89a2-1e6456f8f974}, created = {2021-08-20T06:55:38.317Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-21T13:25:20.526Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Nilsson2020}, private_publication = {false}, abstract = {Spectral graph convolutional networks are generalizations of standard convolutional networks for graph-structured data using the Laplacian operator. A common misconception is the instability of spectral filters, i.e. the supposed impossibility of transferring spectral filters between graphs of variable size and topology. This misbelief has limited the development of spectral networks for multi-graph tasks in favor of spatial graph networks. However, recent works have proved the stability of spectral filters under graph perturbation. Our work complements and further emphasizes the high quality of spectral transferability by benchmarking spectral graph networks on tasks involving graphs of different size and connectivity. Numerical experiments exhibit favorable performance on graph regression, graph classification, and node classification problems on two graph benchmarks. The implementation of our experiments is available on GitHub for reproducibility.}, bibtype = {article}, author = {Nilsson, Axel and Bresson, Xavier} }
@article{ title = {Benchmarking Graph Neural Networks}, type = {article}, year = {2020}, websites = {http://arxiv.org/abs/2003.00982}, id = {9485c177-1920-3e0e-94aa-97ca55245614}, created = {2021-08-20T08:40:31.821Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-20T08:40:34.991Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Graph neural networks (GNNs) have become the standard toolkit for analyzing and learning from data on graphs. As the field grows, it becomes critical to identify key architectures and validate new ideas that generalize to larger, more complex datasets. Unfortunately, it has been increasingly difficult to gauge the effectiveness of new models in the absence of a standardized benchmark with consistent experimental settings. In this paper, we introduce a reproducible GNN benchmarking framework, with the facility for researchers to add new models conveniently for arbitrary datasets. We demonstrate the usefulness of our framework by presenting a principled investigation into the recent Weisfeiler-Lehman GNNs (WL-GNNs) compared to message passing-based graph convolutional networks (GCNs) for a variety of graph tasks, i.e. graph regression/classification and node/link prediction, with medium-scale datasets.}, bibtype = {article}, author = {Dwivedi, Vijay Prakash and Joshi, Chaitanya K. and Laurent, Thomas and Bengio, Yoshua and Bresson, Xavier} }
@article{ title = {How Neural Networks Extrapolate: From Feedforward to Graph Neural Networks}, type = {article}, year = {2020}, websites = {http://arxiv.org/abs/2009.11848}, id = {797fb6be-b096-3d1e-bd3a-189ab182e9c6}, created = {2021-08-20T10:21:49.091Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:22.773Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {We study how neural networks trained by gradient descent extrapolate, i.e., what they learn outside the support of the training distribution. Previous works report mixed empirical results when extrapolating with neural networks: while feedforward neural networks, a.k.a. multilayer perceptrons (MLPs), do not extrapolate well in certain simple tasks, Graph Neural Networks (GNNs) -- structured networks with MLP modules -- have shown some success in more complex tasks. Working towards a theoretical explanation, we identify conditions under which MLPs and GNNs extrapolate well. First, we quantify the observation that ReLU MLPs quickly converge to linear functions along any direction from the origin, which implies that ReLU MLPs do not extrapolate most nonlinear functions. But, they can provably learn a linear target function when the training distribution is sufficiently "diverse". Second, in connection to analyzing the successes and limitations of GNNs, these results suggest a hypothesis for which we provide theoretical and empirical evidence: the success of GNNs in extrapolating algorithmic tasks to new data (e.g., larger graphs or edge weights) relies on encoding task-specific non-linearities in the architecture or features. Our theoretical analysis builds on a connection of over-parameterized networks to the neural tangent kernel. Empirically, our theory holds across different training settings.}, bibtype = {article}, author = {Xu, Keyulu and Zhang, Mozhi and Li, Jingling and Du, Simon S. and Kawarabayashi, Ken-ichi and Jegelka, Stefanie} }
@article{ title = {Generalization and representational limits of graph neural networks}, type = {article}, year = {2020}, pages = {3377-3388}, volume = {PartF16814}, id = {c1c28a19-8490-3315-86d4-d2984b1f10fe}, created = {2021-08-20T10:21:49.097Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:22.608Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {We address two fundamental questions about graph neural networks (GNNs). First, we prove that several important graph properties, e.g., shortest/longest cycle, diameter, or certain motifs, cannot be computed by GNNs that rely entirely on local information. Such GNNs include the standard message passing models, and more powerful variants that exploit local graph structure (e.g., via relative orientation of messages, or local port ordering) to distinguish neighbors of each node. Our treatment includes a novel graph-theoretic formalism. Second, we provide the first data dependent generalization bounds for message passing GNNs. This analysis explicitly accounts for the local permutation invariance of GNNs. Our bounds are much tighter than existing VC-dimension based guarantees for GNNs, and are comparable to Rademacher bounds for recurrent neural networks.}, bibtype = {article}, author = {Garg, Vikas K. and Jegelka, Stefanie and Jaakkola, Tommi}, journal = {37th International Conference on Machine Learning, ICML 2020} }
@article{ title = {Towards Deeper Graph Neural Networks}, type = {article}, year = {2020}, keywords = {deep learning,graph neural networks,graph representation learning}, pages = {338-348}, id = {21780edf-1b8a-3c74-8733-bf47b8656e87}, created = {2021-08-20T10:21:49.207Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:20.128Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Graph neural networks have shown significant success in the field of graph representation learning. Graph convolutions perform neighborhood aggregation and represent one of the most important graph operations. Nevertheless, one layer of these neighborhood aggregation methods only consider immediate neighbors, and the performance decreases when going deeper to enable larger receptive fields. Several recent studies attribute this performance deterioration to the over-smoothing issue, which states that repeated propagation makes node representations of different classes indistinguishable. In this work, we study this observation systematically and develop new insights towards deeper graph neural networks. First, we provide a systematical analysis on this issue and argue that the key factor compromising the performance significantly is the entanglement of representation transformation and propagation in current graph convolution operations. After decoupling these two operations, deeper graph neural networks can be used to learn graph node representations from larger receptive fields. We further provide a theoretical analysis of the above observation when building very deep models, which can serve as a rigorous and gentle description of the over-smoothing issue. Based on our theoretical and empirical analysis, we propose Deep Adaptive Graph Neural Network (DAGNN) to adaptively incorporate information from large receptive fields. A set of experiments on citation, co-authorship, and co-purchase datasets have confirmed our analysis and insights and demonstrated the superiority of our proposed methods.}, bibtype = {article}, author = {Liu, Meng and Gao, Hongyang and Ji, Shuiwang}, doi = {10.1145/3394486.3403076}, journal = {Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery and Data Mining} }
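As a hedged sketch of the decoupling idea described in the abstract above: transform node features once, then run several propagation-only steps over the normalized adjacency so the receptive field grows without extra parameters. The graph, feature sizes, number of steps, and the plain averaging used in place of DAGNN's learned adaptive combination are all assumptions for illustration.

import numpy as np

def normalized_adjacency(A):
    # Self-loops plus symmetric normalization, as in standard graph convolutions.
    A_hat = A + np.eye(len(A))
    D_inv_sqrt = np.diag(A_hat.sum(axis=1) ** -0.5)
    return D_inv_sqrt @ A_hat @ D_inv_sqrt

A = np.array([[0, 1, 0, 1],
              [1, 0, 1, 0],
              [0, 1, 0, 1],
              [1, 0, 1, 0]], dtype=float)
X = np.random.rand(4, 8)
W = np.random.randn(8, 3) / np.sqrt(8)

Z = X @ W                                   # representation transformation, applied once
P = normalized_adjacency(A)
outs = [Z]
for _ in range(10):                         # 10 propagation-only steps, no new parameters
    outs.append(P @ outs[-1])
Z_final = np.mean(outs, axis=0)             # DAGNN learns this combination; here we simply average
print(Z_final.shape)                        # (4, 3)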
@article{ title = {SuperGlue: Learning Feature Matching with Graph Neural Networks}, type = {article}, year = {2020}, pages = {4937-4946}, id = {46eb6a77-5da7-39c3-b302-332c8dcd8769}, created = {2021-08-20T10:21:49.208Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:14.959Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {This paper introduces SuperGlue, a neural network that matches two sets of local features by jointly finding correspondences and rejecting non-matchable points. Assignments are estimated by solving a differentiable optimal transport problem, whose costs are predicted by a graph neural network. We introduce a flexible context aggregation mechanism based on attention, enabling SuperGlue to reason about the underlying 3D scene and feature assignments jointly. Compared to traditional, hand-designed heuristics, our technique learns priors over geometric transformations and regularities of the 3D world through end-to-end training from image pairs. SuperGlue outperforms other learned approaches and achieves state-of-the-art results on the task of pose estimation in challenging real-world indoor and outdoor environments. The proposed method performs matching in real-time on a modern GPU and can be readily integrated into modern SfM or SLAM systems. The code and trained weights are publicly available at github.com/magicleap/SuperGluePretrainedNetwork.}, bibtype = {article}, author = {Sarlin, Paul Edouard and Detone, Daniel and Malisiewicz, Tomasz and Rabinovich, Andrew}, doi = {10.1109/CVPR42600.2020.00499}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Point-GNN: Graph neural network for 3D object detection in a point cloud}, type = {article}, year = {2020}, pages = {1708-1716}, id = {30991788-703e-3660-8c6b-9d411f0a9164}, created = {2021-08-20T10:21:49.216Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:21.758Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {In this paper, we propose a graph neural network to detect objects from a LiDAR point cloud. Towards this end, we encode the point cloud efficiently in a fixed radius near-neighbors graph. We design a graph neural network, named Point-GNN, to predict the category and shape of the object that each vertex in the graph belongs to. In Point-GNN, we propose an auto-registration mechanism to reduce translation variance, and also design a box merging and scoring operation to combine detections from multiple vertices accurately. Our experiments on the KITTI benchmark show the proposed approach achieves leading accuracy using the point cloud alone and can even surpass fusion-based algorithms. Our results demonstrate the potential of using the graph neural network as a new approach for 3D object detection. The code is available at https://github.com/WeijingShi/Point-GNN.}, bibtype = {article}, author = {Shi, Weijing and Rajkumar, Ragunathan}, doi = {10.1109/CVPR42600.2020.00178}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Measuring and Improving the Use of Graph Information in Graph Neural Networks}, type = {article}, year = {2020}, pages = {1-15}, id = {3fae7f12-a83b-3858-ad96-516cec8adb90}, created = {2021-08-20T10:21:49.317Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-21T11:27:42.024Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Hou, Yifan and Zhang, Jian and Cheng, James and Ma, Kaili and Ma, Richard T. B. and Chen, Hongzhi and Yang, Ming-Chang}, number = {2018} }
@article{ title = {A Survey on Visual Transformer}, type = {article}, year = {2020}, keywords = {Transformer,Computer Vision,High-level vision,Low-level vision,Self-attention,Video}, websites = {https://arxiv.org/abs/2012.12556v3}, month = {12}, day = {23}, id = {6c4a7d51-a8e5-367f-bb24-b2aecb46f84b}, created = {2021-08-30T06:37:15.527Z}, accessed = {2021-08-30}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-31T10:01:32.396Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d54ba66b-a8cf-41de-8e2d-c3256f322e07}, private_publication = {false}, abstract = {Transformer, first applied to the field of natural language processing, is a type of deep neural network mainly based on the self-attention mechanism. Thanks to its strong representation capabilities, researchers are looking at ways to apply transformer to computer vision tasks. In a variety of visual benchmarks, transformer-based models perform similar to or better than other types of networks such as convolutional and recurrent networks. Given its high performance and no need for human-defined inductive bias, transformer is receiving more and more attention from the computer vision community. In this paper, we review these visual transformer models by categorizing them in different tasks and analyzing their advantages and disadvantages. The main categories we explore include the backbone network, high/mid-level vision, low-level vision, and video processing. We also take a brief look at the self-attention mechanism in computer vision, as it is the base component in transformer. Furthermore, we include efficient transformer methods for pushing transformer into real device-based applications. Toward the end of this paper, we discuss the challenges and provide several further research directions for visual transformers.}, bibtype = {article}, author = {Han, Kai and Wang, Yunhe and Chen, Hanting and Chen, Xinghao and Guo, Jianyuan and Liu, Zhenhua and Tang, Yehui and Xiao, An and Xu, Chunjing and Xu, Yixing and Yang, Zhaohui and Zhang, Yiman and Tao, Dacheng} }
@article{ title = {Taming Transformers for High-Resolution Image Synthesis}, type = {article}, year = {2020}, websites = {https://arxiv.org/abs/2012.09841v2}, month = {12}, day = {17}, id = {81f46d87-b241-3a86-bdb9-e1bb5fcb8f9b}, created = {2021-08-31T10:44:08.431Z}, accessed = {2021-08-31}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-31T10:46:08.045Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {Designed to learn long-range interactions on sequential data, transformers continue to show state-of-the-art results on a wide variety of tasks. In contrast to CNNs, they contain no inductive bias that prioritizes local interactions. This makes them expressive, but also computationally infeasible for long sequences, such as high-resolution images. We demonstrate how combining the effectiveness of the inductive bias of CNNs with the expressivity of transformers enables them to model and thereby synthesize high-resolution images. We show how to (i) use CNNs to learn a context-rich vocabulary of image constituents, and in turn (ii) utilize transformers to efficiently model their composition within high-resolution images. Our approach is readily applied to conditional synthesis tasks, where both non-spatial information, such as object classes, and spatial information, such as segmentations, can control the generated image. In particular, we present the first results on semantically-guided synthesis of megapixel images with transformers. Project page at https://compvis.github.io/taming-transformers/ .}, bibtype = {article}, author = {Esser, Patrick and Rombach, Robin and Ommer, Björn} }
@misc{ title = {Generative Pretraining From Pixels}, type = {misc}, year = {2020}, pages = {1691-1703}, websites = {https://proceedings.mlr.press/v119/chen20s.html}, month = {11}, publisher = {PMLR}, day = {21}, id = {95f2a99c-ee96-3631-bb1b-52c7b8084320}, created = {2021-08-31T10:51:04.927Z}, accessed = {2021-08-31}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-31T10:51:09.845Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {Inspired by progress in unsupervised representation learning for natural language, we examine whether similar models can learn useful representations for images. We train a sequence Transformer to auto-regressively predict pixels, without incorporating knowledge of the 2D input structure. Despite training on low-resolution ImageNet without labels, we find that a GPT-2 scale model learns strong image representations as measured by linear probing, fine-tuning, and low-data classification. On CIFAR-10, we achieve 96.3% accuracy with a linear probe, outperforming a supervised Wide ResNet, and 99.0% accuracy with full fine-tuning, matching the top supervised pre-trained models. We are also competitive with self-supervised benchmarks on ImageNet when substituting pixels for a VQVAE encoding, achieving 69.0% top-1 accuracy on a linear probe of our features.}, bibtype = {misc}, author = {Chen, Mark and Radford, Alec and Child, Rewon and Wu, Jeffrey and Jun, Heewoo and Luan, David and Sutskever, Ilya} }
@article{ title = {3D Object Detection with Pointformer}, type = {article}, year = {2020}, websites = {https://arxiv.org/abs/2012.11409v3}, month = {12}, day = {21}, id = {31c7149c-fc97-3e0b-b076-c48bd0a47a6b}, created = {2021-08-31T11:13:29.261Z}, accessed = {2021-08-31}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-26T08:55:41.571Z}, read = {true}, starred = {true}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4,11276190-b8fe-4c3a-a42f-f604438ad4db,d54ba66b-a8cf-41de-8e2d-c3256f322e07}, private_publication = {false}, abstract = {Feature learning for 3D object detection from point clouds is very challenging due to the irregularity of 3D point cloud data. In this paper, we propose Pointformer, a Transformer backbone designed for 3D point clouds to learn features effectively. Specifically, a Local Transformer module is employed to model interactions among points in a local region, which learns context-dependent region features at an object level. A Global Transformer is designed to learn context-aware representations at the scene level. To further capture the dependencies among multi-scale representations, we propose Local-Global Transformer to integrate local features with global features from higher resolution. In addition, we introduce an efficient coordinate refinement module to shift down-sampled points closer to object centroids, which improves object proposal generation. We use Pointformer as the backbone for state-of-the-art object detection models and demonstrate significant improvements over original models on both indoor and outdoor datasets.}, bibtype = {article}, author = {Pan, Xuran and Xia, Zhuofan and Song, Shiji and Li, Li Erran and Huang, Gao} }
@article{ title = {Deep Learning on Graphs: A Survey}, type = {article}, year = {2020}, pages = {1-1}, volume = {14}, publisher = {IEEE}, id = {dd602ea0-9f3c-3a5d-8cf3-9aa68dfe5112}, created = {2021-09-01T07:41:55.430Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-02T05:16:22.771Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {dbd9a6d6-88f6-4a62-9acd-402fb473145a}, private_publication = {false}, abstract = {Deep learning has been shown to be successful in a number of domains, ranging from acoustics, images, to natural language processing. However, applying deep learning to the ubiquitous graph data is non-trivial because of the unique characteristics of graphs. Recently, substantial research efforts have been devoted to applying deep learning methods to graphs, resulting in beneficial advances in graph analysis techniques. In this survey, we comprehensively review the different types of deep learning methods on graphs. We divide the existing methods into five categories based on their model architectures and training strategies: graph recurrent neural networks, graph convolutional networks, graph autoencoders, graph reinforcement learning, and graph adversarial methods. We then provide a comprehensive overview of these methods in a systematic manner mainly by following their development history. We also analyze the differences and compositions of different methods. Finally, we briefly outline the applications in which they have been used and discuss potential future research directions.}, bibtype = {article}, author = {Zhang, Ziwei and Cui, Peng and Zhu, Wenwu}, doi = {10.1109/tkde.2020.2981333}, journal = {IEEE Transactions on Knowledge and Data Engineering}, number = {8} }
@article{ title = {An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale}, type = {article}, year = {2020}, websites = {https://arxiv.org/abs/2010.11929v2}, month = {10}, day = {22}, id = {6dada552-482a-3081-a312-e28845e54a2d}, created = {2021-09-01T08:02:18.175Z}, accessed = {2021-09-01}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-01T08:02:21.622Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {While the Transformer architecture has become the de-facto standard for natural language processing tasks, its applications to computer vision remain limited. In vision, attention is either applied in conjunction with convolutional networks, or used to replace certain components of convolutional networks while keeping their overall structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring substantially fewer computational resources to train.}, bibtype = {article}, author = {Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil} }
@article{ title = {Training data-efficient image transformers & distillation through attention}, type = {article}, year = {2020}, websites = {https://arxiv.org/abs/2012.12877v2}, month = {12}, day = {23}, id = {c15965c9-166e-3450-b303-8432457da3e9}, created = {2021-09-01T08:04:33.092Z}, accessed = {2021-09-01}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-14T08:34:18.980Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {Recently, neural networks purely based on attention were shown to address image understanding tasks such as image classification. However, these visual transformers are pre-trained with hundreds of millions of images using an expensive infrastructure, thereby limiting their adoption. In this work, we produce a competitive convolution-free transformer by training on Imagenet only. We train them on a single computer in less than 3 days. Our reference vision transformer (86M parameters) achieves top-1 accuracy of 83.1% (single-crop evaluation) on ImageNet with no external data. More importantly, we introduce a teacher-student strategy specific to transformers. It relies on a distillation token ensuring that the student learns from the teacher through attention. We show the interest of this token-based distillation, especially when using a convnet as a teacher. This leads us to report results competitive with convnets for both Imagenet (where we obtain up to 85.2% accuracy) and when transferring to other tasks. We share our code and models.}, bibtype = {article}, author = {Touvron, Hugo and Cord, Matthieu and Douze, Matthijs and Massa, Francisco and Sablayrolles, Alexandre and Jégou, Hervé} }
@article{ title = {Exploring Self-attention for Image Recognition}, type = {article}, year = {2020}, pages = {10073-10082}, websites = {https://arxiv.org/abs/2004.13621v1}, month = {4}, publisher = {IEEE Computer Society}, day = {28}, id = {07fb783d-811e-33f8-b6cb-5cd194b5b981}, created = {2021-09-01T08:07:33.537Z}, accessed = {2021-09-01}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-06T07:14:33.927Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {Recent work has shown that self-attention can serve as a basic building block for image recognition models. We explore variations of self-attention and assess their effectiveness for image recognition. We consider two forms of self-attention. One is pairwise self-attention, which generalizes standard dot-product attention and is fundamentally a set operator. The other is patchwise self-attention, which is strictly more powerful than convolution. Our pairwise self-attention networks match or outperform their convolutional counterparts, and the patchwise models substantially outperform the convolutional baselines. We also conduct experiments that probe the robustness of learned representations and conclude that self-attention networks may have significant benefits in terms of robustness and generalization.}, bibtype = {article}, author = {Zhao, Hengshuang and Jia, Jiaya and Koltun, Vladlen}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Grid-GCN for fast and scalable point cloud learning}, type = {article}, year = {2020}, pages = {5660-5669}, id = {07f2333c-93ca-3641-8cd6-92a251619cf4}, created = {2021-09-02T06:33:40.755Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:21.601Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Due to the sparsity and irregularity of the point cloud data, methods that directly consume points have become popular. Among all point-based models, graph convolutional networks (GCN) lead to notable performance by fully preserving the data granularity and exploiting point interrelation. However, point-based networks spend a significant amount of time on data structuring (e.g., Farthest Point Sampling (FPS) and neighbor points querying), which limits the speed and scalability. In this paper, we present a method, named Grid-GCN, for fast and scalable point cloud learning. Grid-GCN uses a novel data structuring strategy, Coverage-Aware Grid Query (CAGQ). By leveraging the efficiency of grid space, CAGQ improves spatial coverage while reducing the theoretical time complexity. Compared with popular sampling methods such as Farthest Point Sampling (FPS) and Ball Query, CAGQ achieves up to 50× speed-up. With a Grid Context Aggregation (GCA) module, Grid-GCN achieves state-of-the-art performance on major point cloud classification and segmentation benchmarks with significantly faster runtime than previous studies. Remarkably, Grid-GCN achieves the inference speed of 50fps on ScanNet using 81920 points as input. The supplementary material and the code are released.}, bibtype = {article}, author = {Xu, Qiangeng and Sun, Xudong and Wu, Cho Ying and Wang, Panqu and Neumann, Ulrich}, doi = {10.1109/CVPR42600.2020.00570}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {A Closer Look at Local Aggregation Operators in Point Cloud Analysis}, type = {article}, year = {2020}, keywords = {3D point cloud,Local aggregation operator,Position pooling}, pages = {326-342}, volume = {12368 LNCS}, id = {0417b047-7758-36a9-85e4-b564d9ac6b76}, created = {2021-09-03T06:58:15.580Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-03T06:58:18.767Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a6fefa10-ad39-4ee5-850c-dcbd4fed6307}, private_publication = {false}, abstract = {Recent advances in network architectures for point cloud processing are mainly driven by new designs of local aggregation operators. However, the impact of these operators on network performance has not been carefully investigated, due to the different overall network architectures and implementation details in each solution. Meanwhile, most operators are only applied in shallow architectures. In this paper, we revisit the representative local aggregation operators and study their performance using the same deep residual architecture. Our investigation reveals that despite their different designs, all of these operators make surprisingly similar contributions to network performance under the same network input and feature numbers, and result in state-of-the-art accuracy on standard benchmarks. This finding stimulates us to rethink the necessity of sophisticated designs of local aggregation operators for point cloud processing. To this end, we propose a simple local aggregation operator without learnable weights, named Position Pooling (PosPool), which performs similarly to or slightly better than existing sophisticated operators. In particular, a simple deep residual network with PosPool layers achieves outstanding performance on all benchmarks, and outperforms the previous state-of-the-art methods on the challenging PartNet datasets by a large margin (7.4 mIoU). The code is publicly available at https://github.com/zeliu98/CloserLook3D.}, bibtype = {article}, author = {Liu, Ze and Hu, Han and Cao, Yue and Zhang, Zheng and Tong, Xin}, doi = {10.1007/978-3-030-58592-1_20}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention}, type = {article}, year = {2020}, pages = {5112-5121}, volume = {PartF168147-7}, websites = {https://arxiv.org/abs/2006.16236v3}, month = {6}, publisher = {International Machine Learning Society (IMLS)}, day = {29}, id = {6edabdcb-7bb6-36eb-a59b-a0a306480bd3}, created = {2021-09-03T07:04:36.327Z}, accessed = {2021-09-03}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-03T07:04:39.222Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {Transformers achieve remarkable performance in several tasks but due to their quadratic complexity, with respect to the input's length, they are prohibitively slow for very long sequences. To address this limitation, we express the self-attention as a linear dot-product of kernel feature maps and make use of the associativity property of matrix products to reduce the complexity from $\mathcal{O}(N^2)$ to $\mathcal{O}(N)$, where $N$ is the sequence length. We show that this formulation permits an iterative implementation that dramatically accelerates autoregressive transformers and reveals their relationship to recurrent neural networks. Our linear transformers achieve similar performance to vanilla transformers and they are up to 4000x faster on autoregressive prediction of very long sequences.}, bibtype = {article}, author = {Katharopoulos, Angelos and Vyas, Apoorv and Pappas, Nikolaos and Fleuret, François}, journal = {37th International Conference on Machine Learning, ICML 2020} }
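To make the complexity reduction described in the abstract above concrete, here is a minimal NumPy sketch of non-causal linearized attention under the paper's stated idea: replace the softmax with a kernel feature map (elu(x)+1, the map described in the paper) and use the associativity of matrix products so that the keys and values are summarized once, giving cost linear in the sequence length N. The function names are illustrative and this is not the authors' implementation.

import numpy as np

def elu_feature_map(x):
    # phi(x) = elu(x) + 1, a positive feature map (as described in the paper)
    return np.where(x > 0, x + 1.0, np.exp(x))

def linear_attention(Q, K, V, eps=1e-6):
    # Q, K: (N, d); V: (N, d_v). Cost is O(N * d * d_v) instead of O(N^2 * d).
    Qp, Kp = elu_feature_map(Q), elu_feature_map(K)
    KV = Kp.T @ V                      # (d, d_v): keys and values summarized once
    Z = Qp @ Kp.sum(axis=0)            # (N,): per-query normalizer
    return (Qp @ KV) / (Z[:, None] + eps)

rng = np.random.default_rng(0)
Q, K, V = (rng.normal(size=(1024, 64)) for _ in range(3))
print(linear_attention(Q, K, V).shape)   # (1024, 64)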
@article{ title = {Rethinking Attention with Performers}, type = {article}, year = {2020}, websites = {https://arxiv.org/abs/2009.14794v1}, month = {9}, day = {30}, id = {673dd59c-c661-3214-8d0c-06a6b125faae}, created = {2021-09-03T07:05:05.751Z}, accessed = {2021-09-03}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-03T07:05:10.577Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {We introduce Performers, Transformer architectures which can estimate regular (softmax) full-rank-attention Transformers with provable accuracy, but using only linear (as opposed to quadratic) space and time complexity, without relying on any priors such as sparsity or low-rankness. To approximate softmax attention-kernels, Performers use a novel Fast Attention Via positive Orthogonal Random features approach (FAVOR+), which may be of independent interest for scalable kernel methods. FAVOR+ can be also used to efficiently model kernelizable attention mechanisms beyond softmax. This representational power is crucial to accurately compare softmax with other kernels for the first time on large-scale tasks, beyond the reach of regular Transformers, and investigate optimal attention-kernels. Performers are linear architectures fully compatible with regular Transformers and with strong theoretical guarantees: unbiased or nearly-unbiased estimation of the attention matrix, uniform convergence and low estimation variance. We tested Performers on a rich set of tasks stretching from pixel-prediction through text models to protein sequence modeling. We demonstrate competitive results with other examined efficient sparse and dense attention methods, showcasing effectiveness of the novel attention-learning paradigm leveraged by Performers.}, bibtype = {article}, author = {Choromanski, Krzysztof and Likhosherstov, Valerii and Dohan, David and Song, Xingyou and Gane, Andreea and Sarlos, Tamas and Hawkins, Peter and Davis, Jared and Mohiuddin, Afroz and Kaiser, Lukasz and Belanger, David and Colwell, Lucy and Weller, Adrian} }
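The core trick behind the FAVOR+ mechanism summarized above can be illustrated with positive random features that approximate the softmax kernel. The sketch below is a deliberately simplified stand-in (plain Gaussian features, no orthogonalization, illustrative names) rather than the authors' implementation; it only shows the estimator phi(x)·phi(y) ≈ exp(x·y) that makes the attention matrix factorizable and hence linearizable.

import numpy as np

def positive_random_features(x, W):
    # phi(x) = exp(W x - ||x||^2 / 2) / sqrt(m); then E[phi(x) . phi(y)] = exp(x . y)
    m = W.shape[0]
    return np.exp(W @ x - 0.5 * np.dot(x, x)) / np.sqrt(m)

rng = np.random.default_rng(0)
d, m = 32, 4096
x, y = rng.normal(scale=0.2, size=(2, d))
W = rng.normal(size=(m, d))              # plain Gaussian features for brevity
approx = positive_random_features(x, W) @ positive_random_features(y, W)
print(approx, np.exp(x @ y))             # the two values should be roughly equal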
@article{ title = {Big Bird: Transformers for Longer Sequences}, type = {article}, year = {2020}, volume = {2020-December}, websites = {https://arxiv.org/abs/2007.14062v2}, month = {7}, publisher = {Neural information processing systems foundation}, day = {28}, id = {ca4522d1-6726-3e6a-907a-4d92af13c20e}, created = {2021-09-06T11:04:39.854Z}, accessed = {2021-09-06}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-06T11:04:42.451Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {Transformers-based models, such as BERT, have been one of the most successful deep learning models for NLP. Unfortunately, one of their core limitations is the quadratic dependency (mainly in terms of memory) on the sequence length due to their full attention mechanism. To remedy this, we propose, BigBird, a sparse attention mechanism that reduces this quadratic dependency to linear. We show that BigBird is a universal approximator of sequence functions and is Turing complete, thereby preserving these properties of the quadratic, full attention model. Along the way, our theoretical analysis reveals some of the benefits of having $O(1)$ global tokens (such as CLS), that attend to the entire sequence as part of the sparse attention mechanism. The proposed sparse attention can handle sequences of length up to 8x of what was previously possible using similar hardware. As a consequence of the capability to handle longer context, BigBird drastically improves performance on various NLP tasks such as question answering and summarization. We also propose novel applications to genomics data.}, bibtype = {article}, author = {Zaheer, Manzil and Guruganesh, Guru and Dubey, Avinava and Ainslie, Joshua and Alberti, Chris and Ontanon, Santiago and Pham, Philip and Ravula, Anirudh and Wang, Qifan and Yang, Li and Ahmed, Amr}, journal = {Advances in Neural Information Processing Systems} }
@article{ title = {Linformer: Self-Attention with Linear Complexity}, type = {article}, year = {2020}, websites = {https://arxiv.org/abs/2006.04768v2}, month = {6}, day = {8}, id = {c89abaf1-8f89-35e8-ac7d-4fcfd730c720}, created = {2021-09-06T11:05:48.797Z}, accessed = {2021-09-06}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-06T11:05:51.413Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {Large transformer models have shown extraordinary success in achieving state-of-the-art results in many natural language processing applications. However, training and deploying these models can be prohibitively costly for long sequences, as the standard self-attention mechanism of the Transformer uses $O(n^2)$ time and space with respect to sequence length. In this paper, we demonstrate that the self-attention mechanism can be approximated by a low-rank matrix. We further exploit this finding to propose a new self-attention mechanism, which reduces the overall self-attention complexity from $O(n^2)$ to $O(n)$ in both time and space. The resulting linear transformer, the Linformer, performs on par with standard Transformer models, while being much more memory- and time-efficient.}, bibtype = {article}, author = {Wang, Sinong and Li, Belinda Z. and Khabsa, Madian and Fang, Han and Ma, Hao} }
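As a rough illustration of the low-rank idea in the abstract above: keys and values can be projected along the sequence axis from length N down to a small k before standard scaled dot-product attention, so the score matrix is N×k rather than N×N. The sketch below uses random projections as stand-ins for the learned ones and illustrative names; it is not the paper's implementation.

import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def low_rank_attention(Q, K, V, E, F):
    # Q, K, V: (N, d); E, F: (k, N) project the sequence axis down to length k,
    # so the score matrix is (N, k) rather than (N, N).
    d = Q.shape[1]
    K_proj, V_proj = E @ K, F @ V              # (k, d): compressed keys / values
    scores = Q @ K_proj.T / np.sqrt(d)         # (N, k)
    return softmax(scores, axis=-1) @ V_proj   # (N, d)

rng = np.random.default_rng(0)
N, d, k = 2048, 64, 128
Q, K, V = (rng.normal(size=(N, d)) for _ in range(3))
E = rng.normal(size=(k, N)) / np.sqrt(N)       # stand-ins for the learned projections
F = rng.normal(size=(k, N)) / np.sqrt(N)
print(low_rank_attention(Q, K, V, E, F).shape)   # (2048, 64)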
@article{ title = {Spherical Kernel for Efficient Graph Convolution on 3D Point Clouds}, type = {article}, year = {2020}, pages = {3664-3680}, volume = {43}, publisher = {IEEE}, id = {eb6cc7a5-c1d4-3560-a8cd-4b1c8a72e133}, created = {2021-09-07T08:57:34.492Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-04T07:33:03.119Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a6fefa10-ad39-4ee5-850c-dcbd4fed6307,f8d4d36f-8136-4a85-8d1a-ceaffb92ddf1}, private_publication = {false}, abstract = {We propose a spherical kernel for efficient graph convolution of 3D point clouds. Our metric-based kernels systematically quantize the local 3D space to identify distinctive geometric relationships in the data. Similar to the regular grid CNN kernels, the spherical kernel maintains translation-invariance and asymmetry properties, where the former guarantees weight sharing among similar local structures in the data and the latter facilitates fine geometric learning. The proposed kernel is applied to graph neural networks without edge-dependent filter generation, making it computationally attractive for large point clouds. In our graph networks, each vertex is associated with a single point location and edges connect the neighborhood points within a defined range. The graph gets coarsened in the network with farthest point sampling. Analogous to the standard CNNs, we define pooling and unpooling operations for our network. We demonstrate the effectiveness of the proposed spherical kernel with graph neural networks for point cloud classification and semantic segmentation using ModelNet, ShapeNet, RueMonge2014, ScanNet and S3DIS datasets. The source code and the trained models can be downloaded from https://github.com/hlei-ziyan/SPH3D-GCN.}, bibtype = {article}, author = {Lei, Huan and Akhtar, Naveed and Mian, Ajmal}, doi = {10.1109/tpami.2020.2983410}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, number = {10} }
@article{ title = {Sparse Sinkhorn Attention}, type = {article}, year = {2020}, pages = {9380-9389}, volume = {PartF168147-13}, websites = {https://arxiv.org/abs/2002.11296v1}, month = {2}, publisher = {International Machine Learning Society (IMLS)}, day = {26}, id = {427e79a8-5db3-33d4-a9f3-b83f4639f989}, created = {2021-09-07T12:03:21.611Z}, accessed = {2021-09-07}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-07T12:03:24.018Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {We propose Sparse Sinkhorn Attention, a new efficient and sparse method for learning to attend. Our method is based on differentiable sorting of internal representations. Concretely, we introduce a meta sorting network that learns to generate latent permutations over sequences. Given sorted sequences, we are then able to compute quasi-global attention with only local windows, improving the memory efficiency of the attention module. To this end, we propose new algorithmic innovations such as Causal Sinkhorn Balancing and SortCut, a dynamic sequence truncation method for tailoring Sinkhorn Attention for encoding and/or decoding purposes. Via extensive experiments on algorithmic seq2seq sorting, language modeling, pixel-wise image generation, document classification and natural language inference, we demonstrate that our memory efficient Sinkhorn Attention method is competitive with vanilla attention and consistently outperforms recently proposed efficient Transformer models such as Sparse Transformers.}, bibtype = {article}, author = {Tay, Yi and Bahri, Dara and Yang, Liu and Metzler, Donald and Juan, Da-Cheng}, journal = {37th International Conference on Machine Learning, ICML 2020} }
@article{ title = {A Long Horizon Planning Framework for Manipulating Rigid Pointcloud Objects}, type = {article}, year = {2020}, keywords = {learning,manipulation,planning}, pages = {1-20}, websites = {http://arxiv.org/abs/2011.08177}, id = {081f19c5-977d-34d6-84fc-e447d3d2c2a6}, created = {2021-09-16T07:10:13.950Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-28T07:20:31.573Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48e9a80d-67a5-450d-9b08-c7bc934154e8}, private_publication = {false}, abstract = {We present a framework for solving long-horizon planning problems involving manipulation of rigid objects that operates directly from a point-cloud observation, i.e. without prior object models. Our method plans in the space of object subgoals and frees the planner from reasoning about robot-object interaction dynamics by relying on a set of generalizable manipulation primitives. We show that for rigid bodies, this abstraction can be realized using low-level manipulation skills that maintain sticking contact with the object and represent subgoals as 3D transformations. To enable generalization to unseen objects and improve planning performance, we propose a novel way of representing subgoals for rigid-body manipulation and a graph-attention based neural network architecture for processing point-cloud inputs. We experimentally validate these choices using simulated and real-world experiments on the YuMi robot. Results demonstrate that our method can successfully manipulate new objects into target configurations requiring long-term planning. Overall, our framework realizes the best of both worlds: task-and-motion planning (TAMP) and learning-based approaches. Project website: https://anthonysimeonov.github.io/rpo-planning-framework/.}, bibtype = {article}, author = {Simeonov, Anthony and Du, Yilun and Kim, Beomjoon and Hogan, Francois R. and Tenenbaum, Joshua and Agrawal, Pulkit and Rodriguez, Alberto}, number = {CoRL} }
@article{ title = {Cross-regional attention network for point cloud completion}, type = {article}, year = {2020}, keywords = {3D vision,Cross-regional attention,Deep learning,Point cloud completion}, pages = {885-892}, id = {ee4d3234-279c-34d6-a2e0-bc2f6e8828f5}, created = {2021-09-16T07:10:14.118Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-22T05:39:41.554Z}, read = {true}, starred = {true}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48e9a80d-67a5-450d-9b08-c7bc934154e8,f8d4d36f-8136-4a85-8d1a-ceaffb92ddf1}, private_publication = {false}, abstract = {Point clouds generated from real-world scanning are always incomplete and non-uniformly distributed, which would cause structural losses in 3D shape representations. Therefore, a learning-based method is introduced in this paper to repair partial point clouds and restore complete shapes of target objects. First, we sample several local regions of inputs, encode their features and fuse them with independently extracted global features. Second, we establish a graph to connect all local features together, and then implement convolution with multi-head attention on the graph. The graph attention mechanism enables each local feature vector to search across the regions and selectively absorb other local features based on their relationships in high-dimensional feature space. Third, we design a coarse decoder to collect cross-region features from the graph and generate skeletons of complete point clouds, and a folding-based decoder is leveraged to generate final point clouds with high resolution. Our network is trained on six categories of objects from the ModelNet dataset; its performance is compared with several existing methods, and the results show that our network is able to generate dense, complete point clouds with the highest accuracy.}, bibtype = {article}, author = {Wu, Hang and Miao, Yubin}, doi = {10.1109/ICPR48806.2021.9413104}, journal = {Proceedings - International Conference on Pattern Recognition} }
@article{ title = {Exploring Deep 3D Spatial Encodings for Large-Scale 3D Scene Understanding}, type = {article}, year = {2020}, pages = {1-5}, websites = {http://arxiv.org/abs/2011.14358}, id = {3336ec3f-68ae-3b21-a9c4-019dfd204612}, created = {2021-09-21T13:25:20.279Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-21T13:25:28.631Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Khan2020}, private_publication = {false}, abstract = {Semantic segmentation of raw 3D point clouds is an essential component in 3D scene analysis, but it poses several challenges, primarily due to the non-Euclidean nature of 3D point clouds. Although several deep learning based approaches have been proposed to address this task, almost all of them emphasize using the latent (global) feature representations from traditional convolutional neural networks (CNN), resulting in severe loss of spatial information and thus failing to model the geometry of the underlying 3D objects, which plays an important role in remote sensing 3D scenes. In this letter, we propose an alternative approach to overcome the limitations of CNN based approaches by encoding the spatial features of raw 3D point clouds into undirected symmetrical graph models. These encodings are then combined with a high-dimensional feature vector extracted from a traditional CNN into a localized graph convolution operator that outputs the required 3D segmentation map. We have performed experiments on two standard benchmark datasets (including an outdoor aerial remote sensing dataset and an indoor synthetic dataset). The proposed method achieves accuracy on par with the state of the art, with improved training time and model stability, thus indicating strong potential for further research towards a generalized state-of-the-art method for 3D scene understanding.}, bibtype = {article}, author = {Khan, Saqib Ali and Shi, Yilei and Shahzad, Muhammad and Zhu, Xiao Xiang}, number = {October} }
@article{ title = {3D hand pose estimation with disentangled cross-modal latent space}, type = {article}, year = {2020}, pages = {380-389}, id = {6d326b0b-254e-380a-966d-33a0aec70a37}, created = {2021-09-29T10:16:08.835Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.240Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Gu2020}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,a6db5ca6-7f95-48a4-bc40-9e41eea78434,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {Estimating 3D hand pose from a single RGB image is a challenging task because of its ill-posed nature (i.e., depth ambiguity). Recently, various generative approaches have been proposed to predict the 3D joints of an RGB hand image by learning a unified latent space between two modalities (i.e., RGB image and 3D joints). However, projecting multi-modal data (i.e., RGB images and 3D joints) into a unified latent space is difficult, as the modality-specific features usually interfere with the learning of the optimal latent space. Hence in this paper, we propose to disentangle the latent space into two sub-latent spaces: a modality-specific latent space and a pose-specific latent space for 3D hand pose estimation. Our proposed method, namely Disentangled Cross-Modal Latent Space (DCMLS), consists of two variational autoencoder networks and auxiliary components which connect the two VAEs to align underlying hand poses and transfer modality-specific context from RGB to 3D. For the hand pose latent space, we align it with the two modalities by using a cross-modal discriminator with an adversarial learning strategy. For the context latent space, we learn a context translator to gain access to the cross-modal context. Experimental results on two widely used public benchmark datasets, RHD and STB, demonstrate that our proposed DCMLS method is able to clearly outperform the state-of-the-art methods on single-image-based 3D hand pose estimation.}, bibtype = {article}, author = {Gu, Jiajun and Wang, Zhiyong and Ouyang, Wanli and Zhang, Weichen and Li, Jiafeng and Zhuo, Li}, doi = {10.1109/WACV45572.2020.9093316}, journal = {Proceedings - 2020 IEEE Winter Conference on Applications of Computer Vision, WACV 2020} }
@article{ title = {Learning to dress 3D people in generative clothing}, type = {article}, year = {2020}, pages = {6468-6477}, id = {d7183ce9-c2a2-389b-9cc2-e524e02c97b1}, created = {2021-09-29T10:16:08.859Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.635Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Ma2020}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,a6db5ca6-7f95-48a4-bc40-9e41eea78434,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {Three-dimensional human body models are widely used in the analysis of human pose and motion. Existing models, however, are learned from minimally-clothed 3D scans and thus do not generalize to the complexity of dressed people in common images and videos. Additionally, current models lack the expressive power needed to represent the complex non-linear geometry of pose-dependent clothing shapes. To address this, we learn a generative 3D mesh model of clothed people from 3D scans with varying pose and clothing. Specifically, we train a conditional Mesh-VAE-GAN to learn the clothing deformation from the SMPL body model, making clothing an additional term in SMPL. Our model is conditioned on both pose and clothing type, giving the ability to draw samples of clothing to dress different body shapes in a variety of styles and poses. To preserve wrinkle detail, our Mesh-VAE-GAN extends patchwise discriminators to 3D meshes. Our model, named CAPE, represents global shape and fine local structure, effectively extending the SMPL body model to clothing. To our knowledge, this is the first generative model that directly dresses 3D human body meshes and generalizes to different poses. The model, code and data are available for research purposes at https://cape.is.tue.mpg.de.}, bibtype = {article}, author = {Ma, Qianli and Yang, Jinlong and Ranjan, Anurag and Pujades, Sergi and Pons-Moll, Gerard and Tang, Siyu and Black, Michael J.}, doi = {10.1109/CVPR42600.2020.00650}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Private-Shared Disentangled Multimodal VAE for Learning of Hybrid Latent Representations}, type = {article}, year = {2020}, websites = {http://arxiv.org/abs/2012.13024}, id = {ba831c4a-78d6-3085-9dbd-dc67ea96e890}, created = {2021-09-29T10:16:08.971Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.783Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Lee2020}, folder_uuids = {a6db5ca6-7f95-48a4-bc40-9e41eea78434}, private_publication = {false}, abstract = {Multi-modal generative models represent an important family of deep models, whose goal is to facilitate representation learning on data with multiple views or modalities. However, current deep multi-modal models focus on the inference of shared representations, while neglecting the important private aspects of data within individual modalities. In this paper, we introduce a disentangled multi-modal variational autoencoder (DMVAE) that utilizes disentangled VAE strategy to separate the private and shared latent spaces of multiple modalities. We specifically consider the instance where the latent factor may be of both continuous and discrete nature, leading to the family of general hybrid DMVAE models. We demonstrate the utility of DMVAE on a semi-supervised learning task, where one of the modalities contains partial data labels, both relevant and irrelevant to the other modality. Our experiments on several benchmarks indicate the importance of the private-shared disentanglement as well as the hybrid latent representation.}, bibtype = {article}, author = {Lee, Mihee and Pavlovic, Vladimir}, doi = {10.1109/cvprw53098.2021.00185} }
@article{ title = {Quantifying the Generative Capabilities of Variational Autoencoders for 3D Car Point Clouds}, type = {article}, year = {2020}, keywords = {Representation learning,generative model,geometric deep learning,novelty,point clouds}, pages = {1469-1477}, id = {1e8d492e-51a9-33f6-9047-160a1789cd98}, created = {2021-09-30T06:29:03.581Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.522Z}, read = {true}, starred = {true}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Saha2020}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,8efc2fe0-ed07-4348-a865-9f1a22b45934,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {During each cycle of automotive development, large amounts of geometric data are generated as results of design studies and simulation tasks. Discovering hidden knowledge from this data and making it available to the development team strengthens the design process by utilizing historic information when creating novel products. To this end, we propose to use powerful geometric deep learning models that learn low-dimensional representations of the design data in an unsupervised fashion. Trained models allow efficient exploration of the design space, as well as the generation of novel designs. One popular class of generative models is variational autoencoders, which have however been rarely applied to geometric data. Hence, we use a variational autoencoder for 3D point clouds (PC-VAE) and explore the model's generative capabilities with a focus on the generation of realistic yet novel 3D shapes. We apply the PC-VAE to point clouds sampled from car shapes from a benchmark data set and employ quantitative measures to show that our PC-VAE generates realistic car shapes, while returning a richer variety of unseen shapes compared to a baseline autoencoder. Finally, we demonstrate how the PC-VAE can be guided towards generating shapes with desired target properties by optimizing the parameters that maximize the output of a trained classifier for said target properties. We conclude that generative models are a powerful tool that may aid designers in automotive product development.}, bibtype = {article}, author = {Saha, Sneha and Menzel, Stefan and Minku, Leandro L. and Yao, Xin and Sendhoff, Bernhard and Wollstadt, Patricia}, doi = {10.1109/SSCI47803.2020.9308513}, journal = {2020 IEEE Symposium Series on Computational Intelligence, SSCI 2020} }
@article{ title = {Mesh variational autoencoders with edge contraction pooling}, type = {article}, year = {2020}, pages = {1105-1112}, volume = {2020-June}, id = {c06ac106-7cc4-3b24-8c22-915065ca9435}, created = {2021-09-30T06:31:40.118Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.124Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Yuan2020}, folder_uuids = {a6db5ca6-7f95-48a4-bc40-9e41eea78434,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {3D shape analysis is an important research topic in computer vision and graphics. While existing methods have generalized image-based deep learning to meshes using graph-based convolutions, the lack of an effective pooling operation restricts the learning capability of their networks. In this paper, we propose a novel pooling operation for mesh datasets with the same connectivity but different geometry, by building a mesh hierarchy using mesh simplification. For this purpose, we develop a modified mesh simplification method to avoid generating highly irregularly sized triangles. Our pooling operation effectively encodes the correspondence between coarser and finer meshes in the hierarchy. We then present a variational auto-encoder (VAE) structure with the edge contraction pooling and graph-based convolutions, to explore probability latent spaces of 3D surfaces and perform 3D shape generation. Our network requires far fewer parameters than the original mesh VAE and thus can handle denser models thanks to our new pooling operation and convolutional kernels. Our evaluation also shows that our method has better generalization ability and is more reliable in various applications, including shape generation and shape interpolation.}, bibtype = {article}, author = {Yuan, Yu Jie and Lai, Yu Kun and Yang, Jie and Duan, Qi and Fu, Hongbo and Gao, Lin}, doi = {10.1109/CVPRW50498.2020.00145}, journal = {IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops} }
@article{ title = {End-to-End Human Pose and Mesh Reconstruction with Transformers}, type = {article}, year = {2020}, pages = {1954-1963}, websites = {http://arxiv.org/abs/2012.09760}, id = {09087766-63d4-3a09-93b7-99fd86ac59c5}, created = {2021-09-30T06:32:49.964Z}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-30T06:32:52.856Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {11276190-b8fe-4c3a-a42f-f604438ad4db}, private_publication = {false}, abstract = {We present a new method, called MEsh TRansfOrmer (METRO), to reconstruct 3D human pose and mesh vertices from a single image. Our method uses a transformer encoder to jointly model vertex-vertex and vertex-joint interactions, and outputs 3D joint coordinates and mesh vertices simultaneously. Compared to existing techniques that regress pose and shape parameters, METRO does not rely on any parametric mesh models like SMPL, thus it can be easily extended to other objects such as hands. We further relax the mesh topology and allow the transformer self-attention mechanism to freely attend between any two vertices, making it possible to learn non-local relationships among mesh vertices and joints. With the proposed masked vertex modeling, our method is more robust and effective in handling challenging situations like partial occlusions. METRO generates new state-of-the-art results for human mesh reconstruction on the public Human3.6M and 3DPW datasets. Moreover, we demonstrate the generalizability of METRO to 3D hand reconstruction in the wild, outperforming existing state-of-the-art methods on FreiHAND dataset. Code and pre-trained models are available at https://github.com/microsoft/MeshTransformer.}, bibtype = {article}, author = {Lin, Kevin and Wang, Lijuan and Liu, Zicheng} }
@article{ title = {Statistically Consistent Saliency Estimation}, type = {article}, year = {2020}, pages = {745-753}, id = {2e203a8b-664a-30f6-bfe3-96038141dc58}, created = {2021-10-13T14:40:10.776Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:53:54.298Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {The use of deep learning for a wide range of data problems has increased the need for understanding and diagnosing these models, and deep learning interpretation techniques have become an essential tool for data analysts. Although numerous model interpretation methods have been proposed in recent years, most of these procedures are based on heuristics with little or no theoretical guarantees. In this work, we propose a statistical framework for saliency estimation for black box computer vision models. We build a model-agnostic estimation procedure that is statistically consistent and passes the saliency checks of Adebayo et al. (2018b). Our method requires solving a linear program, whose solution can be efficiently computed in polynomial time. Through our theoretical analysis, we establish an upper bound on the number of model evaluations needed to recover the region of importance with high probability, and build a new perturbation scheme for estimation of local gradients that is shown to be more efficient than the commonly used random perturbation schemes. Validity of the new method is demonstrated through sensitivity analysis.}, bibtype = {article}, author = {Anonymous} }
@article{ title = {MVTN: Multi-View Transformation Network for 3D Shape Recognition}, type = {article}, year = {2020}, pages = {1-11}, websites = {http://arxiv.org/abs/2011.13244}, id = {889fc036-b4d3-31cf-ad59-ac0412b2f925}, created = {2021-10-13T14:40:10.994Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:32.188Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Multi-view projection methods have demonstrated their ability to reach state-of-the-art performance on 3D shape recognition. Those methods learn different ways to aggregate information from multiple views. However, the camera view-points for those views tend to be heuristically set and fixed for all shapes. To circumvent the lack of dynamism of current multi-view methods, we propose to learn those view-points. In particular, we introduce the Multi-View Transformation Network (MVTN) that regresses optimal view-points for 3D shape recognition, building upon advances in differentiable rendering. As a result, MVTN can be trained end-to-end along with any multi-view network for 3D shape classification. We integrate MVTN in a novel adaptive multi-view pipeline that can render either 3D meshes or point clouds. MVTN exhibits clear performance gains in the tasks of 3D shape classification and 3D shape retrieval without the need for extra training supervision. In these tasks, MVTN achieves state-of-the-art performance on ModelNet40, ShapeNet Core55, and the most recent and realistic ScanObjectNN dataset (up to 6% improvement). Interestingly, we also show that MVTN can provide network robustness against rotation and occlusion in the 3D domain. The code is available at https://github.com/ajhamdi/MVTN .}, bibtype = {article}, author = {Hamdi, Abdullah and Giancola, Silvio and Ghanem, Bernard} }
@article{ title = {Point Transformer}, type = {article}, year = {2020}, websites = {http://arxiv.org/abs/2012.09164}, id = {9b691b3a-546a-34f2-ab11-ca9c9e6d454a}, created = {2021-10-13T14:40:11.118Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:52.777Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0}, private_publication = {false}, abstract = {Self-attention networks have revolutionized natural language processing and are making impressive strides in image analysis tasks such as image classification and object detection. Inspired by this success, we investigate the application of self-attention networks to 3D point cloud processing. We design self-attention layers for point clouds and use these to construct self-attention networks for tasks such as semantic scene segmentation, object part segmentation, and object classification. Our Point Transformer design improves upon prior work across domains and tasks. For example, on the challenging S3DIS dataset for large-scale semantic scene segmentation, the Point Transformer attains an mIoU of 70.4% on Area 5, outperforming the strongest prior model by 3.3 absolute percentage points and crossing the 70% mIoU threshold for the first time.}, bibtype = {article}, author = {Zhao, Hengshuang and Jiang, Li and Jia, Jiaya and Torr, Philip and Koltun, Vladlen}, doi = {10.1109/access.2021.3116304} }
@article{ title = {Omni-GAN: On the Secrets of cGANs and Beyond}, type = {article}, year = {2020}, pages = {14061-14071}, websites = {http://arxiv.org/abs/2011.13074}, id = {5b9e7b47-d1a5-336d-9e6c-e9a4c0f1a146}, created = {2021-10-13T14:40:11.159Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:19.037Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {The conditional generative adversarial network (cGAN) is a powerful tool of generating high-quality images, but existing approaches mostly suffer unsatisfying performance or the risk of mode collapse. This paper presents Omni-GAN, a variant of cGAN that reveals the devil in designing a proper discriminator for training the model. The key is to ensure that the discriminator receives strong supervision to perceive the concepts and moderate regularization to avoid collapse. Omni-GAN is easily implemented and freely integrated with off-the-shelf encoding methods (e.g., implicit neural representation, INR). Experiments validate the superior performance of Omni-GAN and Omni-INR-GAN in a wide range of image generation and restoration tasks. In particular, Omni-INR-GAN sets new records on the ImageNet dataset with impressive Inception scores of 262.85 and 343.22 for the image sizes of 128 and 256, respectively, surpassing the previous records by 100+ points. Moreover, leveraging the generator prior, Omni-INR-GAN can extrapolate low-resolution images to arbitrary resolution, even up to x60+ higher resolution. Code is available.}, bibtype = {article}, author = {Zhou, Peng and Xie, Lingxi and Ni, Bingbing and Geng, Cong and Tian, Qi} }
@article{ title = {Unsupervised Segmentation incorporating Shape Prior via Generative Adversarial Networks}, type = {article}, year = {2020}, pages = {7324-7334}, id = {42da2857-7331-3ba9-9c61-419ac06f6b76}, created = {2021-10-13T14:40:11.339Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:23.153Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Anonymous}, number = {NeurIPS} }
@article{ title = {Minimal Adversarial Examples for Deep Learning on 3D Point Clouds}, type = {article}, year = {2020}, pages = {7797-7806}, websites = {http://arxiv.org/abs/2008.12066}, id = {33a20f57-853c-3e73-ae67-174da8c819e3}, created = {2021-10-13T14:40:11.989Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T11:25:07.378Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kim2020}, folder_uuids = {ca5c76e2-2545-4aa6-aaff-2b57ed20c145,be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b,db36ed60-3b58-424a-b9a4-a9c7322975f3}, private_publication = {false}, abstract = {With recent developments of convolutional neural networks, deep learning for 3D point clouds has shown significant progress in various 3D scene understanding tasks, e.g., object recognition, object detection. In a safety-critical environment, it is however not well understood how such deep learning models are vulnerable to adversarial examples. In this work, we explore adversarial attacks for point cloud-based neural networks. We propose a general formulation for adversarial point cloud generation via $\ell_0$-norm optimisation. Our method generates adversarial examples by attacking the classification ability of the point cloud-based networks while considering the perceptibility of the examples and ensuring the minimum level of point manipulations. The proposed method is general and can be realised in different attack strategies. Experimental results show that our method achieves the state-of-the-art performance with higher than 89% and 90% of attack success on synthetic and real-world data respectively, while manipulating only about 4% of the total points.}, bibtype = {article}, author = {Kim, Jaeyeon and Hua, Binh-Son and Nguyen, Duc Thanh and Yeung, Sai-Kit} }
@article{ title = {Explaining Local, Global, And Higher-Order Interactions In Deep Learning}, type = {article}, year = {2020}, pages = {1224-1233}, websites = {http://arxiv.org/abs/2006.08601}, id = {6f42b36d-4aa7-3431-b752-3386d553fa15}, created = {2021-10-13T14:40:12.391Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:30.249Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {We present a simple yet highly generalizable method for explaining interacting parts within a neural network's reasoning process. First, we design an algorithm based on cross derivatives for computing statistical interaction effects between individual features, which is generalized to both 2-way and higher-order (3-way or more) interactions. We present results side by side with a weight-based attribution technique, corroborating that cross derivatives are a superior metric for both 2-way and higher-order interaction detection. Moreover, we extend the use of cross derivatives as an explanatory device in neural networks to the computer vision setting by expanding Grad-CAM, a popular gradient-based explanatory tool for CNNs, to the higher order. While Grad-CAM can only explain the importance of individual objects in images, our method, which we call Taylor-CAM, can explain a neural network's relational reasoning across multiple objects. We show the success of our explanations both qualitatively and quantitatively, including with a user study. We will release all code as a tool package to facilitate explainable deep learning.}, bibtype = {article}, author = {Lerman, Samuel and Xu, Chenliang and Venuto, Charles and Kautz, Henry} }
@article{ title = {Layout Generation and Completion with Self-attention}, type = {article}, year = {2020}, pages = {1004-1014}, websites = {http://arxiv.org/abs/2006.14615}, id = {ef30677d-b3d1-3550-a89a-6516f9c4d922}, created = {2021-10-13T14:40:12.439Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:30.588Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {We address the problem of layout generation for diverse domains such as images, documents, and mobile applications. A layout is a set of graphical elements, belonging to one or more categories, placed together in a meaningful way. Generating a new layout or extending an existing layout requires understanding the relationships between these graphical elements. To do this, we propose a novel framework, LayoutTransformer, that leverages a self-attention based approach to learn contextual relationships between layout elements and generate layouts in a given domain. The proposed model improves upon the state-of-the-art approaches in layout generation in four ways. First, our model can generate a new layout either from an empty set or add more elements to a partial layout starting from an initial set of elements. Second, as the approach is attention-based, we can visualize which previous elements the model is attending to predict the next element, thereby providing an interpretable sequence of layout elements. Third, our model can easily scale to support both a large number of element categories and a large number of elements per layout. Finally, the model also produces an embedding for various element categories, which can be used to explore the relationships between the categories. We demonstrate with experiments that our model can produce meaningful layouts in diverse settings such as object bounding boxes in scenes (COCO bounding boxes), documents (PubLayNet), and mobile applications (RICO dataset).}, bibtype = {article}, author = {Gupta, Kamal and Achille, Alessandro and Lazarow, Justin and Davis, Larry and Mahadevan, Vijay and Shrivastava, Abhinav} }
@article{ title = {FMODetect: Robust Detection and Trajectory Estimation of Fast Moving Objects}, type = {article}, year = {2020}, pages = {3541-3549}, websites = {http://arxiv.org/abs/2012.08216}, id = {859dc78b-fc3a-3c81-b038-948eef6342da}, created = {2021-10-13T14:40:12.520Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:25.524Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {We propose the first learning-based approach for detection and trajectory estimation of fast moving objects. Such objects are highly blurred and move over large distances within one video frame. Fast moving objects are associated with a deblurring and matting problem, also called deblatting. Instead of solving the complex deblatting problem jointly, we split the problem into matting and deblurring and solve them separately. The proposed method first detects all fast moving objects as a truncated distance function to the trajectory. Subsequently, a matting and fitting network for each detected object estimates the object trajectory and its blurred appearance without background. For the sharp appearance estimation, we propose an energy minimization based deblurring. The state-of-the-art methods are outperformed in terms of trajectory estimation and sharp appearance reconstruction. Compared to other methods, such as deblatting, the inference is of several orders of magnitude faster and allows applications such as real-time fast moving object detection and retrieval in large video collections.}, bibtype = {article}, author = {Rozumnyi, Denys and Matas, Jiri and Sroubek, Filip and Pollefeys, Marc and Oswald, Martin R.} }
@article{ title = {Are we Missing Confidence in Pseudo-LiDAR Methods for Monocular 3D Object Detection?}, type = {article}, year = {2020}, pages = {3225-3233}, websites = {http://arxiv.org/abs/2012.05796}, id = {cdc30c57-ae13-304e-9284-f0c360dcc628}, created = {2021-10-13T14:40:12.675Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:26.834Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Pseudo-LiDAR-based methods for monocular 3D object detection have received considerable attention in the community due to the performance gains exhibited on the KITTI3D benchmark, in particular on the commonly reported validation split. This generated a distorted impression about the superiority of Pseudo-LiDAR-based (PL-based) approaches over methods working with RGB images only. Our first contribution consists in rectifying this view by pointing out and showing experimentally that the validation results published by PL-based methods are substantially biased. The source of the bias resides in an overlap between the KITTI3D object detection validation set and the training/validation sets used to train depth predictors feeding PL-based methods. Surprisingly, the bias remains also after geographically removing the overlap. This leaves the test set as the only reliable set for comparison, where published PL-based methods do not excel. Our second contribution brings PL-based methods back up in the ranking with the design of a novel deep architecture which introduces a 3D confidence prediction module. We show that 3D confidence estimation techniques derived from RGB-only 3D detection approaches can be successfully integrated into our framework and, more importantly, that improved performance can be obtained with a newly designed 3D confidence measure, leading to state-of-the-art performance on the KITTI3D benchmark.}, bibtype = {article}, author = {Simonelli, Andrea and Bulò, Samuel Rota and Porzi, Lorenzo and Kontschieder, Peter and Ricci, Elisa} }
@article{ title = {Generalized autoencoder for volumetric shape generation}, type = {article}, year = {2020}, pages = {1082-1088}, volume = {2020-June}, id = {f8cfcf29-8e35-3bd2-b98d-a26cb1eeaa16}, created = {2021-10-25T06:33:57.684Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.314Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Guan2020}, folder_uuids = {a6db5ca6-7f95-48a4-bc40-9e41eea78434,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {We introduce a 3D generative shape model based on the generalized autoencoder (GAE). GAEs learn a manifold latent space from data relations explicitly provided during training. In our work, we train a GAE for volumetric shape generation from data similarities derived from the Chamfer distance, and with a loss function which is the combination of the traditional autoencoder loss and the GAE loss. We show that this shape model is able to learn more meaningful structures for the latent manifolds of different categories of shapes, and provides better interpolations between shapes when compared to previous approaches such as autoencoders and variational autoencoders.}, bibtype = {article}, author = {Guan, Yanran and Jahan, Tansin and Van Kaick, Oliver}, doi = {10.1109/CVPRW50498.2020.00142}, journal = {IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops} }
@article{ title = {An Asynchronous Kalman Filter for Hybrid Event Cameras}, type = {article}, year = {2020}, pages = {448-457}, websites = {http://arxiv.org/abs/2012.05590}, id = {61aacb99-9a40-3024-9d6b-6695fca5e41d}, created = {2021-10-30T07:28:03.725Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:28:45.293Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352}, private_publication = {false}, abstract = {Event cameras are ideally suited to capture HDR visual information without blur but perform poorly on static or slowly changing scenes. Conversely, conventional image sensors measure absolute intensity of slowly changing scenes effectively but do poorly on high dynamic range or quickly changing scenes. In this paper, we present an event-based video reconstruction pipeline for High Dynamic Range (HDR) scenarios. The proposed algorithm includes a frame augmentation pre-processing step that deblurs and temporally interpolates frame data using events. The augmented frame and event data are then fused using a novel asynchronous Kalman filter under a unifying uncertainty model for both sensors. Our experimental results are evaluated on both publicly available datasets with challenging lighting conditions and fast motions and our new dataset with HDR reference. The proposed algorithm outperforms state-of-the-art methods in both absolute intensity error (48% reduction) and image similarity indexes (average 11% improvement).}, bibtype = {article}, author = {Wang, Ziwei and Ng, Yonhon and Scheerlinck, Cedric and Mahony, Robert} }
@article{ title = {Dynamic Graph Warping Transformer for Video Alignment}, type = {article}, year = {2020}, id = {f657920c-6c6d-3051-a4ce-a05810271215}, created = {2021-11-23T08:15:32.021Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:15:35.347Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {Video alignment aims to match synchronised action information between multiple video sequences. Existing methods are typically based on supervised learning to align video frames according to annotated action phases. However, such phase-level annotation cannot effectively guide frame-level alignment, since each phase can be completed at different speeds across individuals. In this paper, we introduce dynamic warping to take between-video information into account with a new Dynamic Graph Warping Transformer (DGWT) network model. Our approach is the first Graph Transformer framework designed for video analysis and alignment. In particular, a novel dynamic warping loss function is designed to align videos of arbitrary length using attention-level features. A Temporal Segment Graph (TSG) is proposed to enable the adjacency matrix to cope with temporal information in video data. Our experimental results on two public datasets (Penn Action and Pouring) demonstrate significant improvements over state-of-the-art approaches.}, bibtype = {article}, author = {Wang, Junyan and Long, Yang and Pagnucco, Maurice and Song, Yang} }
@article{ title = {DeeperGCN: All You Need to Train Deeper GCNs}, type = {article}, year = {2020}, websites = {http://arxiv.org/abs/2006.07739}, id = {f12d77f6-02aa-3a3d-aee1-1ab8daf84275}, created = {2021-12-09T14:47:57.168Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-12-09T14:48:01.824Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {13f2c27e-5827-43b2-8a2b-d62c62bc0ecc}, private_publication = {false}, abstract = {Graph Convolutional Networks (GCNs) have been drawing significant attention with the power of representation learning on graphs. Unlike Convolutional Neural Networks (CNNs), which are able to take advantage of stacking very deep layers, GCNs suffer from vanishing gradient, over-smoothing and over-fitting issues when going deeper. These challenges limit the representation power of GCNs on large-scale graphs. This paper proposes DeeperGCN that is capable of successfully and reliably training very deep GCNs. We define differentiable generalized aggregation functions to unify different message aggregation operations (e.g. mean, max). We also propose a novel normalization layer namely MsgNorm and a pre-activation version of residual connections for GCNs. Extensive experiments on Open Graph Benchmark (OGB) show DeeperGCN significantly boosts performance over the state-of-the-art on the large scale graph learning tasks of node property prediction and graph property prediction. Please visit https://www.deepgcns.org for more information.}, bibtype = {article}, author = {Li, Guohao and Xiong, Chenxin and Thabet, Ali and Ghanem, Bernard} }
@article{ title = {G-TAD: Sub-graph localization for temporal action detection}, type = {article}, year = {2020}, pages = {10153-10162}, id = {842c2919-83a3-36c0-85d8-d5396abf6194}, created = {2021-12-09T14:47:57.169Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-12-09T14:48:04.135Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {13f2c27e-5827-43b2-8a2b-d62c62bc0ecc}, private_publication = {false}, abstract = {Temporal action detection is a fundamental yet challenging task in video understanding. Video context is a critical cue to effectively detect actions, but current works mainly focus on temporal context, while neglecting semantic context as well as other important context properties. In this work, we propose a graph convolutional network (GCN) model to adaptively incorporate multi-level semantic context into video features and cast temporal action detection as a sub-graph localization problem. Specifically, we formulate video snippets as graph nodes, snippet-snippet correlations as edges, and actions associated with context as target sub-graphs. With graph convolution as the basic operation, we design a GCN block called GCNeXt, which learns the features of each node by aggregating its context and dynamically updates the edges in the graph. To localize each sub-graph, we also design an SGAlign layer to embed each sub-graph into the Euclidean space. Extensive experiments show that G-TAD is capable of finding effective video context without extra supervision and achieves state-of-the-art performance on two detection benchmarks. On ActivityNet-1.3 it obtains an average mAP of 34.09%; on THUMOS14 it reaches 51.6% at IoU@0.5 when combined with a proposal processing method. The code has been made available at https://github.com/frostinassiky/gtad.}, bibtype = {article}, author = {Xu, Mengmeng and Zhao, Chen and Rojas, David S. and Thabet, Ali and Ghanem, Bernard}, doi = {10.1109/CVPR42600.2020.01017}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {VLG-Net: Video-Language Graph Matching Network for Video Grounding}, type = {article}, year = {2020}, pages = {3224-3234}, websites = {http://arxiv.org/abs/2011.10132}, id = {8b37092c-3730-38e9-bf75-0f4bc9a35186}, created = {2021-12-09T14:47:57.287Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-12-09T14:48:05.129Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {13f2c27e-5827-43b2-8a2b-d62c62bc0ecc}, private_publication = {false}, abstract = {Grounding language queries in videos aims at identifying the time interval (or moment) semantically relevant to a language query. The solution to this challenging task demands the understanding of videos' and queries' semantic content and the fine-grained reasoning about their multi-modal interactions. Our key idea is to recast this challenge into an algorithmic graph matching problem. Fueled by recent advances in Graph Neural Networks, we propose to leverage Graph Convolutional Networks to model video and textual information as well as their semantic alignment. To enable the mutual exchange of information across the domains, we design a novel Video-Language Graph Matching Network (VLG-Net) to match video and query graphs. Core ingredients include representation graphs, built on top of video snippets and query tokens separately, which are used for modeling the intra-modality relationships. A Graph Matching layer is adopted for cross-modal context modeling and multi-modal fusion. Finally, moment candidates are created using masked moment attention pooling by fusing the moment's enriched snippet features. We demonstrate superior performance over state-of-the-art grounding methods on three widely used datasets for temporal localization of moments in videos with natural language queries: ActivityNet-Captions, TACoS, and DiDeMo.}, bibtype = {article}, author = {Qu, Sisi and Soldan, Mattia and Xu, Mengmeng and Tegner, Jesper and Ghanem, Bernard}, number = {12} }
@article{ title = {SGAS: Sequential greedy architecture search}, type = {article}, year = {2020}, pages = {1617-1627}, id = {31ccdcd6-c3e5-3e34-a451-c02bcf792965}, created = {2021-12-09T14:47:57.359Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-12-09T14:48:05.814Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {13f2c27e-5827-43b2-8a2b-d62c62bc0ecc}, private_publication = {false}, abstract = {Architecture design has become a crucial component of successful deep learning. Recent progress in automatic neural architecture search (NAS) shows a lot of promise. However, discovered architectures often fail to generalize in the final evaluation. Architectures with a higher validation accuracy during the search phase may perform worse in the evaluation (see Figure 1). Aiming to alleviate this common issue, we introduce sequential greedy architecture search (SGAS), an efficient method for neural architecture search. By dividing the search procedure into sub-problems, SGAS chooses and prunes candidate operations in a greedy fashion. We apply SGAS to search architectures for Convolutional Neural Networks (CNN) and Graph Convolutional Networks (GCN). Extensive experiments show that SGAS is able to find state-of-the-art architectures for tasks such as image classification, point cloud classification and node classification in protein-protein interaction graphs with minimal computational cost.}, bibtype = {article}, author = {Li, Guohao and Qian, Guocheng and Delgadillo, Itzel C. and Müller, Matthias and Thabet, Ali and Ghanem, Bernard}, doi = {10.1109/CVPR42600.2020.00169}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Uncertainty-Matching Graph Neural Networks to Defend Against Poisoning Attacks}, type = {article}, year = {2020}, websites = {http://arxiv.org/abs/2009.14455}, id = {34a255df-12d5-3fed-a6ba-c01ca256f499}, created = {2022-01-05T09:23:15.556Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:12.641Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Graph Neural Networks (GNNs), a generalization of neural networks to graph-structured data, are often implemented using message passes between entities of a graph. While GNNs are effective for node classification, link prediction and graph classification, they are vulnerable to adversarial attacks, i.e., a small perturbation to the structure can lead to a non-trivial performance degradation. In this work, we propose Uncertainty Matching GNN (UM-GNN), that is aimed at improving the robustness of GNN models, particularly against poisoning attacks to the graph structure, by leveraging epistemic uncertainties from the message passing framework. More specifically, we propose to build a surrogate predictor that does not directly access the graph structure, but systematically extracts reliable knowledge from a standard GNN through a novel uncertainty-matching strategy. Interestingly, this uncoupling makes UM-GNN immune to evasion attacks by design, and achieves significantly improved robustness against poisoning attacks. Using empirical studies with standard benchmarks and a suite of global and target attacks, we demonstrate the effectiveness of UM-GNN, when compared to existing baselines including the state-of-the-art robust GCN.}, bibtype = {article}, author = {Shanthamallu, Uday Shankar and Thiagarajan, Jayaraman J. and Spanias, Andreas} }
@book{ title = {Model Extraction Attacks on Graph Neural Networks: Taxonomy and Realization}, type = {book}, year = {2020}, source = {Proceedings of the 2022 ACM Asia Conference on Computer and Communications Security (ASIA CCS '22), May 30-June 3, 2022, Nagasaki, Japan}, keywords = {Graph Neural Networks,Model Extraction Attack}, volume = {1}, issue = {1}, websites = {http://arxiv.org/abs/2010.12751}, publisher = {Association for Computing Machinery}, id = {756d69e7-9177-3330-973d-b9e251240330}, created = {2022-01-05T09:23:15.672Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:30.842Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Machine learning models are shown to face a severe threat from Model Extraction Attacks, where a well-trained private model owned by a service provider can be stolen by an attacker pretending as a client. Unfortunately, prior works focus on the models trained over the Euclidean space, e.g., images and texts, while how to extract a GNN model that contains a graph structure and node features is yet to be explored. In this paper, for the first time, we comprehensively investigate and develop model extraction attacks against GNN models. We first systematically formalise the threat modelling in the context of GNN model extraction and classify the adversarial threats into seven categories by considering different background knowledge of the attacker, e.g., attributes and/or neighbour connections of the nodes obtained by the attacker. Then we present detailed methods which utilise the accessible knowledge in each threat to implement the attacks. By evaluating over three real-world datasets, our attacks are shown to extract duplicated models effectively, i.e., 84% - 89% of the inputs in the target domain have the same output predictions as the victim model.}, bibtype = {book}, author = {Wu, Bang and Yang, Xiangwen and Pan, Shirui and Yuan, Xingliang}, doi = {10.1145/3488932.3497753} }
@article{ title = {Certified robustness of graph convolution networks for graph classification under topological attacks}, type = {article}, year = {2020}, volume = {2020-Decem}, id = {ba326853-db01-3f2f-9d19-14826be027ad}, created = {2022-01-05T09:23:15.685Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:20.166Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Graph convolution networks (GCNs) have become effective models for graph classification. Similar to many deep networks, GCNs are vulnerable to adversarial attacks on graph topology and node attributes. Recently, a number of effective attack and defense algorithms have been designed, but no certificate of robustness has been developed for GCN-based graph classification under topological perturbations with both local and global budgets. In this paper, we propose the first certificate for this problem. Our method is based on Lagrange dualization and convex envelope, which result in tight approximation bounds that are efficiently computable by dynamic programming. When used in conjunction with robust training, it allows an increased number of graphs to be certified as robust.}, bibtype = {article}, author = {Jin, Hongwei and Shi, Zhan and Peruri, Ashish and Zhang, Xinhua}, journal = {Advances in Neural Information Processing Systems}, number = {Section 5} }
@article{ title = {A restricted black-box adversarial framework towards attacking graph embedding models}, type = {article}, year = {2020}, pages = {3389-3396}, id = {2812e56e-0163-3743-9d8c-8be8f2b535fb}, created = {2022-01-05T09:23:15.718Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:21.372Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {With the great success of graph embedding model on both academic and industry area, the robustness of graph embedding against adversarial attack inevitably becomes a central problem in graph learning domain. Regardless of the fruitful progress, most of the current works perform the attack in a white-box fashion: They need to access the model predictions and labels to construct their adversarial loss. However, the inaccessibility of model predictions in real systems makes the white-box attack impractical to real graph learning system. This paper promotes current frameworks in a more general and flexible sense - we demand to attack various kinds of graph embedding model with black-box driven. To this end, we begin by investigating the theoretical connections between graph signal processing and graph embedding models in a principled way and formulate the graph embedding model as a general graph signal process with corresponding graph filter. As such, a generalized adversarial attacker: GF-Attack is constructed by the graph filter and feature matrix. Instead of accessing any knowledge of the target classifiers used in graph embedding, GF-Attack performs the attack only on the graph filter in a black-box attack fashion. To validate the generalization of GF-Attack, we construct the attacker on four popular graph embedding models. Extensive experimental results validate the effectiveness of our attacker on several benchmark datasets. Particularly by using our attack, even small graph perturbations like one-edge flip is able to consistently make a strong attack in performance to different graph embedding models.}, bibtype = {article}, author = {Chang, Heng and Rong, Yu and Xu, Tingyang and Huang, Wenbing and Zhang, Honglei and Cui, Peng and Zhu, Wenwu and Huang, Junzhou}, doi = {10.1609/aaai.v34i04.5741}, journal = {AAAI 2020 - 34th AAAI Conference on Artificial Intelligence} }
@article{ title = {Convolutional Networks with Adaptive Inference Graphs}, type = {article}, year = {2020}, keywords = {Convolutional neural networks,Gumbel-Softmax,Residual networks}, pages = {730-741}, volume = {128}, id = {1a2ccc79-a109-3a43-aa50-ae6c72488db7}, created = {2022-01-05T09:23:15.855Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:04:13.706Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Do convolutional networks really need a fixed feed-forward structure? What if, after identifying the high-level concept of an image, a network could move directly to a layer that can distinguish fine-grained differences? Currently, a network would first need to execute sometimes hundreds of intermediate layers that specialize in unrelated aspects. Ideally, the more a network already knows about an image, the better it should be at deciding which layer to compute next. In this work, we propose convolutional networks with adaptive inference graphs (ConvNet-AIG) that adaptively define their network topology conditioned on the input image. Following a high-level structure similar to residual networks (ResNets), ConvNet-AIG decides for each input image on the fly which layers are needed. In experiments on ImageNet we show that ConvNet-AIG learns distinct inference graphs for different categories. Both ConvNet-AIG with 50 and 101 layers outperform their ResNet counterpart, while using 20 % and 38 % less computations respectively. By grouping parameters into layers for related classes and only executing relevant layers, ConvNet-AIG improves both efficiency and overall classification quality. Lastly, we also study the effect of adaptive inference graphs on the susceptibility towards adversarial examples. We observe that ConvNet-AIG shows a higher robustness than ResNets, complementing other known defense mechanisms.}, bibtype = {article}, author = {Veit, Andreas and Belongie, Serge}, doi = {10.1007/s11263-019-01190-4}, journal = {International Journal of Computer Vision}, number = {3} }
@article{ title = {GNNGUARD: Defending graph neural networks against adversarial attacks}, type = {article}, year = {2020}, volume = {2020-Decem}, id = {24060d95-9087-3d08-bb5d-1c103053458e}, created = {2022-01-05T09:23:15.861Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:23:34.121Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Deep learning methods for graphs achieve remarkable performance across a variety of domains. However, recent findings indicate that small, unnoticeable perturbations of graph structure can catastrophically reduce performance of even the strongest and most popular Graph Neural Networks (GNNs). Here, we develop GNNGUARD, a general algorithm to defend against a variety of training-time attacks that perturb the discrete graph structure. GNNGUARD can be straightforwardly incorporated into any GNN. Its core principle is to detect and quantify the relationship between the graph structure and node features, if one exists, and then exploit that relationship to mitigate negative effects of the attack. GNNGUARD learns how to best assign higher weights to edges connecting similar nodes while pruning edges between unrelated nodes. The revised edges allow for robust propagation of neural messages in the underlying GNN. GNNGUARD introduces two novel components, the neighbor importance estimation, and the layer-wise graph memory, and we show empirically that both components are necessary for a successful defense. Across five GNNs, three defense methods, and four datasets, including a challenging human disease graph, experiments show that GNNGUARD outperforms existing defense approaches by 15.3% on average. Remarkably, GNNGUARD can effectively restore state-of-the-art performance of GNNs in the face of various adversarial attacks, including targeted and non-targeted attacks, and can defend against attacks on heterophily graphs.}, bibtype = {article}, author = {Zhang, Xiang and Zitnik, Marinka}, journal = {Advances in Neural Information Processing Systems}, number = {NeurIPS} }
@article{ title = {Graph Structure Learning for Robust Graph Neural Networks}, type = {article}, year = {2020}, keywords = {adversarial attacks,graph neural networks,robustness in machine learning}, pages = {66-74}, id = {f8afccc8-ff28-3d9e-8978-16e99f7ae109}, created = {2022-01-05T09:23:15.885Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:23:35.862Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Graph Neural Networks (GNNs) are powerful tools in representation learning for graphs. However, recent studies show that GNNs are vulnerable to carefully-crafted perturbations, called adversarial attacks. Adversarial attacks can easily fool GNNs in making predictions for downstream tasks. The vulnerability to adversarial attacks has raised increasing concerns for applying GNNs in safety-critical applications. Therefore, developing robust algorithms to defend adversarial attacks is of great significance. A natural idea to defend adversarial attacks is to clean the perturbed graph. It is evident that real-world graphs share some intrinsic properties. For example, many real-world graphs are low-rank and sparse, and the features of two adjacent nodes tend to be similar. In fact, we find that adversarial attacks are likely to violate these graph properties. Therefore, in this paper, we explore these properties to defend adversarial attacks on graphs. In particular, we propose a general framework Pro-GNN, which can jointly learn a structural graph and a robust graph neural network model from the perturbed graph guided by these properties. Extensive experiments on real-world graphs demonstrate that the proposed framework achieves significantly better performance compared with the state-of-the-art defense methods, even when the graph is heavily perturbed. We release the implementation of Pro-GNN to our DeepRobust repository for adversarial attacks and defenses. The specific experimental settings to reproduce our results can be found in https://github.com/ChandlerBang/Pro-GNN.}, bibtype = {article}, author = {Jin, Wei and Ma, Yao and Liu, Xiaorui and Tang, Xianfeng and Wang, Suhang and Tang, Jiliang}, doi = {10.1145/3394486.3403049}, journal = {Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery and Data Mining} }
@article{ title = {Information Obfuscation of Graph Neural Networks}, type = {article}, year = {2020}, websites = {http://arxiv.org/abs/2009.13504}, id = {1c06be90-eba9-3508-8cfd-5a86dce8299b}, created = {2022-01-05T09:23:15.949Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:23:37.037Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {While the advent of Graph Neural Networks (GNNs) has greatly improved node and graph representation learning in many applications, the neighborhood aggregation scheme exposes additional vulnerabilities to adversaries seeking to extract node-level information about sensitive attributes. In this paper, we study the problem of protecting sensitive attributes by information obfuscation when learning with graph structured data. We propose a framework to locally filter out pre-determined sensitive attributes via adversarial training with the total variation and the Wasserstein distance. Our method creates a strong defense against inference attacks, while only suffering small loss in task performance. Theoretically, we analyze the effectiveness of our framework against a worst-case adversary, and characterize an inherent trade-off between maximizing predictive accuracy and minimizing information leakage. Experiments across multiple datasets from recommender systems, knowledge graphs and quantum chemistry demonstrate that the proposed approach provides a robust defense across various graph structures and tasks, while producing competitive GNN encoders for downstream tasks.}, bibtype = {article}, author = {Liao, Peiyuan and Zhao, Han and Xu, Keyulu and Jaakkola, Tommi and Gordon, Geoffrey and Jegelka, Stefanie and Salakhutdinov, Ruslan} }
@article{ title = {Adversarial Attacks on Graph Neural Networks via Node Injections: A Hierarchical Reinforcement Learning Approach}, type = {article}, year = {2020}, keywords = {Adversarial Attack,Graph Poisoning,Reinforcement learning;}, pages = {673-683}, id = {5489fbb0-49bf-34df-a654-1177a77a43a4}, created = {2022-01-05T09:23:16.008Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:23:44.273Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Graph Neural Networks (GNN) offer the powerful approach to node classification in complex networks across many domains including social media, E-commerce, and FinTech. However, recent studies show that GNNs are vulnerable to attacks aimed at adversely impacting their node classification performance. Existing studies of adversarial attacks on GNN focus primarily on manipulating the connectivity between existing nodes, a task that requires greater effort on the part of the attacker in real-world applications. In contrast, it is much more expedient on the part of the attacker to inject adversarial nodes, e.g., fake profiles with forged links, into existing graphs so as to reduce the performance of the GNN in classifying existing nodes. Hence, we consider a novel form of node injection poisoning attacks on graph data. We model the key steps of a node injection attack, e.g., establishing links between the injected adversarial nodes and other nodes, choosing the label of an injected node, etc. by a Markov Decision Process. We propose a novel reinforcement learning method for Node Injection Poisoning Attacks (NIPA), to sequentially modify the labels and links of the injected nodes, without changing the connectivity between existing nodes. Specifically, we introduce a hierarchical Q-learning network to manipulate the labels of the adversarial nodes and their links with other nodes in the graph, and design an appropriate reward function to guide the reinforcement learning agent to reduce the node classification performance of GNN. The results of the experiments show that NIPA is consistently more effective than the baseline node injection attack methods for poisoning graph data on three benchmark datasets.}, bibtype = {article}, author = {Sun, Yiwei and Wang, Suhang and Tang, Xianfeng and Hsieh, Tsung Yu and Honavar, Vasant}, doi = {10.1145/3366423.3380149}, journal = {The Web Conference 2020 - Proceedings of the World Wide Web Conference, WWW 2020} }
@article{ title = {Defending Graph Convolutional Networks Against Adversarial Attacks}, type = {article}, year = {2020}, keywords = {Deep neural networks,dithering,graph convolutional networks,graph signals,robust learning}, pages = {8469-8473}, volume = {2020-May}, publisher = {IEEE}, id = {337d3444-2700-32a2-a1af-5e123f625b67}, created = {2022-01-05T09:23:16.022Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:23:46.573Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {The interconnection of social, email, and media platforms enables adversaries to manipulate networked data and promote their malicious intents. This paper introduces graph neural network architectures that are robust to perturbed networked data. The novel network utilizes a randomization layer that performs link-dithering (LD) by adding or removing links with probabilities selected to boost robustness. The resultant link-dithered auxiliary graphs are leveraged by an adaptive (A)GCN that performs SSL. The proposed robust LD-AGCN achieves performance gains relative to GCNs under perturbed network data.}, bibtype = {article}, author = {Ioannidis, Vassilis N. and Giannakis, Georgios B.}, doi = {10.1109/ICASSP40776.2020.9054325}, journal = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings} }
@article{ title = {Transferring robustness for graph neural network against poisoning attacks}, type = {article}, year = {2020}, keywords = {Adversarial defense,Robust graph neural networks}, pages = {600-608}, id = {b5ec41f5-e02a-3699-9c41-bb6585f962f2}, created = {2022-01-05T09:23:16.069Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:23:48.031Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Graph neural networks (GNNs) are widely used in many applications. However, their robustness against adversarial attacks is criticized. Prior studies show that using unnoticeable modifications on graph topology or nodal features can significantly reduce the performances of GNNs. It is very challenging to design robust graph neural networks against poisoning attack and several efforts have been taken. Existing work aims at reducing the negative impact from adversarial edges only with the poisoned graph, which is sub-optimal since they fail to discriminate adversarial edges from normal ones. On the other hand, clean graphs from similar domains as the target poisoned graph are usually available in the real world. By perturbing these clean graphs, we create supervised knowledge to train the ability to detect adversarial edges so that the robustness of GNNs is elevated. However, such potential for clean graphs is neglected by existing work. To this end, we investigate a novel problem of improving the robustness of GNNs against poisoning attacks by exploring clean graphs. Specifically, we propose PA-GNN, which relies on a penalized aggregation mechanism that directly restrict the negative impact of adversarial edges by assigning them lower attention coefficients. To optimize PA-GNN for a poisoned graph, we design a meta-optimization algorithm that trains PA-GNN to penalize perturbations using clean graphs and their adversarial counterparts, and transfers such ability to improve the robustness of PA-GNN on the poisoned graph. Experimental results on four real-world datasets demonstrate the robustness of PA-GNN against poisoning attacks on graphs.}, bibtype = {article}, author = {Tang, Xianfeng and Li, Yandong and Sun, Yiwei and Yao, Huaxiu and Mitra, Prasenjit and Wang, Suhang}, doi = {10.1145/3336191.3371851}, journal = {WSDM 2020 - Proceedings of the 13th International Conference on Web Search and Data Mining} }
@article{ title = {Adversarial Attacks on Graph Neural Networks: Perturbations and their Patterns}, type = {article}, year = {2020}, keywords = {Relational data,adversarial attacks,graph neural networks,poisoning attacks}, volume = {14}, id = {0a116fb1-5532-307e-91a7-519b0d91b2c1}, created = {2022-01-05T09:23:16.133Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:23:38.009Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Deep learning models for graphs have achieved strong performance for the task of node classification. Despite their proliferation, little is known about their robustness to adversarial attacks. Yet, in domains where they are likely to be used, e.g., the web, adversaries are common. Can deep learning models for graphs be easily fooled? In this work, we present a study of adversarial attacks on attributed graphs, specifically focusing on models exploiting ideas of graph convolutions. In addition to attacks at test time, we tackle the more challenging class of poisoning/causative attacks, which focus on the training phase of a machine learning model. We generate adversarial perturbations targeting the node's features and the graph structure, thus taking the dependencies between instances into account. Moreover, we ensure that the perturbations remain unnoticeable by preserving important data characteristics. To cope with the underlying discrete domain, we propose an efficient algorithm Nettack exploiting incremental computations. Our experimental study shows that accuracy of node classification significantly drops even when performing only few perturbations. Even more, our attacks are transferable: the learned attacks generalize to other state-of-the-art node classification models and unsupervised approaches, and likewise are successful even when only limited knowledge about the graph is given. For the first time, we successfully identify important patterns of adversarial attacks on graph neural networks (GNNs)-a first step towards being able to detect adversarial attacks on GNNs.}, bibtype = {article}, author = {Zügner, Daniel and Borchert, Oliver and Akbarnejad, Amir and Günnemann, Stephan}, doi = {10.1145/3394520}, journal = {ACM Transactions on Knowledge Discovery from Data}, number = {5} }
@article{ title = {All you need is Low (rank): Defending against adversarial attacks on graphs}, type = {article}, year = {2020}, keywords = {Adversarial machine learning,Graph convolutional networks,Graph mining,Graph representation learning,Tensors}, pages = {169-177}, id = {d2b67d69-f384-3c7a-a604-0733d7b26ec3}, created = {2022-01-05T09:23:16.169Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:23:42.621Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Recent studies have demonstrated that machine learning approaches like deep learning methods are easily fooled by adversarial attacks. Recently, a highly-influential study examined the impact of adversarial attacks on graph data and demonstrated that graph embedding techniques are also vulnerable to adversarial attacks. Fake users on social media and fake product reviews are examples of perturbations in graph data that are realistic counterparts of the adversarial models proposed. Graphs are widely used in a variety of domains and it is highly important to develop graph analysis techniques that are robust to adversarial attacks. One of the recent studies on generating adversarial attacks for graph data is Nettack. The Nettack model has shown to be very successful in deceiving the Graph Convolutional Network (GCN) model. Nettack is also transferable to other node classification approaches e.g. node embeddings. In this paper, we explore the properties of Nettack perturbations, in search for effective defenses against them. Our first finding is that Nettack demonstrates a very specific behavior in the spectrum of the graph: only high-rank (low-valued) singular components of the graph are affected. Following that insight, we show that a low-rank approximation of the graph, that uses only the top singular components for its reconstruction, can greatly reduce the effects of Nettack and boost the performance of GCN when facing adversarial attacks. Indicatively, on the CiteSeer dataset, our proposed defense mechanism is able to reduce the success rate of Nettack from 98% to 36%. Furthermore, we show that tensor-based node embeddings, which by default project the graph into a low-rank subspace, are robust against Nettack perturbations. Lastly, we propose LowBlow, a low-rank adversarial attack which is able to affect the classification performance of both GCN and tensor-based node embeddings and we show that the low-rank attack is noticeable and making it unnoticeable results in a high-rank attack.}, bibtype = {article}, author = {Entezari, Negin and Al-Sayouri, Saba A. and Darvishzadeh, Amirali and Papalexakis, Evangelos E.}, doi = {10.1145/3336191.3371789}, journal = {WSDM 2020 - Proceedings of the 13th International Conference on Web Search and Data Mining} }
@article{ title = {Adversarial Attacks and Defenses on Graphs: A Review, A Tool and Empirical Studies}, type = {article}, year = {2020}, websites = {http://arxiv.org/abs/2003.00653}, id = {2152dd23-3e1a-3705-85c7-244b4980a8d6}, created = {2022-01-05T09:23:16.170Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:23:43.290Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Deep neural networks (DNNs) have achieved significant performance in various tasks. However, recent studies have shown that DNNs can be easily fooled by small perturbation on the input, called adversarial attacks. As the extensions of DNNs to graphs, Graph Neural Networks (GNNs) have been demonstrated to inherit this vulnerability. Adversary can mislead GNNs to give wrong predictions by modifying the graph structure such as manipulating a few edges. This vulnerability has arisen tremendous concerns for adapting GNNs in safety-critical applications and has attracted increasing research attention in recent years. Thus, it is necessary and timely to provide a comprehensive overview of existing graph adversarial attacks and the countermeasures. In this survey, we categorize existing attacks and defenses, and review the corresponding state-of-the-art methods. Furthermore, we have developed a repository with representative algorithms (https://github.com/DSE-MSU/DeepRobust/tree/master/deeprobust/graph). The repository enables us to conduct empirical studies to deepen our understandings on attacks and defenses on graphs.}, bibtype = {article}, author = {Jin, Wei and Li, Yaxin and Xu, Han and Wang, Yiqi and Ji, Shuiwang and Aggarwal, Charu and Tang, Jiliang} }
@article{ title = {Towards more practical adversarial attacks on graph neural networks}, type = {article}, year = {2020}, volume = {2020-Decem}, id = {8a31483b-ed57-341d-bc5b-d777bed9784e}, created = {2022-01-05T09:23:16.269Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:23:39.979Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {We study the black-box attacks on graph neural networks (GNNs) under a novel and realistic constraint: attackers have access to only a subset of nodes in the network, and they can only attack a small number of them. A node selection step is essential under this setup. We demonstrate that the structural inductive biases of GNN models can be an effective source for this type of attacks. Specifically, by exploiting the connection between the backward propagation of GNNs and random walks, we show that the common gradient-based white-box attacks can be generalized to the black-box setting via the connection between the gradient and an importance score similar to PageRank. In practice, we find attacks based on this importance score indeed increase the classification loss by a large margin, but they fail to significantly increase the mis-classification rate. Our theoretical and empirical analyses suggest that there is a discrepancy between the loss and mis-classification rate, as the latter presents a diminishing-return pattern when the number of attacked nodes increases. Therefore, we propose a greedy procedure to correct the importance score that takes into account of the diminishing-return pattern. Experimental results show that the proposed procedure can significantly increase the mis-classification rate of common GNNs on real-world data without access to model parameters nor predictions.}, bibtype = {article}, author = {Ma, Jiaqi and Ding, Shuangrui and Mei, Qiaozhu}, journal = {Advances in Neural Information Processing Systems}, number = {NeurIPS} }
@article{ title = {DeepRobust: A PyTorch Library for Adversarial Attacks and Defenses}, type = {article}, year = {2020}, pages = {1-19}, websites = {http://arxiv.org/abs/2005.06149}, id = {6e7f12ac-7bc3-3231-8c14-e1ebc4754896}, created = {2022-01-05T09:23:16.302Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:23:55.053Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {DeepRobust is a PyTorch adversarial learning library which aims to build a comprehensive and easy-to-use platform to foster this research field. It currently contains more than 10 attack algorithms and 8 defense algorithms in image domain and 9 attack algorithms and 4 defense algorithms in graph domain, under a variety of deep learning architectures. In this manual, we introduce the main contents of DeepRobust with detailed instructions. The library is kept updated and can be found at https://github.com/DSE-MSU/DeepRobust.}, bibtype = {article}, author = {Li, Yaxin and Jin, Wei and Xu, Han and Tang, Jiliang} }
@article{ title = {Exploratory adversarial attacks on graph neural networks}, type = {article}, year = {2020}, keywords = {Gradient-based attacks,Graph neural networks,Maximal gradient,Semi-supervised node classification}, pages = {1136-1141}, volume = {2020-Novem}, id = {53a5d0e2-c268-30ba-8e7d-f1cc80ed6dd6}, created = {2022-01-05T09:23:16.356Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:23:50.621Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Graph neural networks (GNNs) have been successfully used to analyze non-Euclidean network data. Recently, a number of works have emerged that investigate the robustness of GNNs by adding adversarial noise to the graph topology, where gradient-based attacks are widely studied due to their inherent efficiency and high effectiveness. However, gradient-based attacks often lead to sub-optimal results due to the discrete structure of graph data. To this end, we design a novel exploratory adversarial attack (termed EpoAtk) to boost the gradient-based perturbations on graphs. The exploratory strategy in EpoAtk includes three phases, generation, evaluation and recombination, with the goal of sidestepping the possible misinformation that the maximal gradient provides. In experiments, EpoAtk is evaluated on benchmark datasets for the task of semi-supervised node classification in different attack settings. Experimental results demonstrate that the proposed method significantly outperforms the state-of-the-art attacks with the same attack budgets. Our reproducible code is available at https://github.com/EpoAtk/EpoAtk.}, bibtype = {article}, author = {Lin, Xixun and Zhou, Chuan and Yang, Hong and Wu, Jia and Wang, Haibo and Cao, Yanan and Wang, Bin}, doi = {10.1109/ICDM50108.2020.00138}, journal = {Proceedings - IEEE International Conference on Data Mining, ICDM}, number = {Icdm} }
@article{ title = {DefenseVGAE: Defending against Adversarial Attacks on Graph Data via a Variational Graph Autoencoder}, type = {article}, year = {2020}, keywords = {adversarial defense,graph neural networks,variational}, websites = {http://arxiv.org/abs/2006.08900}, id = {1d5c0c55-9b25-3f98-9c9f-79d03b7f8d09}, created = {2022-01-05T09:23:16.607Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:23:59.432Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Graph neural networks (GNNs) achieve remarkable performance for tasks on graph data. However, recent works show they are extremely vulnerable to adversarial structural perturbations, making their outcomes unreliable. In this paper, we propose DefenseVGAE, a novel framework leveraging variational graph autoencoders(VGAEs) to defend GNNs against such attacks. DefenseVGAE is trained to reconstruct graph structure. The reconstructed adjacency matrix can reduce the effects of adversarial perturbations and boost the performance of GCNs when facing adversarial attacks. Our experiments on a number of datasets show the effectiveness of the proposed method under various threat models. Under some settings it outperforms existing defense strategies. Our code has been made publicly available at https://github.com/zhangao520/defense-vgae.}, bibtype = {article}, author = {Zhang, Ao and Ma, Jinwen} }
@article{ title = {Graph Adversarial Attacks and Defense: An Empirical Study on Citation Graph}, type = {article}, year = {2020}, keywords = {graph adversarial attacks,graph convolutional network,graph defense,graph neural network}, pages = {2553-2562}, id = {8edb066e-7e21-32c2-9a20-539686513312}, created = {2022-01-05T09:23:16.746Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:17.784Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {This paper details the methodologies and decision-making processes used while developing the attacking and defending models for the Graph Adversarial Attacks and Defense task applied to a large citation graph. To handle the large graphs, our attack strategy is twofold: 1) randomly attack the structure first, 2) keep the structure unchanged, then continue the attack on the features using the gradient-based method. On the other hand, the defender is based on 1) filtering and normalizing the feature data, 2) applying the Graph Convolutional Network model, and 3) selecting the models with the highest accuracy and robustness based on our own attacking data. We applied these strategies in the KDD Cup 2020 Graph Adversarial Attacks and Defense dataset. The attacker can drop the accuracy of a surrogate 2-layer Graph Convolutional Network model from 60% to 30% on the test set. Our defending model has 68% accuracy on the validated data and keeps 89% of the target labels the same while fake nodes, generated by our attacking method, are added to the graph.}, bibtype = {article}, author = {Pham, Chau and Pham, Vung and Dang, Tommy}, doi = {10.1109/BigData50022.2020.9377988}, journal = {Proceedings - 2020 IEEE International Conference on Big Data, Big Data 2020} }
@article{ title = {Uncertainty-aware Attention Graph Neural Network for Defending Adversarial Attacks}, type = {article}, year = {2020}, websites = {http://arxiv.org/abs/2009.10235}, id = {8efdd4f8-7262-39ba-bb81-426ecc0ae54c}, created = {2022-01-05T09:23:16.763Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:19.256Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {With the increasing popularity of graph-based learning, graph neural networks (GNNs) emerge as the essential tool for gaining insights from graphs. However, unlike the conventional CNNs that have been extensively explored and exhaustively tested, people are still worrying about the GNNs' robustness under the critical settings, such as financial services. The main reason is that existing GNNs usually serve as a black-box in predicting and do not provide the uncertainty on the predictions. On the other side, the recent advancement of Bayesian deep learning on CNNs has demonstrated its success of quantifying and explaining such uncertainties to fortify CNN models. Motivated by these observations, we propose UAG, the first systematic solution to defend adversarial attacks on GNNs through identifying and exploiting hierarchical uncertainties in GNNs. UAG develops a Bayesian Uncertainty Technique (BUT) to explicitly capture uncertainties in GNNs and further employs an Uncertainty-aware Attention Technique (UAT) to defend adversarial attacks on GNNs. Intensive experiments show that our proposed defense approach outperforms the state-of-the-art solutions by a significant margin.}, bibtype = {article}, author = {Feng, Boyuan and Wang, Yuke and Wang, Zheng and Ding, Yufei} }
@article{ title = {Adversarial autoencoders for compact representations of 3D point clouds}, type = {article}, year = {2020}, keywords = {Adversarial Autoencoders,Adversarial Learning,Deep Learning,Neural Networks,Point Clouds,Representation Learning}, pages = {102921}, volume = {193}, websites = {https://doi.org/10.1016/j.cviu.2020.102921}, publisher = {Elsevier Inc.}, id = {620834c9-015e-3aaa-93da-de06bb0472d1}, created = {2022-01-05T10:55:41.304Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.714Z}, read = {true}, starred = {true}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Zamorski2020}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a,1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {Deep generative architectures provide a way to model not only images but also complex, 3-dimensional objects, such as point clouds. In this work, we present a novel method to obtain meaningful representations of 3D shapes that can be used for challenging tasks, including 3D points generation, reconstruction, compression, and clustering. Contrary to existing methods for 3D point cloud generation that train separate decoupled models for representation learning and generation, our approach is the first end-to-end solution that allows to simultaneously learn a latent space of representation and generate 3D shape out of it. Moreover, our model is capable of learning meaningful compact binary descriptors with adversarial training conducted on a latent space. To achieve this goal, we extend a deep Adversarial Autoencoder model (AAE) to accept 3D input and create 3D output. Thanks to our end-to-end training regime, the resulting method called 3D Adversarial Autoencoder (3dAAE) obtains either binary or continuous latent space that covers a much broader portion of training data distribution. Finally, our quantitative evaluation shows that 3dAAE provides state-of-the-art results for 3D points clustering and 3D object retrieval.}, bibtype = {article}, author = {Zamorski, Maciej and Zięba, Maciej and Klukowski, Piotr and Nowak, Rafał and Kurach, Karol and Stokowiec, Wojciech and Trzciński, Tomasz}, doi = {10.1016/j.cviu.2020.102921}, journal = {Computer Vision and Image Understanding}, number = {January} }
@article{ title = {Learning geometry-image representation for 3D point cloud generation}, type = {article}, year = {2020}, websites = {http://arxiv.org/abs/2011.14289}, id = {7aa3b7eb-20eb-32e4-850d-6f305cfb6261}, created = {2022-01-05T11:42:39.153Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-10T16:03:27.839Z}, read = {true}, starred = {true}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Wang2020}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {We study the problem of generating point clouds of 3D objects. Instead of discretizing the object into 3D voxels with huge computational cost and resolution limitations, we propose a novel geometry image based generator (GIG) to convert the 3D point cloud generation problem to a 2D geometry image generation problem. Since the geometry image is a completely regular 2D array that contains the surface points of the 3D object, it leverages both the regularity of the 2D array and the geodesic neighborhood of the 3D surface. Thus, one significant benefit of our GIG is that it allows us to directly generate the 3D point clouds using efficient 2D image generation networks. Experiments on both rigid and non-rigid 3D object datasets have demonstrated the promising performance of our method to not only create plausible and novel 3D objects, but also learn a probabilistic latent space that well supports the shape editing like interpolation and arithmetic.}, bibtype = {article}, author = {Wang, Lei and Huang, Yuchun and Tao, Pengjie and Hou, Yaolin and Liu, Yuxuan} }
@article{ title = {Feature Visualization for 3D Point Cloud Autoencoders}, type = {article}, year = {2020}, keywords = {autoencoder,deep learning,feature visualization}, id = {cd900c3d-4aa6-31dd-a2bc-399087c3659a}, created = {2022-01-05T11:42:39.269Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:10.211Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Rios2020}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a,1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {In order to reduce the dimensionality of 3D point cloud representations, autoencoder architectures generate increasingly abstract, compressed features of the input data. Visualizing these features is central to understanding the learning process; however, while successful visualization techniques exist for neural networks applied to computer vision tasks, similar methods for geometric, especially non-Euclidean, input data are currently lacking. Hence, we propose a first-of-its-kind method to project the features learned by point cloud autoencoders into a 3D-space augmented with color maps. Our proposal explores the properties of 1D-convolutions, used in state-of-the-art point cloud autoencoder architectures to handle the input data, which leads to an intuitive interpretation of the visualized features. Furthermore, we tackle the search for relevant co-activations in the feature space by clustering the input data in the latent space, where we explore the correspondence between network features and geometric characteristics of typical shapes of the clusters. We tested our approach with experiments on a benchmark data set, and with three different configurations of a point cloud autoencoder, where we show that the features learned by the autoencoder correlate with the occupancy of the input space by the training data.}, bibtype = {article}, author = {Rios, Thiago and Van Stein, Bas and Menzel, Stefan and Back, Thomas and Sendhoff, Bernhard and Wollstadt, Patricia}, doi = {10.1109/IJCNN48605.2020.9207326}, journal = {Proceedings of the International Joint Conference on Neural Networks} }
@article{ title = {Understanding Spectral Graph Neural Network}, type = {article}, year = {2020}, keywords = {graph neural network,spectral graph theory}, websites = {http://arxiv.org/abs/2012.06660,http://dx.doi.org/10.13140/RG.2.2.27579.03364/1}, id = {9f6be84d-0b50-342a-84a7-f6eb412a4a5d}, created = {2022-01-14T16:04:12.044Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:04:27.011Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {The graph neural networks have developed by leaps and bounds in recent years due to the restriction of traditional convolutional filters on non-Euclidean structured data. Spectral graph theory mainly studies fundamental graph properties using algebraic methods to analyze the spectrum of the adjacency matrix of a graph, which lays the foundation of graph convolutional neural networks. This report is more than notes and self-contained which comes from my Ph.D. first-year report literature review part, it illustrates how to link fundamentals of spectral graph theory to graph convolutional neural network theory, and discusses the major spectral-based graph convolutional neural networks. The practical applications of the graph neural networks defined in the spectral domain is also reviewed.}, bibtype = {article}, author = {Chen, Xinye}, doi = {10.13140/RG.2.2.27579.03364/1} }
@article{ title = {Stability Properties of Graph Neural Networks}, type = {article}, year = {2020}, keywords = {Graph convolutions,graph filters,graph neural networks,graph signal processing,network data,stability}, pages = {5680-5695}, volume = {68}, id = {8318e2f7-5ad7-372b-bbc5-e24b6b9244b8}, created = {2022-01-14T16:04:12.056Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:04:36.666Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Graph neural networks (GNNs) have emerged as a powerful tool for nonlinear processing of graph signals, exhibiting success in recommender systems, power outage prediction, and motion planning, among others. GNNs consist of a cascade of layers, each of which applies a graph convolution, followed by a pointwise nonlinearity. In this work, we study the impact that changes in the underlying topology have on the output of the GNN. First, we show that GNNs are permutation equivariant, which implies that they effectively exploit internal symmetries of the underlying topology. Then, we prove that graph convolutions with integral Lipschitz filters, in combination with the frequency mixing effect of the corresponding nonlinearities, yields an architecture that is both stable to small changes in the underlying topology, and discriminative of information located at high frequencies. These are two properties that cannot simultaneously hold when using only linear graph filters, which are either discriminative or stable, thus explaining the superior performance of GNNs.}, bibtype = {article}, author = {Gama, Fernando and Bruna, Joan and Ribeiro, Alejandro}, doi = {10.1109/TSP.2020.3026980}, journal = {IEEE Transactions on Signal Processing} }
@article{ title = {On the Stability of Polynomial Spectral Graph Filters}, type = {article}, year = {2020}, keywords = {Graph signal processing,spectral graph filters,stability analysis}, pages = {5350-5354}, volume = {2020-May}, publisher = {IEEE}, id = {39562172-27da-3666-a7f4-a91add2042ca}, created = {2022-01-14T16:04:12.082Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-18T13:30:24.181Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Spectral graph filters are a key component in state-of-the-art machine learning models used for graph-based learning, such as graph neural networks. For certain tasks stability of the spectral graph filters is important for learning suitable representations. Understanding the type of structural perturbation to which spectral graph filters are robust lets us reason as to when we may expect them to be well suited to a learning task. In this work, we first prove that polynomial graph filters are stable with respect to the change in the normalised graph Laplacian matrix. We then show empirically that properties of a structural perturbation, specifically the relative locality of the edges removed in a binary graph, effect the change in the normalised graph Laplacian. Together, our results have implications on designing robust graph filters and representations under structural perturbation.}, bibtype = {article}, author = {Kenlay, Henry and Thanou, Dorina and Dong, Xiaowen}, doi = {10.1109/ICASSP40776.2020.9054072}, journal = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings} }
@article{ title = {On the Experimental Transferability of Spectral Graph Convolutional Networks}, type = {article}, year = {2020}, id = {bc17e89e-929c-3e64-b43e-1052db1c7def}, created = {2022-01-14T16:04:12.216Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-05-23T13:20:21.563Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Thesis2020}, private_publication = {false}, bibtype = {article}, author = {Thesis, Master}, number = {June} }
@article{ title = {Spatial-Spectral Smooth Graph Convolutional Network for Multispectral Point Cloud Classification}, type = {article}, year = {2020}, keywords = {3D land cover classification,Multispectral point cloud,graph convolutional network,graph smoothness,spatial-spectral graph construction}, pages = {1062-1065}, id = {927b90c4-8c0d-3bdc-a6ed-db536a940a24}, created = {2022-01-14T16:04:12.322Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:56:59.558Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Multispectral point cloud, as a new type of data containing both spectrum and spatial geometry, opens the door to three-dimensional (3D) land cover classification at a finer scale. In this paper, we model the multispectral point cloud as a spatial-spectral graph and propose a smooth graph convolutional network for multispectral point cloud classification, abbreviated 3SGCN. We construct the spectral graph and spatial graph respectively to mine patterns in spectral and spatial geometric domains. Then, the multispectral point cloud graph is generated by combining the spatial and spectral graphs. For remote sensing scene classification tasks, it is usually desirable to make the classification map relatively smooth and avoid salt and pepper noise. Heat operator is introduced to enhance the low-frequency filters and enforce the smoothness in the graph signal. Further, a graph-based smoothness prior is deployed in our loss function. Experiments are conducted on real multispectral point cloud. The experimental results demonstrate that 3SGCN can achieve significant improvements in comparison with several state-of-the-art algorithms.}, bibtype = {article}, author = {Wang, Qingwang and Zhang, Xiangrong and Gu, Yanfeng}, doi = {10.1109/IGARSS39084.2020.9324584}, journal = {International Geoscience and Remote Sensing Symposium (IGARSS)} }
@article{ title = {A survey of traditional and deep learning-based feature descriptors for high dimensional data in computer vision}, type = {article}, year = {2020}, keywords = {Computer vision,Deep learning,Feature descriptors,High dimensional}, pages = {135-170}, volume = {9}, websites = {https://doi.org/10.1007/s13735-019-00183-w}, id = {2a075c28-315b-3f0b-a29d-f7e62a33a344}, created = {2022-01-27T08:19:32.893Z}, accessed = {2022-01-27}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-28T07:27:10.506Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {Higher dimensional data such as video and 3D are the leading edge of multimedia retrieval and computer vision research. In this survey, we give a comprehensive overview and key insights into the state of the art of higher dimensional features from deep learning and also traditional approaches. Current approaches are frequently using 3D information from the sensor or are using 3D in modeling and understanding the 3D world. With the growth of prevalent application areas such as 3D games, self-driving automobiles, health monitoring and sports activity training, a wide variety of new sensors have allowed researchers to develop feature description models beyond 2D. Although higher dimensional data enhance the performance of methods on numerous tasks, they can also introduce new challenges and problems. The higher dimensionality of the data often leads to more complicated structures which present additional problems in both extracting meaningful content and in adapting it for current machine learning algorithms. Due to the major importance of the evaluation process, we also present an overview of the current datasets and benchmarks. Moreover, based on more than 330 papers from this study, we present the major challenges and future directions.}, bibtype = {article}, author = {Georgiou, Theodoros and Liu, Yu and Chen, Wei and Lew, Michael}, doi = {10.1007/s13735-019-00183-w}, journal = {International Journal of Multimedia Information Retrieval} }
@article{ title = {Incorporating Handcrafted Features into Deep Learning for Point Cloud Classification}, type = {article}, year = {2020}, keywords = {classification,deep learning,feature extraction,point cloud}, pages = {3713}, volume = {12}, websites = {https://www.mdpi.com/2072-4292/12/22/3713/htm,https://www.mdpi.com/2072-4292/12/22/3713}, month = {11}, publisher = {Multidisciplinary Digital Publishing Institute}, day = {12}, id = {89135d99-0577-3ec1-b042-c7ee2e96e7a8}, created = {2022-01-27T08:31:44.699Z}, accessed = {2022-01-27}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-28T07:27:10.292Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, notes = {<b>Main idea:</b> The main concept is that the authors use current methods like PointNet and PointNet++ to insert various handmade features into these networks and compare classification results. The impact of handmade features on point cloud categorization was investigated by the authors.<br/><br/><b>No code available!</b>}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {Point cloud classification is an important task in point cloud data analysis. Traditional point cloud classification is conducted primarily on the basis of specific handcrafted features with a specific classifier and is often capable of producing satisfactory results. However, the extraction of crucial handcrafted features hinges on sufficient knowledge of the field and substantial experience. In contrast, while powerful deep learning algorithms possess the ability to learn features automatically, it normally requires complex network architecture and a considerable amount of calculation time to attain better accuracy of classification. In order to combine the advantages of both the methods, in this study, we integrated the handcrafted features, whose benefits were confirmed by previous studies, into a deep learning network, in the hopes of solving the problem of insufficient extraction of specific features and enabling the network to recognise other effective features through automatic learning. This was done to achieve the performance of a complex model by using a simple model and fulfil the application requirements of the remote sensing domain. As indicated by the experimental results, the integration of handcrafted features into the simple and fast-calculating PointNet model could generate a classification result that bore comparison with that generated by a complex network model such as PointNet++ or KPConv.}, bibtype = {article}, author = {Hsu, Pai Hui and Zhuang, Zong Yi}, doi = {10.3390/RS12223713}, journal = {Remote Sensing 2020, Vol. 12, Page 3713}, number = {22} }
@article{ title = {D3Feat: Joint learning of dense detection and description of 3D local features}, type = {article}, year = {2020}, pages = {6358-6366}, id = {074a0c1c-9f51-30e8-a6b7-571421b71236}, created = {2022-02-01T13:16:00.292Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-01T13:16:05.181Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {8d18e62e-6e66-4acb-ae6a-b470435041d8}, private_publication = {false}, abstract = {A successful point cloud registration often lies on robust establishment of sparse matches through discriminative 3D local features. Despite the fast evolution of learning-based 3D feature descriptors, little attention has been drawn to the learning of 3D feature detectors, even less for a joint learning of the two tasks. In this paper, we leverage a 3D fully convolutional network for 3D point clouds, and propose a novel and practical learning mechanism that densely predicts both a detection score and a description feature for each 3D point. In particular, we propose a keypoint selection strategy that overcomes the inherent density variations of 3D point clouds, and further propose a self-supervised detector loss guided by the on-the-fly feature matching results during training. Finally, our method achieves state-of-the-art results in both indoor and outdoor scenarios, evaluated on 3DMatch and KITTI datasets, and shows its strong generalization ability on the ETH dataset. Towards practical use, we show that by adopting a reliable feature detector, sampling a smaller number of features is sufficient to achieve accurate and fast point cloud alignment.}, bibtype = {article}, author = {Bai, Xuyang and Luo, Zixin and Zhou, Lei and Fu, Hongbo and Quan, Long and Tai, Chiew Lan}, doi = {10.1109/CVPR42600.2020.00639}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Convolution in the cloud: Learning deformable kernels in 3D graph convolution networks for point cloud analysis}, type = {article}, year = {2020}, pages = {1797-1806}, id = {d6a6b09f-9b3d-3dbd-8d38-35ce4d7f7401}, created = {2022-02-15T11:01:25.909Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-15T11:01:32.314Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {8315fdc0-e3a9-47f0-9186-21b3433d86d2}, private_publication = {false}, abstract = {Point clouds are among the popular geometry representations for 3D vision applications. However, without regular structures like 2D images, processing and summarizing information over these unordered data points are very challenging. Although a number of previous works attempt to analyze point clouds and achieve promising performances, their performances would degrade significantly when data variations like shift and scale changes are presented. In this paper, we propose 3D Graph Convolution Networks (3D-GCN), which is designed to extract local 3D features from point clouds across scales, while shift and scale-invariance properties are introduced. The novelty of our 3D-GCN lies in the definition of learnable kernels with a graph max-pooling mechanism. We show that 3D-GCN can be applied to 3D classification and segmentation tasks, with ablation studies and visualizations verifying the design of 3D-GCN. Our code is publicly available at https://github.com/j1a0m0e4sNTU/3dgcn.}, bibtype = {article}, author = {Lin, Zhi Hao and Huang, Sheng Yu and Wang, Yu Chiang Frank}, doi = {10.1109/CVPR42600.2020.00187}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Graphite: Graph-Induced Feature Extraction for Point Cloud Registration}, type = {article}, year = {2020}, keywords = {3D Descriptors,Graph Neural Networks,Keypoint Extraction,Point Cloud Processing,Point Cloud Registration}, pages = {241-251}, id = {d04192d7-4017-3d15-9b23-ae11e9747dab}, created = {2022-02-21T06:44:13.993Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-22T06:25:24.870Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e523c16b-0594-4b52-9c4e-9052fcb9dbed}, private_publication = {false}, abstract = {3D Point clouds are a rich source of information that enjoy growing popularity in the vision community. However, due to the sparsity of their representation, learning models based on large point clouds is still a challenge. In this work, we introduce Graphite, a GRAPH-Induced feaTure Extraction pipeline, a simple yet powerful feature transform and keypoint detector. Graphite enables intensive down-sampling of point clouds with keypoint detection accompanied by a descriptor. We construct a generic graph-based learning scheme to describe point cloud regions and extract salient points. To this end, we take advantage of 6D pose information and metric learning to learn robust descriptions and keypoints across different scans. We Reformulate the 3D keypoint pipeline with graph neural networks which allow efficient processing of the point set while boosting its descriptive power which ultimately results in more accurate 3D registrations. We demonstrate our lightweight descriptor on common 3D descriptor matching and point cloud registration benchmarks [76], [71] and achieve comparable results with the state of the art. Describing 100 patches of a point cloud and detecting their keypoints takes only 0.018 seconds with our proposed network.}, bibtype = {article}, author = {Saleh, Mahdi and Dehghani, Shervin and Busam, Benjamin and Navab, Nassir and Tombari, Federico}, doi = {10.1109/3DV50981.2020.00034}, journal = {Proceedings - 2020 International Conference on 3D Vision, 3DV 2020} }
@article{ title = {PC-Net: A deep network for 3D point clouds analysis}, type = {article}, year = {2020}, pages = {465-472}, id = {47dcb0d2-68e8-3349-a082-0aeaf8995a1d}, created = {2022-02-21T06:44:13.997Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-22T06:25:24.842Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e523c16b-0594-4b52-9c4e-9052fcb9dbed}, private_publication = {false}, abstract = {Due to the irregularity and sparsity of 3D point clouds, applying convolutional neural networks directly on them can be nontrivial. In this work, we propose a simple but effective approach for 3D Point Clouds analysis, named PC-Net. PC-Net directly learns on point sets and is equipped with three new operations: first, we apply a novel scale-aware neighbor search for adaptive neighborhood extracting; second, for each neighboring point, we learn a local spatial feature as a complement to their associated features; finally, at the end we use a distance re-weighted pooling to aggregate all the features from local structure. With this module, we design hierarchical neural network for point cloud understanding. For both classification and segmentation tasks, our architecture proves effective in the experiments and our models demonstrate state-of-the-art performance over existing deep learning methods on popular point cloud benchmarks.}, bibtype = {article}, author = {Chen, Zhuo and Guan, Tao and Luo, Yawei and Wang, Yuesong and Luo, Keyang and Xu, Luoyuan}, doi = {10.1109/ICPR48806.2021.9412136}, journal = {Proceedings - International Conference on Pattern Recognition} }
@article{ title = {A Dynamic Reduction Network for Point Clouds}, type = {article}, year = {2020}, pages = {2-5}, websites = {http://arxiv.org/abs/2003.08013}, id = {8261629d-4604-33b5-96e0-59551b6169b6}, created = {2022-02-21T06:44:14.190Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-22T06:25:24.929Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e523c16b-0594-4b52-9c4e-9052fcb9dbed}, private_publication = {false}, abstract = {Classifying whole images is a classic problem in machine learning, and graph neural networks are a powerful methodology to learn highly irregular geometries. It is often the case that certain parts of a point cloud are more important than others when determining overall classification. On graph structures this started by pooling information at the end of convolutional filters, and has evolved to a variety of staged pooling techniques on static graphs. In this paper, a dynamic graph formulation of pooling is introduced that removes the need for predetermined graph structure. It achieves this by dynamically learning the most important relationships between data via an intermediate clustering. The network architecture yields interesting results considering representation size and efficiency. It also adapts easily to a large number of tasks from image classification to energy regression in high energy particle physics.}, bibtype = {article}, author = {Gray, Lindsey and Klijnsma, Thomas and Ghosh, Shamik} }
@article{ title = {Polarnet: An improved grid representation for online Lidar point clouds semantic segmentation}, type = {article}, year = {2020}, pages = {9598-9607}, id = {1089939c-3672-37ef-9908-0c47e4d1fa8d}, created = {2022-02-21T06:44:14.197Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-22T09:08:42.964Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e523c16b-0594-4b52-9c4e-9052fcb9dbed}, private_publication = {false}, abstract = {The need for fine-grained perception in autonomous driving systems has resulted in recently increased research on online semantic segmentation of single-scan LiDAR. Despite the emerging datasets and technological advancements, it remains challenging due to three reasons: (1) the need for near-real-time latency with limited hardware; (2) uneven or even long-tailed distribution of LiDAR points across space; and (3) an increasing number of extremely fine-grained semantic classes. In an attempt to jointly tackle all the aforementioned challenges, we propose a new LiDAR-specific, nearest-neighbor-free segmentation algorithm - PolarNet. Instead of using common spherical or bird's-eye-view projection, our polar bird's-eye-view representation balances the points across grid cells in a polar coordinate system, indirectly aligning a segmentation network's attention with the long-tailed distribution of the points along the radial axis. We find that our encoding scheme greatly increases the mIoU in three drastically different segmentation datasets of real urban LiDAR single scans while retaining near real-time throughput.}, bibtype = {article}, author = {Zhang, Yang and Zhou, Zixiang and David, Philip and Yue, Xiangyu and Xi, Zerong and Gong, Boqing and Foroosh, Hassan}, doi = {10.1109/CVPR42600.2020.00962}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Mesorasi: Architecture support for point cloud analytics via delayed-aggregation}, type = {article}, year = {2020}, keywords = {Accelerator,DNN,Point cloud}, pages = {1037-1050}, volume = {2020-Octob}, id = {dedf3912-e911-399d-8a44-23555bc5a3ec}, created = {2022-02-21T06:44:14.385Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-22T06:25:24.941Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e523c16b-0594-4b52-9c4e-9052fcb9dbed}, private_publication = {false}, abstract = {Point cloud analytics is poised to become a key workload on battery-powered embedded and mobile platforms in a wide range of emerging application domains, such as autonomous driving, robotics, and augmented reality, where efficiency is paramount. This paper proposes Mesorasi, an algorithm-architecture co-designed system that simultaneously improves the performance and energy efficiency of point cloud analytics while retaining its accuracy.Our extensive characterizations of state-of-the-art point cloud algorithms show that, while structurally reminiscent of convolutional neural networks (CNNs), point cloud algorithms exhibit inherent compute and memory inefficiencies due to the unique characteristics of point cloud data. We propose delayed-aggregation, a new algorithmic primitive for building efficient point cloud algorithms. Delayed-aggregation hides the performance bottlenecks and reduces the compute and memory redundancies by exploiting the approximately distributive property of key operations in point cloud algorithms. Delayed-aggregation let point cloud algorithms achieve 1.6× speedup and 51.1% energy reduction on a mobile GPU while retaining the accuracy (-0.9% loss to 1.2% gains). To maximize the algorithmic benefits, we propose minor extensions to contemporary CNN accelerators, which can be integrated into a mobile Systems-on-a-Chip (SoC) without modifying other SoC components. With additional hardware support, Mesorasi achieves up to 3.6× speedup.}, bibtype = {article}, author = {Feng, Yu and Tian, Boyuan and Xu, Tiancheng and Whatmough, Paul and Zhu, Yuhao}, doi = {10.1109/MICRO50266.2020.00087}, journal = {Proceedings of the Annual International Symposium on Microarchitecture, MICRO} }
@article{ title = {Multi-Scale Dynamic Graph Convolution Network for Point Clouds Classification}, type = {article}, year = {2020}, keywords = {Point clouds,farthest point sampling,graph convolutional neural networks,k-NN group}, pages = {65591-65598}, volume = {8}, publisher = {IEEE}, id = {9584dccb-4541-39aa-bae6-fb1840ab0d45}, created = {2022-02-21T06:44:14.469Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-22T06:25:24.933Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e523c16b-0594-4b52-9c4e-9052fcb9dbed}, private_publication = {false}, abstract = {Point clouds provide an efficient way for 3D geometric object representation. In order to deal with the classification and segmentation of point cloud, it is very important to design an efficient and intelligent model that can directly affect point cloud. Due to the irregularity of the data format, traditional convolutional neural networks cannot be applied to point clouds processing directly. Graph convolution network (GCN) has attracted more and more attention in recent years, especially in the field of non-Euclidean data processing. Point clouds processing with GCN models is an efficient and suitable method, a lot of GCN models have achieved state-of-the-art performance on irregular data processing challenges. In this paper, we propose a Multi-scale Dynamic GCN model for point clouds classification, a Farthest Point Sampling method is applied in our model firstly to efficiently cover the entire point set, it uses different scale k-NN group method to locate on k nearest neighborhood for each central node, Edge Convolution (EdgeConv) operation is used to extract and aggregate local features between neighbor connected nodes and central node. We use ModelNet40, ModelNet10 and ShapeNet part dataset to classify point clouds and segment them semantically. Experiments show that our model achieves a better performance on classification accuracy and model complexity than other state-of-the-art models.}, bibtype = {article}, author = {Zhai, Zhengli and Zhang, Xin and Yao, Luyao}, doi = {10.1109/ACCESS.2020.2985279}, journal = {IEEE Access} }
@article{ title = {Unsupervised semantic and instance segmentation of forest point clouds}, type = {article}, year = {2020}, keywords = {Component classification,Segmentation,Superpoint graph,Terrestrial LiDAR,Tree isolation}, pages = {86-97}, volume = {165}, websites = {https://doi.org/10.1016/j.isprsjprs.2020.04.020}, publisher = {Elsevier}, id = {ead8c699-2f3b-3682-b4d6-ee68f93c8a41}, created = {2022-02-23T06:26:16.850Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-23T06:32:43.956Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {5041aa39-a3cf-45bd-ada3-df1401e124f1}, private_publication = {false}, abstract = {Terrestrial Laser Scanning (TLS) has been increasingly used in forestry applications including forest inventory and plant ecology. Tree biophysical properties such as leaf area distributions and wood volumes can be accurately estimated from TLS point clouds. In these applications, a prerequisite is to properly understand the information content of large scale point clouds (i.e., semantic labelling of point clouds), so that tree-scale attributes can be retrieved. Currently, this requirement is undergoing laborious and time consuming manual works. In this work, we jointly address the problems of semantic and instance segmentation of forest point clouds. Specifically, we propose an unsupervised pipeline based on a structure called superpoint graph, to simultaneously perform two tasks: single tree isolation and leaf-wood classification. The proposed method is free from restricted assumptions of forest types. Validation using simulated data resulted in a mean Intersection over Union (mIoU) of 0.81 for single tree isolation, and an overall accuracy of 87.7% for leaf-wood classification. The single tree isolation led to a relative root mean square error (RMSE%) of 2.9% and 19.8% for tree height and crown diameter estimations, respectively. Comparisons with existing methods on other benchmark datasets showed state-of-the-art results of our method on both single tree isolation and leaf-wood classification tasks. We provide the entire framework as an open-source tool with an end-user interface. This study closes the gap for using TLS point clouds to quantify tree-scale properties in large areas, where automatic interpretation of the information content of TLS point clouds remains a crucial challenge.}, bibtype = {article}, author = {Wang, Di}, doi = {10.1016/j.isprsjprs.2020.04.020}, journal = {ISPRS Journal of Photogrammetry and Remote Sensing}, number = {April} }
@article{ title = {Linking Points With Labels in 3D}, type = {article}, year = {2020}, id = {f8800d40-1127-3a71-b718-ce70eb113c63}, created = {2022-02-24T07:10:18.729Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-24T13:55:40.902Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {1e7b477c-c241-48c3-a542-ad06e3d39dd5}, private_publication = {false}, bibtype = {article}, author = {Xie, Yuxing and Tian, Jiaojiao and Zhu, Xiao Xiang}, journal = {IEEE Geoscience and Remote Sensing Magazine}, number = {March} }
@article{ title = {Variational Autoencoder for 3D Voxel Compression}, type = {article}, year = {2020}, pages = {3-8}, volume = {2020-Novem}, id = {455f06a5-f0bb-364a-9d95-63a3b5e45f4c}, created = {2022-03-08T11:04:43.643Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:10.045Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Liu2020}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {3D scene sensing and understanding is a fundamental task in the field of computer vision and robotics. One widely used representation for 3D data is a voxel grid. However, explicit representation of 3D voxels always requires large storage space, which is not suitable for light-weight applications and scenarios such as robotic navigation and exploration. In this paper we propose a method to compress 3D voxel grids using an octree representation and Variational Autoencoders (VAEs). We first capture a 3D voxel grid, in our application with collaborating RealSense D435 and T265 cameras. The voxel grid is decomposed into three types of octants which are then compressed by the encoder and reproduced by feeding the latent code into the decoder. We demonstrate the efficiency of our method by two applications: scene reconstruction and path planning.}, bibtype = {article}, author = {Liu, Juncheng and Mills, Steven and McCane, Brendan}, doi = {10.1109/IVCNZ51579.2020.9290656}, journal = {International Conference Image and Vision Computing New Zealand} }
@article{ title = {Semantic labeling and instance segmentation of 3d point clouds using patch context analysis and multiscale processing}, type = {article}, year = {2020}, keywords = {Segmentation,clustering,labeling,patch context,scene understanding,semantics}, pages = {2485-2498}, volume = {26}, id = {03e6beb9-b6cc-3cc7-8fd7-2947772cf373}, created = {2022-03-09T09:00:28.167Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-10T10:05:34.402Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {1e7b477c-c241-48c3-a542-ad06e3d39dd5}, private_publication = {false}, abstract = {We present a novel algorithm for semantic segmentation and labeling of 3D point clouds of indoor scenes, where objects in point clouds can have significant variations and complex configurations. Effective segmentation methods decomposing point clouds into semantically meaningful pieces are highly desirable for object recognition, scene understanding, scene modeling, etc. However, existing segmentation methods based on low-level geometry tend to either under-segment or over-segment point clouds. Our method takes a fundamentally different approach, where semantic segmentation is achieved along with labeling. To cope with substantial shape variation for objects in the same category, we first segment point clouds into surface patches and use unsupervised clustering to group patches in the training set into clusters, providing an intermediate representation for effectively learning patch relationships. During testing, we propose a novel patch segmentation and classification framework with multiscale processing, where the local segmentation level is automatically determined by exploiting the learned cluster based contextual information. Our method thus produces robust patch segmentation and semantic labeling results, avoiding parameter sensitivity. We further learn object-cluster relationships from the training set, and produce semantically meaningful object level segmentation. Our method outperforms state-of-the-art methods on several representative point cloud datasets, including S3DIS, SceneNN, Cornell RGB-D and ETH.}, bibtype = {article}, author = {Hu, Shi Min and Cai, Jun Xiong and Lai, Yu Kun}, doi = {10.1109/TVCG.2018.2889944}, journal = {IEEE Transactions on Visualization and Computer Graphics}, number = {7} }
@article{ title = {PGCNet: patch graph convolutional network for point cloud segmentation of indoor scenes}, type = {article}, year = {2020}, keywords = {Edge convolution,Encoder–decoder,Graph convolutional network,Point cloud,Scene segmentation,Surface patch}, pages = {2407-2418}, volume = {36}, websites = {https://doi.org/10.1007/s00371-020-01892-8}, publisher = {Springer Berlin Heidelberg}, id = {ef05bfb2-f9a5-3de5-9331-66d54aab7a2a}, created = {2022-03-09T09:00:28.167Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-10T14:04:37.324Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {1e7b477c-c241-48c3-a542-ad06e3d39dd5}, private_publication = {false}, abstract = {Semantic segmentation of 3D point clouds is a crucial task in scene understanding and is also fundamental to indoor scene applications such as indoor navigation, mobile robotics, augmented reality. Recently, deep learning frameworks have been successfully adopted to point clouds but are limited by the size of data. While most existing works focus on individual sampling points, we use surface patches as a more efficient representation and propose a novel indoor scene segmentation framework called patch graph convolution network (PGCNet). This framework treats patches as input graph nodes and subsequently aggregates neighboring node features by dynamic graph U-Net (DGU) module, which consists of dynamic edge convolution operation inside U-shaped encoder–decoder architecture. The DGU module dynamically update graph structures at each level to encode hierarchical edge features. Incorporating PGCNet, we can segment the input scene into two types, i.e., room layout and indoor objects, which is afterward utilized to carry out final rich semantic labeling of various indoor scenes. With considerable speedup training, the proposed framework achieves effective performance equivalent to state-of-the-art for segmenting standard indoor scene dataset.}, bibtype = {article}, author = {Sun, Yuliang and Miao, Yongwei and Chen, Jiazhou and Pajarola, Renato}, doi = {10.1007/s00371-020-01892-8}, journal = {Visual Computer}, number = {10-12} }
@article{ title = {ImVoteNet: Boosting 3D Object Detection in Point Clouds with Image Votes}, type = {article}, year = {2020}, pages = {4403-4412}, id = {10268821-c242-3dd5-8492-a5dae44763d7}, created = {2022-03-09T09:35:30.329Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-09T09:35:38.890Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {bc1835e2-32e3-4f2a-b03c-9540bbbd02e0}, private_publication = {false}, abstract = {3D object detection has seen quick progress thanks to advances in deep learning on point clouds. A few recent works have even shown state-of-the-art performance with just point clouds input (e.g. VoteNet). However, point cloud data have inherent limitations. They are sparse, lack color information and often suffer from sensor noise. Images, on the other hand, have high resolution and rich texture. Thus they can complement the 3D geometry provided by point clouds. Yet how to effectively use image information to assist point cloud based detection is still an open question. In this work, we build on top of VoteNet and propose a 3D detection architecture called ImVoteNet specialized for RGB-D scenes. ImVoteNet is based on fusing 2D votes in images and 3D votes in point clouds. Compared to prior work on multi-modal detection, we explicitly extract both geometric and semantic features from the 2D images. We leverage camera parameters to lift these features to 3D. To improve the synergy of 2D-3D feature fusion, we also propose a multi-tower training scheme. We validate our model on the challenging SUN RGB-D dataset, advancing state-of-the-art results by 5.7 mAP. We also provide rich ablation studies to analyze the contribution of each design choice.}, bibtype = {article}, author = {Qi, Charles R. and Chen, Xinlei and Litany, Or and Guibas, Leonidas J.}, doi = {10.1109/CVPR42600.2020.00446}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Multi-Modal Anomaly Detection for Unstructured and Uncertain Environments}, type = {article}, year = {2020}, keywords = {anomaly detection,feature learning,field robots}, websites = {http://arxiv.org/abs/2012.08637}, id = {7460c6a9-8639-3d27-84c5-04564e89e9ee}, created = {2022-03-23T06:17:59.061Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:13.906Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Ji2020}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {To achieve high-levels of autonomy, modern robots require the ability to detect and recover from anomalies and failures with minimal human supervision. Multi-modal sensor signals could provide more information for such anomaly detection tasks; however, the fusion of high-dimensional and heterogeneous sensor modalities remains a challenging problem. We propose a deep learning neural network: supervised variational autoencoder (SVAE), for failure identification in unstructured and uncertain environments. Our model leverages the representational power of VAE to extract robust features from high-dimensional inputs for supervised learning tasks. The training objective unifies the generative model and the discriminative model, thus making the learning a one-stage procedure. Our experiments on real field robot data demonstrate superior failure identification performance than baseline methods, and that our model learns interpretable representations. Videos of our results are available on our website: https://sites.google.com/illinois.edu/supervised-vae .}, bibtype = {article}, author = {Ji, Tianchen and Vuppala, Sri Theja and Chowdhary, Girish and Driggs-Campbell, Katherine}, number = {CoRL} }
@article{ title = {Two-Stage Relation Constraint for Semantic Segmentation of Point Clouds}, type = {article}, year = {2020}, keywords = {Point clouds,Semantic Segmentation}, pages = {271-280}, id = {105ffc63-9186-39e7-a517-e2e100aff9d5}, created = {2022-03-23T06:17:59.063Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:10.521Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Yu2020}, private_publication = {false}, abstract = {Key to point cloud semantic segmentation is learning discriminative representations that capture effective relations among points. Many works add hard constraints on points through predefined convolution kernels. Motivated by the label propagation algorithm, we develop Dynamic Adjustable Group Propagation (DAGP) with a dynamic adjustable scale module approximating the distance parameter. Based on DAGP, we develop a novel Two Stage Propagation framework (TSP) to add intra-group and inter-group relation constraints on representations to enhance the discrimination of features from different group levels. We adopt a well-appreciated backbone to extract features for the input point cloud and then divide them into groups. DAGP is utilized to propagate information within each group in the first stage. To promote information dissemination between groups more efficiently, a selection strategy is introduced to select group-pairs for the second stage, which propagates labels among the selected group-pairs via DAGP. By training with this new learning architecture, the backbone network is enforced to mine relational context information within and between groups without introducing any extra computation burden during inference. Extensive experimental results show that TSP significantly improves the performance of existing popular architectures (PointNet, PointNet++, DGCNN) on large scene segmentation benchmarks (S3DIS, ScanNet) and the part segmentation dataset ShapeNet.}, bibtype = {article}, author = {Yu, Minghui and Liu, Jinxian and Ni, Bingbing and Li, Caiyuan}, doi = {10.1109/3DV50981.2020.00037}, journal = {Proceedings - 2020 International Conference on 3D Vision, 3DV 2020} }
@article{ title = {NVAE: A deep hierarchical variational autoencoder}, type = {article}, year = {2020}, pages = {1-21}, volume = {2020-Decem}, id = {a50620b6-42d2-3f09-8a47-1bb3af39eb57}, created = {2022-03-23T06:17:59.367Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:10.586Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Vahdat2020}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, abstract = {Normalizing flows, autoregressive models, variational autoencoders (VAEs), and deep energy-based models are among competing likelihood-based frameworks for deep generative learning. Among them, VAEs have the advantage of fast and tractable sampling and easy-to-access encoding networks. However, they are currently outperformed by other models such as normalizing flows and autoregressive models. While the majority of the research in VAEs is focused on the statistical challenges, we explore the orthogonal direction of carefully designing neural architectures for hierarchical VAEs. We propose Nouveau VAE (NVAE), a deep hierarchical VAE built for image generation using depth-wise separable convolutions and batch normalization. NVAE is equipped with a residual parameterization of Normal distributions and its training is stabilized by spectral regularization. We show that NVAE achieves state-of-the-art results among non-autoregressive likelihood-based models on the MNIST, CIFAR-10, CelebA 64, and CelebA HQ datasets and it provides a strong baseline on FFHQ. For example, on CIFAR-10, NVAE pushes the state-of-the-art from 2.98 to 2.91 bits per dimension, and it produces high-quality images on CelebA HQ as shown in Fig. 1. To the best of our knowledge, NVAE is the first successful VAE applied to natural images as large as 256×256 pixels. The source code is available at https://github.com/NVlabs/NVAE.}, bibtype = {article}, author = {Vahdat, Arash and Kautz, Jan}, journal = {Advances in Neural Information Processing Systems}, number = {NeurIPS} }
@article{ title = {Self-Supervised Out-of-Distribution Detection in Brain CT Scans}, type = {article}, year = {2020}, keywords = {Computer Science - Computer Vision and Pattern Rec,Electrical Engineering and Systems Science - Imag}, websites = {http://arxiv.org/abs/2011.05428}, month = {11}, id = {82bdd8b6-3f15-3a50-b08e-e485c074bcd6}, created = {2022-03-28T09:45:00.683Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:01:10.407Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {venkatakrishnanSelfSupervisedOutofDistributionDetection2020}, source_type = {article}, notes = {arXiv: 2011.05428}, private_publication = {false}, abstract = {Medical imaging data suffers from the limited availability of annotation because annotating 3D medical data is a time-consuming and expensive task. Moreover, even if the annotation is available, supervised learning-based approaches suffer highly imbalanced data. Most of the scans during the screening are from normal subjects, but there are also large variations in abnormal cases. To address these issues, recently, unsupervised deep anomaly detection methods that train the model on large-sized normal scans and detect abnormal scans by calculating reconstruction error have been reported. In this paper, we propose a novel self-supervised learning technique for anomaly detection. Our architecture largely consists of two parts: 1) Reconstruction and 2) predicting geometric transformations. By training the network to predict geometric transformations, the model could learn better image features and distribution of normal scans. In the test time, the geometric transformation predictor can assign the anomaly score by calculating the error between geometric transformation and prediction. Moreover, we further use self-supervised learning with context restoration for pretraining our model. By comparative experiments on clinical brain CT scans, the effectiveness of the proposed method has been verified.}, bibtype = {article}, author = {Venkatakrishnan, Abinav Ravi and Kim, Seong Tae and Eisawy, Rami and Pfister, Franz and Navab, Nassir}, journal = {arXiv:2011.05428 [cs, eess]} }
@article{ title = {Green AI}, type = {article}, year = {2020}, pages = {54-63}, volume = {63}, websites = {https://doi.org/10.1145/3381831}, month = {11}, id = {9b5fcdd4-5591-3cea-8c19-94154f4c5946}, created = {2022-03-28T09:45:00.808Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-06-21T21:14:58.664Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {schwartzGreenAI2020}, source_type = {article}, private_publication = {false}, abstract = {Creating efficiency in AI research will decrease its carbon footprint and increase its inclusivity as deep learning study should not require the deepest pockets.}, bibtype = {article}, author = {Schwartz, Roy and Dodge, Jesse and Smith, Noah A and Etzioni, Oren}, doi = {10.1145/3381831}, journal = {Communications of the ACM}, number = {12} }
@article{ title = {Összehasonlítás a 2D-s és 3D-s tárgyfelismerő technikák között a robot navigációban: Comparison between 2D and 3D Object Recognition Techniques for Mobile Robot Navigation}, type = {article}, year = {2020}, keywords = {space mapping}, pages = {137-141}, websites = {https://ojs.emt.ro/index.php/enelko-szamokt/article/view/330}, month = {10}, id = {f5a6e69b-e5c1-38cf-bed9-cdeea887af1a}, created = {2022-03-28T09:45:01.806Z}, accessed = {2022-01-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:02:25.309Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {szilardOsszehasonlitas2DsEs2020}, source_type = {article}, short_title = {Összehasonlítás a 2D-s és 3D-s tárgyfelismerő }, private_publication = {false}, abstract = {This is a short presentation comparing a 2D object recognition technique, YOLO, with a 3D object recognition technique based on Kaolin. These techniques are analyzed and used to navigate a small robot through a hallway. After presenting the hardware elements used, we summarize the results of the comparison in terms of the effectiveness of the application. [Hungarian abstract, translated:] This is a short description of a project in which we compare a 2D object recognition technique, YOLO, and a 3D object recognition technique, Kaolin. We analyze how they operate and then use them to steer a small robot along a hallway. We describe the devices used and, by evaluating the effectiveness of the application, draw the conclusion of the comparison.}, bibtype = {article}, author = {Molnár, Szilárd and Tamás, Levente}, journal = {Energetika-Elektrotechnika – Számítástechnika és Oktatás Multi-konferencia} }
@article{ title = {From Variational to Deterministic Autoencoders}, type = {article}, year = {2020}, keywords = {Computer Science - Machine Learning,Statistics - Machine Learning}, websites = {http://arxiv.org/abs/1903.12436}, month = {5}, id = {51be2152-bdcf-3f5f-9544-c62b4968a935}, created = {2022-03-28T09:45:01.989Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:02:46.891Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {ghoshVariationalDeterministicAutoencoders2020}, source_type = {article}, notes = {arXiv: 1903.12436}, private_publication = {false}, abstract = {Variational Autoencoders (VAEs) provide a theoretically-backed and popular framework for deep generative models. However, learning a VAE from data poses still unanswered theoretical questions and considerable practical challenges. In this work, we propose an alternative framework for generative modeling that is simpler, easier to train, and deterministic, yet has many of the advantages of VAEs. We observe that sampling a stochastic encoder in a Gaussian VAE can be interpreted as simply injecting noise into the input of a deterministic decoder. We investigate how substituting this kind of stochasticity, with other explicit and implicit regularization schemes, can lead to an equally smooth and meaningful latent space without forcing it to conform to an arbitrarily chosen prior. To retrieve a generative mechanism to sample new data, we introduce an ex-post density estimation step that can be readily applied also to existing VAEs, improving their sample quality. We show, in a rigorous empirical study, that the proposed regularized deterministic autoencoders are able to generate samples that are comparable to, or better than, those of VAEs and more powerful alternatives when applied to images as well as to structured data such as molecules. An implementation is available at: https://github.com/ParthaEth/Regularized_autoencoders-RAE-}, bibtype = {article}, author = {Ghosh, Partha and Sajjadi, Mehdi S M and Vergari, Antonio and Black, Michael and Schölkopf, Bernhard}, journal = {arXiv:1903.12436 [cs, stat]} }
@inproceedings{ title = {Variational Autoencoder for 3D Voxel Compression}, type = {inproceedings}, year = {2020}, keywords = {Computational modeling,Data models,Image reconstruction,Octrees,Solid modeling,Task analysis,Three-dimensional displays}, pages = {1-6}, month = {11}, id = {9de0e1bc-0393-301b-9e34-a37a2fcfd82b}, created = {2022-03-28T09:45:02.106Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:03:26.589Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {liuVariationalAutoencoder3D2020a}, source_type = {inproceedings}, notes = {ISSN: 2151-2205}, private_publication = {false}, abstract = {3D scene sensing and understanding is a fundamental task in the field of computer vision and robotics. One widely used representation for 3D data is a voxel grid. However, explicit representation of 3D voxels always requires large storage space, which is not suitable for light-weight applications and scenarios such as robotic navigation and exploration. In this paper we propose a method to compress 3D voxel grids using an octree representation and Variational Autoencoders (VAEs). We first capture a 3D voxel grid -in our application with collaborating Realsense D435 and T265 cameras. The voxel grid is decomposed into three types of octants which are then compressed by the encoder and reproduced by feeding the latent code into the decoder. We demonstrate the efficiency of our method by two applications: scene reconstruction and path planning.}, bibtype = {inproceedings}, author = {Liu, Juncheng and Mills, Steven and McCane, Brendan}, doi = {10.1109/IVCNZ51579.2020.9290656}, booktitle = {2020 35th International Conference on Image and Vision Computing New Zealand (IVCNZ)} }
@article{ title = {Semantic segmentation of point clouds of building interiors with deep learning: Augmenting training datasets with synthetic BIM-based point clouds}, type = {article}, year = {2020}, keywords = {Building information model,Deep learning algorithm,Point clouds,Semantic segmentation,Synthetic dataset}, pages = {103144}, volume = {113}, websites = {https://www.sciencedirect.com/science/article/pii/S0926580519311884}, month = {5}, id = {050d44c0-acc4-3fd3-a3ab-c85b8f6c0c2f}, created = {2022-03-28T09:45:02.227Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:03:28.887Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {maSemanticSegmentationPoint2020}, source_type = {article}, short_title = {Semantic segmentation of point clouds of building }, private_publication = {false}, abstract = {This paper investigates the viability of using synthetic point clouds generated from building information models (BIMs) to train deep neural networks to perform semantic segmentation of point clouds of building interiors. In order to achieve these goals, this paper first presents a procedure for converting digital 3D BIMs into synthetic point clouds using three commercially available software systems. Then the generated synthetic point clouds are used to train a deep neural network. Semantic segmentation performance is compared for several models trained on: real point clouds, synthetic point clouds, and combinations of real and synthetic point clouds. A key finding is the 7.1\% IOU boost in performance achieved when a small real point cloud dataset is augmented by synthetic point clouds for training, as compared to training the classifier on the real data alone. The experimental results confirmed the viability of using synthetic point clouds generated from building information models in combination with small datasets of real point clouds. This opens up the possibility of developing a segmentation model for building interiors that can be applied to as-built modeling of buildings that contain unseen indoor structures.}, bibtype = {article}, author = {Ma, Jong Won and Czerniawski, Thomas and Leite, Fernanda}, doi = {10.1016/j.autcon.2020.103144}, journal = {Automation in Construction} }
@inproceedings{ title = {Quantifying The Generative Capabilities Of Variational Autoencoders For 3D Car Point Clouds}, type = {inproceedings}, year = {2020}, keywords = {Analytical models,Automobiles,Computer architecture,Data models,Representation learning,Shape,Solid modeling,Three-dimensional displays,generative model,geometric deep learning,novelty,point clouds}, pages = {1469-1477}, month = {12}, id = {8a008ad3-899c-36d9-9200-68a074788931}, created = {2022-03-28T09:45:02.803Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:04:30.977Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {sahaQuantifyingGenerativeCapabilities2020}, source_type = {inproceedings}, private_publication = {false}, abstract = {During each cycle of automotive development, large amounts of geometric data are generated as results of design studies and simulation tasks. Discovering hidden knowledge from this data and making it available to the development team strengthens the design process by utilizing historic information when creating novel products. To this end, we propose to use powerful geometric deep learning models that learn low-dimensional representations of the design data in an unsupervised fashion. Trained models allow us to efficiently explore the design space, as well as to generate novel designs. One popular class of generative models are variational autoencoders, which have however been rarely applied to geometric data. Hence, we use a variational autoencoder for 3D point clouds (PC-VAE) and explore the model's generative capabilities with a focus on the generation of realistic yet novel 3D shapes. We apply the PC-VAE to point clouds sampled from car shapes from a benchmark data set and employ quantitative measures to show that our PC-VAE generates realistic car shapes, while returning a richer variety of unseen shapes compared to a baseline autoencoder. Finally, we demonstrate how the PC-VAE can be guided towards generating shapes with desired target properties by optimizing the parameters that maximize the output of a trained classifier for said target properties. We conclude that generative models are a powerful tool that may aid designers in automotive product development.}, bibtype = {inproceedings}, author = {Saha, Sneha and Menzel, Stefan and Minku, Leandro L and Yao, Xin and Sendhoff, Bernhard and Wollstadt, Patricia}, doi = {10.1109/SSCI47803.2020.9308513}, booktitle = {2020 IEEE Symposium Series on Computational Intelligence (SSCI)} }
@article{ title = {NTU RGB+D 120: A Large-Scale Benchmark for 3D Human Activity Understanding}, type = {article}, year = {2020}, keywords = {3D action recognition,Activity understanding,Benchmark testing,Cameras,Deep learning,Lighting,RGB+D vision,Semantics,Skeleton,Three-dimensional displays,deep learning,large-scale benchmark,video analysis}, pages = {2684-2701}, volume = {42}, month = {10}, id = {02adb193-051c-3a9e-9b93-87752c0b0fa9}, created = {2022-03-28T09:45:03.559Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:05:49.201Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {liuNTURGB1202020}, source_type = {article}, short_title = {NTU RGB+D 120}, notes = {Conference Name: IEEE Transactions on Pattern Analysis and Machine Intelligence}, private_publication = {false}, abstract = {Research on depth-based human activity analysis achieved outstanding performance and demonstrated the effectiveness of 3D representation for action recognition. The existing depth-based and RGB+D-based action recognition benchmarks have a number of limitations, including the lack of large-scale training samples, realistic number of distinct class categories, diversity in camera views, varied environmental conditions, and variety of human subjects. In this work, we introduce a large-scale dataset for RGB+D human action recognition, which is collected from 106 distinct subjects and contains more than 114 thousand video samples and 8 million frames. This dataset contains 120 different action classes including daily, mutual, and health-related activities. We evaluate the performance of a series of existing 3D activity analysis methods on this dataset, and show the advantage of applying deep learning methods for 3D-based human action recognition. Furthermore, we investigate a novel one-shot 3D activity recognition problem on our dataset, and a simple yet effective Action-Part Semantic Relevance-aware (APSR) framework is proposed for this task, which yields promising results for recognition of the novel action classes. We believe the introduction of this large-scale dataset will enable the community to apply, adapt, and develop various data-hungry learning techniques for depth-based and RGB+D-based human activity understanding.}, bibtype = {article}, author = {Liu, Jun and Shahroudy, Amir and Perez, Mauricio and Wang, Gang and Duan, Ling-Yu and Kot, Alex C}, doi = {10.1109/TPAMI.2019.2916873}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, number = {10} }
@inproceedings{ title = {NeRF: Representing Scenes as Neural Radiance Fields for View Synthesis}, type = {inproceedings}, year = {2020}, keywords = {3D deep learning,Image-based rendering,Scene representation,View synthesis,Volume rendering}, pages = {405-421}, publisher = {Springer International Publishing}, city = {Cham}, series = {Lecture Notes in Computer Science}, id = {6ac4ea82-cba6-35b0-8e4d-ebd391837521}, created = {2022-03-28T09:45:03.773Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:05:59.073Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {mildenhallNeRFRepresentingScenes2020}, source_type = {inproceedings}, short_title = {NeRF}, private_publication = {false}, abstract = {We present a method that achieves state-of-the-art results for synthesizing novel views of complex scenes by optimizing an underlying continuous volumetric scene function using a sparse set of input views. Our algorithm represents a scene using a fully-connected (non-convolutional) deep network, whose input is a single continuous 5D coordinate (spatial location (x, y, z) and viewing direction (θ, ϕ)) and whose output is the volume density and view-dependent emitted radiance at that spatial location. We synthesize views by querying 5D coordinates along camera rays and use classic volume rendering techniques to project the output colors and densities into an image. Because volume rendering is naturally differentiable, the only input required to optimize our representation is a set of images with known camera poses. We describe how to effectively optimize neural radiance fields to render photorealistic novel views of scenes with complicated geometry and appearance, and demonstrate results that outperform prior work on neural rendering and view synthesis. View synthesis results are best viewed as videos, so we urge readers to view our supplementary video for convincing comparisons.}, bibtype = {inproceedings}, author = {Mildenhall, Ben and Srinivasan, Pratul P and Tancik, Matthew and Barron, Jonathan T and Ramamoorthi, Ravi and Ng, Ren}, editor = {Vedaldi, Andrea and Bischof, Horst and Brox, Thomas and Frahm, Jan-Michael}, doi = {10.1007/978-3-030-58452-8_24}, booktitle = {Computer Vision – ECCV 2020} }
@article{ title = {MRGAN: Multi-Rooted 3D Shape Generation with Unsupervised Part Disentanglement}, type = {article}, year = {2020}, keywords = {Computer Science - Computer Vision and Pattern Rec,Computer Science - Machine Learning}, websites = {http://arxiv.org/abs/2007.12944}, month = {7}, id = {357eefc7-14fb-3141-957c-d56e2925f0f4}, created = {2022-03-28T09:45:04.146Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:07:04.501Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {galMRGANMultiRooted3D2020}, source_type = {article}, short_title = {MRGAN}, notes = {arXiv: 2007.12944}, private_publication = {false}, abstract = {We present MRGAN, a multi-rooted adversarial network which generates part-disentangled 3D point-cloud shapes without part-based shape supervision. The network fuses multiple branches of tree-structured graph convolution layers which produce point clouds, with learnable constant inputs at the tree roots. Each branch learns to grow a different shape part, offering control over the shape generation at the part level. Our network encourages disentangled generation of semantic parts via two key ingredients: a root-mixing training strategy which helps decorrelate the different branches to facilitate disentanglement, and a set of loss terms designed with part disentanglement and shape semantics in mind. Of these, a novel convexity loss incentivizes the generation of parts that are more convex, as semantic parts tend to be. In addition, a root-dropping loss further ensures that each root seeds a single part, preventing the degeneration or over-growth of the point-producing branches. We evaluate the performance of our network on a number of 3D shape classes, and offer qualitative and quantitative comparisons to previous works and baseline approaches. We demonstrate the controllability offered by our part-disentangled generation through two applications for shape modeling: part mixing and individual part variation, without receiving segmented shapes as input.}, bibtype = {article}, author = {Gal, Rinon and Bermano, Amit and Zhang, Hao and Cohen-Or, Daniel}, journal = {arXiv:2007.12944 [cs]} }
@inproceedings{ title = {Adversarial Discriminative Attention for Robust Anomaly Detection}, type = {inproceedings}, year = {2020}, pages = {2172-2181}, websites = {https://openaccess.thecvf.com/content_WACV_2020/html/Kimura_Adversarial_Discriminative_Attention_for_Robust_Anomaly_Detection_WACV_2020_paper.html}, id = {fca2fc32-b088-3b1f-a664-ef1a628a7df2}, created = {2022-03-28T09:45:04.173Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:07:00.033Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {kimuraAdversarialDiscriminativeAttention2020}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Kimura, Daiki and Chaudhury, Subhajit and Narita, Minori and Munawar, Asim and Tachibana, Ryuki} }
@article{ title = {Balancing Reconstruction Error and Kullback-Leibler Divergence in Variational Autoencoders}, type = {article}, year = {2020}, keywords = {Data models,Gaussian distribution,Generative models,Image reconstruction,Kullback-Leibler divergence,Mathematical model,Probabilistic logic,Shape,Training,likelilhood-based frameworks,two-stage generation,variational autoencoders}, pages = {199440-199448}, volume = {8}, id = {e6996e89-07c3-30cc-8304-c47db64fcba4}, created = {2022-03-28T09:45:04.251Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:06:49.091Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {aspertiBalancingReconstructionError2020}, source_type = {article}, notes = {Conference Name: IEEE Access}, private_publication = {false}, abstract = {Likelihood-based generative frameworks are receiving increasing attention in the deep learning community, mostly on account of their strong probabilistic foundation. Among them, Variational Autoencoders (VAEs) are reputed for their fast and tractable sampling and relatively stable training, but if not properly tuned they may easily produce poor generative performances. The loss function of Variational Autoencoders is the sum of two components, with somehow contrasting effects: the reconstruction loss, improving the quality of the resulting images, and the Kullback-Leibler divergence, acting as a regularizer of the latent space. Correctly balancing these two components is a delicate issue, and one of the major problems of VAEs. Recent techniques address the problem by allowing the network to learn the balancing factor during training, according to a suitable loss function. In this article, we show that learning can be replaced by a simple deterministic computation, expressing the balancing factor in terms of a running average of the reconstruction error over the last minibatches. As a result, we keep a constant balance between the two components along training: as reconstruction improves, we proportionally decrease KL-divergence in order to prevent its prevalence, that would forbid further improvements of the quality of reconstructions. Our technique is simple and effective: it clarifies the learning objective for the balancing factor, and it produces faster and more accurate behaviours. On typical datasets such as Cifar10 and CelebA, our technique sensibly outperforms all previous VAE architectures with comparable parameter capacity.}, bibtype = {article}, author = {Asperti, Andrea and Trentin, Matteo}, doi = {10.1109/ACCESS.2020.3034828}, journal = {IEEE Access} }
@inproceedings{ title = {Regularized Autoencoders via Relaxed Injective Probability Flow}, type = {inproceedings}, year = {2020}, pages = {4292-4301}, websites = {https://proceedings.mlr.press/v108/kumar20a.html}, month = {6}, publisher = {PMLR}, id = {905cacb2-03ba-3990-aef3-ac48507277dc}, created = {2022-03-28T09:45:04.692Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:08:11.593Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {kumarRegularizedAutoencodersRelaxed2020}, source_type = {inproceedings}, notes = {ISSN: 2640-3498}, private_publication = {false}, abstract = {Invertible flow-based generative models are an effective method for learning to generate samples, while allowing for tractable likelihood computation and inference. However, the invertibility requirement restricts models to have the same latent dimensionality as the inputs. This imposes significant architectural, memory, and computational costs, making them more challenging to scale than other classes of generative models such as Variational Autoencoders (VAEs). We propose a generative model based on probability flows that does away with the bijectivity requirement on the model and only assumes injectivity. This also provides another perspective on regularized autoencoders (RAEs), with our final objectives resembling RAEs with specific regularizers that are derived by lower bounding the probability flow objective. We empirically demonstrate the promise of the proposed model, improving over VAEs and AEs in terms of sample quality.}, bibtype = {inproceedings}, author = {Kumar, Abhishek and Poole, Ben and Murphy, Kevin}, booktitle = {Proceedings of the Twenty Third International Conference on Artificial Intelligence and Statistics} }
@inbook{ type = {inbook}, year = {2020}, keywords = {3d animation,3d motion generation,lie algebra,variational auto-encoder}, pages = {2021-2029}, websites = {https://doi.org/10.1145/3394171.3413635}, month = {10}, publisher = {Association for Computing Machinery}, city = {New York, NY, USA}, id = {586b03b5-d524-3511-bf00-11019fa58f81}, created = {2022-03-28T09:45:05.425Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:22:35.403Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {guoAction2MotionConditionedGeneration2020}, source_type = {incollection}, short_title = {Action2Motion}, private_publication = {false}, abstract = {Action recognition is a relatively established task, where given an input sequence of human motion, the goal is to predict its action category. This paper, on the other hand, considers a relatively new problem, which could be thought of as an inverse of action recognition: given a prescribed action type, we aim to generate plausible human motion sequences in 3D. Importantly, the set of generated motions are expected to maintain its diversity to be able to explore the entire action-conditioned motion space; meanwhile, each sampled sequence faithfully resembles a natural human body articulation dynamics. Motivated by these objectives, we follow the physics law of human kinematics by adopting the Lie Algebra theory to represent the natural human motions; we also propose a temporal Variational Auto-Encoder (VAE) that encourages a diverse sampling of the motion space. A new 3D human motion dataset, HumanAct12, is also constructed. Empirical experiments over three distinct human motion datasets (including ours) demonstrate the effectiveness of our approach.}, bibtype = {inbook}, author = {Guo, Chuan and Zuo, Xinxin and Wang, Sen and Zou, Shihao and Sun, Qingyao and Deng, Annan and Gong, Minglun and Cheng, Li}, chapter = {Action2Motion: Conditioned Generation of 3D Human Motions}, title = {Proceedings of the 28th ACM International Conference on Multimedia} }
@inproceedings{ title = {VIBE: Video Inference for Human Body Pose and Shape Estimation}, type = {inproceedings}, year = {2020}, pages = {5253-5263}, websites = {https://openaccess.thecvf.com/content_CVPR_2020/html/Kocabas_VIBE_Video_Inference_for_Human_Body_Pose_and_Shape_Estimation_CVPR_2020_paper.html}, id = {c2e86d16-e246-3e94-8985-7ae0e959bb82}, created = {2022-03-28T09:45:05.746Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:22:39.155Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {kocabasVIBEVideoInference2020}, source_type = {inproceedings}, short_title = {VIBE}, private_publication = {false}, bibtype = {inproceedings}, author = {Kocabas, Muhammed and Athanasiou, Nikos and Black, Michael J} }
@inproceedings{ title = {History Repeats Itself: Human Motion Prediction via Motion Attention}, type = {inproceedings}, year = {2020}, keywords = {Human motion prediction,Motion attention}, pages = {474-489}, publisher = {Springer International Publishing}, city = {Cham}, series = {Lecture Notes in Computer Science}, id = {49c94765-5923-3171-8252-3c58c2a5ecd7}, created = {2022-03-28T09:45:05.820Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:22:43.128Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {maoHistoryRepeatsItself2020}, source_type = {inproceedings}, short_title = {History Repeats Itself}, private_publication = {false}, abstract = {Human motion prediction aims to forecast future human poses given a past motion. Whether based on recurrent or feed-forward neural networks, existing methods fail to model the observation that human motion tends to repeat itself, even for complex sports actions and cooking activities. Here, we introduce an attention-based feed-forward network that explicitly leverages this observation. In particular, instead of modeling frame-wise attention via pose similarity, we propose to extract motion attention to capture the similarity between the current motion context and the historical motion sub-sequences. Aggregating the relevant past motions and processing the result with a graph convolutional network allows us to effectively exploit motion patterns from the long-term history to predict the future poses. Our experiments on Human3.6M, AMASS and 3DPW evidence the benefits of our approach for both periodical and non-periodical actions. Thanks to our attention model, it yields state-of-the-art results on all three datasets. Our code is available at https://github.com/wei-mao-2019/HisRepItself.}, bibtype = {inproceedings}, author = {Mao, Wei and Liu, Miaomiao and Salzmann, Mathieu}, editor = {Vedaldi, Andrea and Bischof, Horst and Brox, Thomas and Frahm, Jan-Michael}, doi = {10.1007/978-3-030-58568-6_28}, booktitle = {Computer Vision – ECCV 2020} }
@inproceedings{ title = {A Comprehensive Study of Autoencoders' Applications Related to Images}, type = {inproceedings}, year = {2020}, id = {e4ad01c8-837a-3467-9f25-41a206947ff8}, created = {2022-03-28T09:45:05.880Z}, file_attached = {false}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:05.880Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {kovenkoComprehensiveStudyAutoencoders2020a}, source_type = {inproceedings}, private_publication = {false}, abstract = {This article presents a comprehensive study of autoencoders' applications related to images. First, a vanilla autoencoder is described along with details of its architecture and training procedure. Second, the main methods for regularizing it, such as dropout and additive Gaussian noise, are presented. Applications of autoencoders such as image morphing, reconstruction, and search are shown. Then the VAE (variational autoencoder) is highlighted, and its main applications, such as outlier detection and image generation, are described. Finally, it is shown that using warm-up for the VAE with respect to the KL loss gives much more plausible results in terms of image generation.}, bibtype = {inproceedings}, author = {Kovenko, V and Bogach, I}, booktitle = {IT\&I Workshops} }
@article{ title = {SurVAE Flows: Surjections to Bridge the Gap between VAEs and Flows}, type = {article}, year = {2020}, keywords = {Computer Science - Machine Learning,Statistics - Machine Learning}, websites = {http://arxiv.org/abs/2007.02731}, month = {10}, id = {3e55c063-adfd-3413-9cdd-4356cb3f7801}, created = {2022-03-28T09:45:06.070Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:23:36.655Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {nielsenSurVAEFlowsSurjections2020}, source_type = {article}, short_title = {SurVAE Flows}, notes = {arXiv: 2007.02731}, private_publication = {false}, abstract = {Normalizing flows and variational autoencoders are powerful generative models that can represent complicated density functions. However, they both impose constraints on the models: Normalizing flows use bijective transformations to model densities whereas VAEs learn stochastic transformations that are non-invertible and thus typically do not provide tractable estimates of the marginal likelihood. In this paper, we introduce SurVAE Flows: A modular framework of composable transformations that encompasses VAEs and normalizing flows. SurVAE Flows bridge the gap between normalizing flows and VAEs with surjective transformations, wherein the transformations are deterministic in one direction -- thereby allowing exact likelihood computation, and stochastic in the reverse direction -- hence providing a lower bound on the corresponding likelihood. We show that several recently proposed methods, including dequantization and augmented normalizing flows, can be expressed as SurVAE Flows. Finally, we introduce common operations such as the max value, the absolute value, sorting and stochastic permutation as composable layers in SurVAE Flows.}, bibtype = {article}, author = {Nielsen, Didrik and Jaini, Priyank and Hoogeboom, Emiel and Winther, Ole and Welling, Max}, journal = {arXiv:2007.02731 [cs, stat]} }
@article{ title = {Skeleton-aware networks for deep motion retargeting}, type = {article}, year = {2020}, keywords = {motion retargeting,neural motion processing}, pages = {62:1--62:14}, volume = {39}, websites = {https://doi.org/10.1145/3386569.3392462}, month = {7}, id = {bdac7449-a3a7-3f7a-9ea9-5e6c0625382c}, created = {2022-03-28T09:45:06.201Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-01T09:16:09.736Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {abermanSkeletonawareNetworksDeep2020}, source_type = {article}, private_publication = {false}, abstract = {We introduce a novel deep learning framework for data-driven motion retargeting between skeletons, which may have different structure, yet corresponding to homeomorphic graphs. Importantly, our approach learns how to retarget without requiring any explicit pairing between the motions in the training set. We leverage the fact that different homeomorphic skeletons may be reduced to a common primal skeleton by a sequence of edge merging operations, which we refer to as skeletal pooling. Thus, our main technical contribution is the introduction of novel differentiable convolution, pooling, and unpooling operators. These operators are skeleton-aware, meaning that they explicitly account for the skeleton's hierarchical structure and joint adjacency, and together they serve to transform the original motion into a collection of deep temporal features associated with the joints of the primal skeleton. In other words, our operators form the building blocks of a new deep motion processing framework that embeds the motion into a common latent space, shared by a collection of homeomorphic skeletons. Thus, retargeting can be achieved simply by encoding to, and decoding from this latent space. Our experiments show the effectiveness of our framework for motion retargeting, as well as motion processing in general, compared to existing approaches. Our approach is also quantitatively evaluated on a synthetic dataset that contains pairs of motions applied to different skeletons. To the best of our knowledge, our method is the first to perform retargeting between skeletons with differently sampled kinematic chains, without any paired examples.}, bibtype = {article}, author = {Aberman, Kfir and Li, Peizhuo and Lischinski, Dani and Sorkine-Hornung, Olga and Cohen-Or, Daniel and Chen, Baoquan}, doi = {10.1145/3386569.3392462}, journal = {ACM Transactions on Graphics}, number = {4} }
@article{ title = {S2DNet: Learning Accurate Correspondences for Sparse-to-Dense Feature Matching}, type = {article}, year = {2020}, keywords = {Computer Science - Computer Vision and Pattern Rec}, websites = {http://arxiv.org/abs/2004.01673}, month = {4}, id = {e63e32c1-63e6-3668-9d6a-f1745573af59}, created = {2022-03-28T09:45:06.461Z}, accessed = {2022-01-03}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T07:59:36.087Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {germainS2DNetLearningAccurate2020}, source_type = {article}, short_title = {S2DNet}, notes = {arXiv: 2004.01673}, private_publication = {false}, abstract = {Establishing robust and accurate correspondences is a fundamental backbone to many computer vision algorithms. While recent learning-based feature matching methods have shown promising results in providing robust correspondences under challenging conditions, they are often limited in terms of precision. In this paper, we introduce S2DNet, a novel feature matching pipeline, designed and trained to efficiently establish both robust and accurate correspondences. By leveraging a sparse-to-dense matching paradigm, we cast the correspondence learning problem as a supervised classification task to learn to output highly peaked correspondence maps. We show that S2DNet achieves state-of-the-art results on the HPatches benchmark, as well as on several long-term visual localization datasets.}, bibtype = {article}, author = {Germain, Hugo and Bourmaud, Guillaume and Lepetit, Vincent}, journal = {arXiv:2004.01673 [cs]} }
@article{ title = {Learning to Reconstruct and Segment 3D Objects}, type = {article}, year = {2020}, id = {f97c093e-6917-3b68-bc80-cf8fe6c84a9f}, created = {2022-05-02T08:14:58.358Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-05-02T08:15:14.578Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Yang, Bo} }
@article{ title = {PointCutMix: Regularization Strategy for Point Cloud Classification}, type = {article}, year = {2020}, id = {cb3f8da0-4e26-3fa8-8d86-d263bd8a3eef}, created = {2022-05-02T08:14:58.393Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-05-02T08:15:16.306Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Zhang, Jinlai and Chen, Lyujie and Ouyang, Bo and Liu, Binbin and Zhu, Jihong and Chen, Yujin and Meng, Yanmei} }
@article{ title = {Online LiDAR-SLAM for Legged Robots with Robust Registration and Deep-Learned Loop Closure}, type = {article}, year = {2020}, pages = {4158-4164}, websites = {https://arxiv.org/abs/2001.10249v1}, month = {1}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, day = {28}, id = {eff5ab3e-fa9a-367d-9486-d60a5f05b885}, created = {2022-06-07T04:53:04.745Z}, accessed = {2022-06-07}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-06-07T04:53:09.306Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {6075c81a-cded-4bc7-822e-6d5f3181ca0d}, private_publication = {false}, abstract = {In this paper, we present a factor-graph LiDAR-SLAM system which incorporates a state-of-the-art deeply learned feature-based loop closure detector to enable a legged robot to localize and map in industrial environments. These facilities can be badly lit and comprised of indistinct metallic structures, thus our system uses only LiDAR sensing and was developed to run on the quadruped robot's navigation PC. Point clouds are accumulated using an inertial-kinematic state estimator before being aligned using ICP registration. To close loops we use a loop proposal mechanism which matches individual segments between clouds. We trained a descriptor offline to match these segments. The efficiency of our method comes from carefully designing the network architecture to minimize the number of parameters such that this deep learning method can be deployed in real-time using only the CPU of a legged robot, a major contribution of this work. The set of odometry and loop closure factors are updated using pose graph optimization. Finally we present an efficient risk alignment prediction method which verifies the reliability of the registrations. Experimental results at an industrial facility demonstrated the robustness and flexibility of our system, including autonomous following paths derived from the SLAM map.}, bibtype = {article}, author = {Ramezani, Milad and Tinchev, Georgi and Iuganov, Egor and Fallon, Maurice}, doi = {10.48550/arxiv.2001.10249}, journal = {Proceedings - IEEE International Conference on Robotics and Automation} }
@article{ title = {Endowing Deep 3D Models with Rotation Invariance Based on Principal Component Analysis}, type = {article}, year = {2020}, id = {05e66cf3-0ae0-30f8-957a-748000c543a0}, created = {2022-06-21T09:01:27.597Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-06-22T07:14:25.200Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {5177186d-8a3a-482d-a36a-536b8090101c}, private_publication = {false}, bibtype = {article}, author = {} }
@article{ title = {Rotation-invariant local-to-global representation learning for 3D point cloud}, type = {article}, year = {2020}, volume = {2020-Decem}, id = {8371c94c-82b4-3b2b-84b4-2be92b9c373c}, created = {2022-07-28T12:39:24.657Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-07-28T12:39:36.691Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {353ce2e2-5e70-48e5-951f-78dc31fa40d2}, private_publication = {false}, abstract = {We propose a local-to-global representation learning algorithm for 3D point cloud data, which is appropriate for handling various geometric transformations, especially rotation, without explicit data augmentation with respect to the transformations. Our model takes advantage of multi-level abstraction based on graph convolutional neural networks, which constructs a descriptor hierarchy to encode rotation-invariant shape information of an input object in a bottom-up manner. The descriptors in each level are obtained from a neural network based on a graph via stochastic sampling of 3D points, which is effective in making the learned representations robust to the variations of input data. The proposed algorithm presents the state-of-the-art performance on the rotation-augmented 3D object recognition and segmentation benchmarks. We further analyze its characteristics through comprehensive ablative experiments.}, bibtype = {article}, author = {Kim, Seohyun and Park, Jaeyoo and Han, Bohyung}, journal = {Advances in Neural Information Processing Systems}, number = {NeurIPS} }
@article{ title = {Learning 3D semantic scene graphs from 3D indoor reconstructions}, type = {article}, year = {2020}, pages = {3960-3969}, id = {a154c065-f33c-3efa-8c65-027b149a2dc6}, created = {2022-08-18T10:53:48.468Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:54:14.607Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Scene understanding has been of high interest in computer vision. It encompasses not only identifying objects in a scene, but also their relationships within the given context. With this goal, a recent line of works tackles 3D semantic segmentation and scene layout prediction. In our work we focus on scene graphs, a data structure that organizes the entities of a scene in a graph, where objects are nodes and their relationships modeled as edges. We leverage inference on scene graphs as a way to carry out 3D scene understanding, mapping objects and their relationships. In particular, we propose a learned method that regresses a scene graph from the point cloud of a scene. Our novel architecture is based on PointNet and Graph Convolutional Networks (GCN). In addition, we introduce 3DSSG, a semi-automatically generated dataset, that contains semantically rich scene graphs of 3D scenes. We show the application of our method in a domain-agnostic retrieval task, where graphs serve as an intermediate representation for 3D-3D and 2D-3D matching.}, bibtype = {article}, author = {Wald, Johanna and Dhamo, Helisa and Navab, Nassir and Tombari, Federico}, doi = {10.1109/CVPR42600.2020.00402}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Learning to segment 3D point clouds in 2D image space}, type = {article}, year = {2020}, pages = {12252-12261}, id = {909e573b-55b4-3f5c-b1f5-7c411e750613}, created = {2022-08-18T10:53:48.735Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:54:40.207Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {In contrast to the literature where local patterns in 3D point clouds are captured by customized convolutional operators, in this paper we study the problem of how to effectively and efficiently project such point clouds into a 2D image space so that traditional 2D convolutional neural networks (CNNs) such as U-Net can be applied for segmentation. To this end, we are motivated by graph drawing and reformulate it as an integer programming problem to learn the topology-preserving graph-to-grid mapping for each individual point cloud. To accelerate the computation in practice, we further propose a novel hierarchical approximate algorithm. With the help of the Delaunay triangulation for graph construction from point clouds and a multi-scale U-Net for segmentation, we manage to demonstrate the state-of-the-art performance on ShapeNet and PartNet, respectively, with significant improvement over the literature. Code is available at https://github.com/Zhang-VISLab.}, bibtype = {article}, author = {Lyu, Yecheng and Huang, Xinming and Zhang, Ziming}, doi = {10.1109/CVPR42600.2020.01227}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Spectral-GANs for High-Resolution 3D Point-cloud Generation}, type = {article}, year = {2020}, pages = {8169-8176}, id = {a1c3b676-46fd-3370-8457-8152d05918b7}, created = {2022-08-19T12:56:19.269Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-04-24T15:41:56.924Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Ramasinghe2020}, folder_uuids = {255910b9-b737-4c31-858e-6de1dca0cdb9,4a65115c-c8d7-4bb4-831a-b057db051143,f4d8f1ef-fdcb-4a5b-a626-6e2fea47fb6d,8410cff5-f764-42b2-a9b5-ead8d2dee5c8,93ad337d-2e3c-4f22-a550-b54f5edeaaa5,8168419f-bb4b-4e96-bdd8-802bb0712103,244f8db2-6bd4-47d9-8abf-425a263fd4d1}, private_publication = {false}, bibtype = {article}, author = {Ramasinghe, Sameera and Khan, Salman and Barnes, Nick and Gould, Stephen} }
@article{ title = {Point Cloud Denoising via Feature Graph Laplacian Regularization}, type = {article}, year = {2020}, pages = {4143-4158}, volume = {29}, id = {05bde669-9fbd-31cf-9a48-7fa6f6fd452c}, created = {2022-09-06T14:02:41.609Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T11:25:08.232Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {93ad337d-2e3c-4f22-a550-b54f5edeaaa5,244f8db2-6bd4-47d9-8abf-425a263fd4d1}, private_publication = {false}, bibtype = {article}, author = {Dinesh, Chinthaka and Cheung, Gene and Bajić, Ivan V.} }
@article{ title = {Computing the Testing Error without a Testing Set}, type = {article}, year = {2020}, pages = {2674-2682}, id = {73edde14-211e-3c20-8c81-dfaf6ef0a3a1}, created = {2022-09-08T10:49:09.765Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T10:52:38.210Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {034ae31f-a548-45de-8507-3cbbc9e326ad}, private_publication = {false}, abstract = {Deep Neural Networks (DNNs) have revolutionized computer vision. We now have DNNs that achieve top (accuracy) results in many problems, including object recognition, facial expression analysis, and semantic segmentation, to name but a few. The design of the DNNs that achieve top results is, however, non-trivial and mostly done by trial-and-error. That is, typically, researchers will derive many DNN architectures (i.e., topologies) and then test them on multiple datasets. However, there are no guarantees that the selected DNN will perform well in the real world. One can use a testing set to estimate the performance gap between the training and testing sets, but avoiding overfitting-to-the-testing-data is of concern. Using sequestered testing data may address this problem, but this requires a constant update of the dataset, a very expensive venture. Here, we derive an algorithm to estimate the performance gap between training and testing without the need of a testing dataset. Specifically, we derive a set of persistent topology measures that identify when a DNN is learning to generalize to unseen samples. We provide extensive experimental validation on multiple networks and datasets to demonstrate the feasibility of the proposed approach.}, bibtype = {article}, author = {Corneanu, Ciprian A. and Escalera, Sergio and Martinez, Aleix M.}, doi = {10.1109/CVPR42600.2020.00275}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Supervised learning of the next-best-view for 3d object reconstruction}, type = {article}, year = {2020}, keywords = {3D reconstruction,3D-CNN,Next-best-view}, pages = {224-231}, volume = {133}, websites = {https://doi.org/10.1016/j.patrec.2020.02.024}, publisher = {Elsevier B.V.}, id = {0793a34f-4674-3c5b-a70b-c4cd5edfbf1b}, created = {2022-09-08T11:24:59.056Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T11:25:21.584Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a64087a-cd32-494a-8140-2abf0b1356c6}, private_publication = {false}, abstract = {Motivated by the advances in 3D sensing technology and the spreading of low-cost robotic platforms, 3D object reconstruction has become a common task in many areas. Nevertheless, the selection of the optimal sensor pose that maximizes the reconstructed surface is a problem that remains open. It is known in the literature as the next-best-view planning problem. In this paper, we propose a novel next-best-view planning scheme based on supervised deep learning. The scheme contains an algorithm for automatic generation of datasets and an original three-dimensional convolutional neural network (3D-CNN) used to learn the next-best-view. Unlike previous work where the problem is addressed as a search, the trained 3D-CNN directly predicts the sensor pose. We present an experimental comparison of the proposed architecture against two alternative networks; we also compare it with state-of-the-art next-best-view methods in the reconstruction of several unknown objects. Our method is faster and reaches high coverage.}, bibtype = {article}, author = {Mendoza, Miguel and Vasquez-Gomez, J. Irving and Taud, Hind and Sucar, L. Enrique and Reta, Carolina}, doi = {10.1016/j.patrec.2020.02.024}, journal = {Pattern Recognition Letters} }
@article{ title = {PC-NBV: A point cloud based deep network for efficient next best view planning}, type = {article}, year = {2020}, pages = {7050-7057}, id = {6f6d3999-f5f6-3ec7-82bc-0ab1f4319399}, created = {2022-09-08T11:24:59.061Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T11:25:17.226Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a64087a-cd32-494a-8140-2abf0b1356c6,cea70a43-0c16-4f42-9103-67f888a966cd}, private_publication = {false}, abstract = {The Next Best View (NBV) problem is important in the active robotic reconstruction. It enables the robot system to perform scanning actions in a reasonable view sequence, and fulfil the reconstruction task in an effective way. Previous works mainly follow the volumetric methods, which convert the point cloud information collected by sensors into a voxel representation space and evaluate candidate views through ray casting simulations to pick the NBV. However, the process of volumetric data transformation and ray casting is often time-consuming. To address this issue, in this paper, we propose a point cloud based deep neural network called PC-NBV to achieve efficient view planning without these computationally expensive operations. The PC-NBV network takes the raw point cloud data and current view selection states as input, and then directly predicts the information gain of all candidate views. By avoiding costly data transformation and ray casting, and utilizing powerful neural network to learn structure priors from point cloud, our method can achieve efficient and effective NBV planning. Experiments on multiple datasets show the proposed method outperforms state-of-the-art NBV methods, giving better views for robot system with much less inference time. Furthermore, we demonstrate the robustness of our method against noise and the ability to extend to multi-view system, making it more applicable for various scenarios.}, bibtype = {article}, author = {Zeng, Rui and Zhao, Wang and Liu, Yong Jin}, doi = {10.1109/IROS45743.2020.9340916}, journal = {IEEE International Conference on Intelligent Robots and Systems} }
@article{ title = {Generative adversarial networks}, type = {article}, year = {2020}, pages = {139-144}, volume = {63}, id = {1cdcb471-bae8-3b1d-b916-cf3d64118469}, created = {2022-09-08T11:24:59.064Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T17:25:32.680Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Generative adversarial networks are a kind of artificial intelligence algorithm designed to solve the generative modeling problem. The goal of a generative model is to study a collection of training examples and learn the probability distribution that generated them. Generative Adversarial Networks (GANs) are then able to generate more examples from the estimated probability distribution. Generative models based on deep learning are common, but GANs are among the most successful generative models (especially in terms of their ability to generate realistic high-resolution images). GANs have been successfully applied to a wide variety of tasks (mostly in research settings) but continue to present unique challenges and research opportunities because they are based on game theory while most other approaches to generative modeling are based on optimization.}, bibtype = {article}, author = {Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua}, doi = {10.1145/3422622}, journal = {Communications of the ACM}, number = {11} }
@article{ title = {PointGrow: Autoregressively learned point cloud generation with self-attention}, type = {article}, year = {2020}, pages = {61-70}, id = {fb0ed54f-5507-31d4-8f5d-4fa72fe18720}, created = {2022-09-08T17:25:32.348Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-13T13:57:40.696Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,8168419f-bb4b-4e96-bdd8-802bb0712103}, private_publication = {false}, abstract = {Generating 3D point clouds is challenging yet highly desired. This work presents a novel autoregressive model, PointGrow, which can generate diverse and realistic point cloud samples from scratch or conditioned on semantic contexts. This model operates recurrently, with each point sampled according to a conditional distribution given its previously-generated points, allowing inter-point correlations to be well-exploited and 3D shape generative processes to be better interpreted. Since point cloud object shapes are typically encoded by long-range dependencies, we augment our model with dedicated self-attention modules to capture such relations. Extensive evaluations show that PointGrow achieves satisfying performance on both unconditional and conditional point cloud generation tasks, with respect to realism and diversity. Several important applications, such as unsupervised feature learning and shape arithmetic operations, are also demonstrated.}, bibtype = {article}, author = {Sun, Yongbin and Wang, Yue and Liu, Ziwei and Siegel, Joshua E. and Sarma, Sanjay E.}, doi = {10.1109/WACV45572.2020.9093430}, journal = {Proceedings - 2020 IEEE Winter Conference on Applications of Computer Vision, WACV 2020} }
@article{ title = {A Progressive Conditional Generative Adversarial Network for Generating Dense and Colored 3D Point Clouds}, type = {article}, year = {2020}, pages = {712-722}, id = {737832dc-5350-3be3-abc2-ba4449e350de}, created = {2022-09-19T11:54:47.627Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-10-03T13:31:10.419Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {255910b9-b737-4c31-858e-6de1dca0cdb9,b6d75013-efe2-4ddc-b3db-65496bd4db9f,8168419f-bb4b-4e96-bdd8-802bb0712103}, private_publication = {false}, abstract = {In this paper, we introduce a novel conditional generative adversarial network that creates dense 3D point clouds, with color, for assorted classes of objects in an unsupervised manner. To overcome the difficulty of capturing intricate details at high resolutions, we propose a point transformer that progressively grows the network through the use of graph convolutions. The network is composed of a leaf output layer and an initial set of branches. Every training iteration evolves a point vector into a point cloud of increasing resolution. After a fixed number of iterations, the number of branches is increased by replicating the last branch. Experimental results show that our network is capable of learning and mimicking a 3D data distribution, and produces colored point clouds with fine details at multiple resolutions.}, bibtype = {article}, author = {Arshad, Mohammad Samiul and Beksi, William J.}, doi = {10.1109/3DV50981.2020.00081}, journal = {Proceedings - 2020 International Conference on 3D Vision, 3DV 2020} }
@article{ title = {PF-Net: Point fractal network for 3D point cloud completion}, type = {article}, year = {2020}, pages = {7659-7667}, volume = {6}, id = {a2d4901f-b7ad-38a8-954e-03e200e43acb}, created = {2022-10-03T13:31:09.955Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-10-03T13:31:18.021Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e07bebd1-ae76-40ed-b298-edc5ed896e0b}, private_publication = {false}, abstract = {In this paper, we propose a Point Fractal Network (PF-Net), a novel learning-based approach for precise and high-fidelity point cloud completion. Unlike existing point cloud completion networks, which generate the overall shape of the point cloud from the incomplete point cloud and always change existing points and encounter noise and geometrical loss, PF-Net preserves the spatial arrangements of the incomplete point cloud and can figure out the detailed geometrical structure of the missing region(s) in the prediction. To succeed at this task, PF-Net estimates the missing point cloud hierarchically by utilizing a feature-points-based multi-scale generating network. Further, we add up multi-stage completion loss and adversarial loss to generate more realistic missing region(s). The adversarial loss can better tackle multiple modes in the prediction. Our experiments demonstrate the effectiveness of our method for several challenging point cloud completion tasks.}, bibtype = {article}, author = {Huang, Zitian and Yu, Yikuan and Xu, Jiawen and Ni, Feng and Le, Xinyi}, doi = {10.1109/CVPR42600.2020.00768}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition}, number = {3} }
@article{ title = {Representation Learning on Unit Ball with 3D Roto-translational Equivariance}, type = {article}, year = {2020}, keywords = {3D moments,Convolution neural networks,Deep learning,Volumetric convolution,Zernike polynomials}, pages = {1612-1634}, volume = {128}, websites = {https://doi.org/10.1007/s11263-019-01278-x}, publisher = {Springer US}, id = {8ec46735-939d-3a85-a006-f97832ba2f10}, created = {2022-10-10T13:41:15.051Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-10-10T13:41:23.481Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {8410cff5-f764-42b2-a9b5-ead8d2dee5c8}, private_publication = {false}, abstract = {Convolution is an integral operation that defines how the shape of one function is modified by another function. This powerful concept forms the basis of hierarchical feature learning in deep neural networks. Although performing convolution in Euclidean geometries is fairly straightforward, its extension to other topological spaces—such as a sphere (S2) or a unit ball (B3)—entails unique challenges. In this work, we propose a novel ‘volumetric convolution’ operation that can effectively model and convolve arbitrary functions in B3. We develop a theoretical framework for volumetric convolution based on Zernike polynomials and efficiently implement it as a differentiable and an easily pluggable layer in deep networks. By construction, our formulation leads to the derivation of a novel formula to measure the symmetry of a function in B3 around an arbitrary axis, that is useful in function analysis tasks. We demonstrate the efficacy of proposed volumetric convolution operation on one viable use case i.e., 3D object recognition.}, bibtype = {article}, author = {Ramasinghe, Sameera and Khan, Salman and Barnes, Nick and Gould, Stephen}, doi = {10.1007/s11263-019-01278-x}, journal = {International Journal of Computer Vision}, number = {6} }
@article{ title = {Spherical harmonic energy over Gaussian sphere for incomplete 3D shape retrieval}, type = {article}, year = {2020}, keywords = {Engineering shape,Gaussian sphere model (GSM),Incomplete shape retrieval,Point cloud,Spherical harmonic (SH),Surface normal}, pages = {183117-183126}, volume = {8}, id = {8c2d4dbe-22f2-37cd-9ad8-bcbc01e32d9f}, created = {2023-05-03T13:16:38.936Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.228Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Li2020}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {The spherical harmonic (SH) representation has shown excellent advantages in terms of accuracy and efficiency in the retrieval of complete three-dimensional shapes. However, since the spherical function directly takes the shape centroid as the global reference point, the SH features depend heavily on the central position. In this context, the features are no longer reliable when querying incomplete shapes, which may have an erratic centroid. In this work, we propose a novel shape descriptor, namely spherical harmonic energy over the Gaussian sphere (SHE-GS), especially for incomplete shape retrieval. Firstly, all unit normal vectors on the shape surface are mapped to points on a Gaussian sphere, which has a constant center. Secondly, kernel density estimation is used to establish a Gaussian Sphere Model (GSM) to describe the density change of these mapping points. Finally, the shape descriptor is generated by applying an SH transformation on the model. Depending on whether the GSM is regarded as a surface model or a volume model, we have designed two specific algorithm implementations. Experimental results, on two engineering shape sets containing artificially defective shapes, indicate that the proposed method outperforms other traditional methods also defined in the sphere space for incomplete shape retrieval. The superiority is verified both for similar objects in the same category and for queries of a single specific object.}, bibtype = {article}, author = {Li, Jia and Li, Zikuan and Lin, Huan and Chen, Renxi and Lan, Qiuping}, doi = {10.1109/ACCESS.2020.3029103}, journal = {IEEE Access} }
@article{ title = {Learning SO(3) Equivariant Representations with Spherical CNNs}, type = {article}, year = {2020}, keywords = {3D vision,Equivariance,Sphere,Spherical CNN}, pages = {588-600}, volume = {128}, id = {702432bb-ef8e-3464-a533-73e724724a47}, created = {2023-05-03T13:16:38.940Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-15T08:10:14.349Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Esteves2020}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {We address the problem of 3D rotation equivariance in convolutional neural networks. 3D rotations have been a challenging nuisance in 3D classification tasks requiring higher capacity and extended data augmentation in order to tackle it. We model 3D data with multi-valued spherical functions and we propose a novel spherical convolutional network that implements exact convolutions on the sphere by realizing them in the spherical harmonic domain. Resulting filters have local symmetry and are localized by enforcing smooth spectra. We apply a novel pooling on the spectral domain and our operations are independent of the underlying spherical resolution throughout the network. We show that networks with much lower capacity and without requiring data augmentation can exhibit performance comparable to the state of the art in standard 3D shape retrieval and classification benchmarks.}, bibtype = {article}, author = {Esteves, Carlos and Allen-Blanchette, Christine and Makadia, Ameesh and Daniilidis, Kostas}, doi = {10.1007/s11263-019-01220-1}, journal = {International Journal of Computer Vision}, number = {3} }
@article{ title = {On Isometry Robustness of Deep 3D Point Cloud Models under Adversarial Attacks}, type = {article}, year = {2020}, pages = {1198-1207}, id = {a1c3a669-6786-3ee2-8a2a-be602d084b53}, created = {2023-05-03T13:16:39.248Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:27.177Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Zhao2020a}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {While deep learning in the 3D domain has achieved revolutionary performance in many tasks, the robustness of these models has not been sufficiently studied or explored. Regarding 3D adversarial samples, most existing works focus on manipulation of local points, which may fail to invoke the global geometry properties, like robustness under linear projection that preserves the Euclidean distance, i.e., isometry. In this work, we show that existing state-of-the-art deep 3D models are extremely vulnerable to isometry transformations. Armed with Thompson Sampling, we develop a black-box attack with a success rate over 95% on the ModelNet40 dataset. Incorporating the Restricted Isometry Property, we propose a novel framework of white-box attack on top of spectral norm based perturbation. In contrast to previous works, our adversarial samples are experimentally shown to be strongly transferable. Evaluated on a sequence of prevailing 3D models, our white-box attack achieves success rates from 98.88% to 100%. It maintains a successful attack rate over 95% even within an imperceptible rotation range [±2.81].}, bibtype = {article}, author = {Zhao, Yue and Wu, Yuwei and Chen, Caihua and Lim, Andrew}, doi = {10.1109/CVPR42600.2020.00128}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Three-dimensional Simultaneous Shape and Pose Estimation for Extended Objects Using Spherical Harmonics}, type = {article}, year = {2020}, websites = {http://arxiv.org/abs/2012.13580}, id = {da246664-eaeb-318c-a9bc-d8dcfcfb1450}, created = {2023-05-03T13:16:39.678Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.749Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kurz2020}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {We propose a new recursive method for simultaneous estimation of both the pose and the shape of a three-dimensional extended object. The key idea of the presented method is to represent the shape of the object using spherical harmonics, similar to the way Fourier series can be used in the two-dimensional case. This allows us to derive a measurement equation that can be used within the framework of nonlinear filters such as the UKF. We provide both simulative and experimental evaluations of the novel techniques.}, bibtype = {article}, author = {Kurz, Gerhard and Faion, Florian and Pfaff, Florian and Zea, Antonio and Hanebeck, Uwe D.} }
@article{ title = {RIDF: A Robust Rotation-Invariant Descriptor for 3D Point Cloud Registration in the Frequency Domain}, type = {article}, year = {2020}, keywords = {3D descriptor,Fourier analysis,Point cloud registration,Rotation-invariance}, pages = {235-242}, volume = {5}, id = {bc598811-a963-35ef-9d2b-3570f03fdf77}, created = {2023-05-03T13:16:40.149Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-15T08:10:14.536Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Huang2020}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Registration of point clouds is a fundamental problem in the community of photogrammetry and 3D computer vision. Generally, point cloud registration consists of two steps: The search of correspondences and the estimation of transformation parameters. However, to find correspondences from point clouds, generating robust and discriminative features is of necessity. In this paper, we address the problem of extracting robust rotation-invariant features for fast coarse registration of point clouds under the assumption that the pairwise point clouds are transformed with rigid transformation. With a Fourier-based descriptor, point clouds represented by volumetric images can be mapped from the image to feature space. It is achieved by considering a gradient histogram as a continuous angular signal which can be well represented by the spherical harmonics. The rotation-invariance is established based on the Fourier-based analysis, in which high-frequency signals can be filtered out. This makes the extracted features robust to noises and outliers. Then, with the extracted features, pairwise correspondence can be found by the fast search. Finally, the transformation parameters can be estimated by fitting the rigid transformation model using the corresponding points and RANSAC algorithm. Experiments are conducted to prove the effectiveness of our proposed method in the task of point cloud registration. Regarding the experimental results of the point cloud registration using two TLS benchmark point cloud datasets, featuring with limited overlaps and uneven point densities and covering different urban scenes, our proposed method can achieve a fast coarse registration with rotation errors of less than 1 degree and translation errors of less than 1m.}, bibtype = {article}, author = {Huang, R. and Yao, W. and Ye, Z. and Xu, Y. and Stilla, U.}, doi = {10.5194/isprs-annals-V-2-2020-235-2020}, journal = {ISPRS Annals of the Photogrammetry, Remote Sensing and Spatial Information Sciences}, number = {2} }
@article{ title = {WISH: Efficient 3D biological shape classification through Willmore flow and Spherical Harmonics decomposition}, type = {article}, year = {2020}, pages = {4184-4194}, volume = {2020-June}, id = {f7db74eb-aded-3b10-b045-ba19214fc3bd}, created = {2023-05-03T13:16:40.507Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-15T08:10:14.724Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Agus2020}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Shape analysis of cell nuclei, enabled by the recent advances in nano-scale digital imaging and reconstruction methods, is emerging as a very important tool to understand low-level biological processes. Current analysis techniques, however, are performed on 2D slices or assume very simple 3D shape approximations , limiting their discrimination capabilities. In this work, we introduce a compact rotation-invariant frequency-based representation of genus-0 3D shapes represented by manifold triangle meshes, that we apply to cell nuclei envelopes reconstructed from electron micrographs. The representation is robustly obtained through Spherical Harmonics coefficients over a spherical parameterization of the input mesh obtained through Willmore flow. Our results show how our method significantly improves the state-of-the-art in the classification of nuclear envelopes of rodent brain samples. Moreover, while our method is motivated by the analysis of specific biological shapes, the framework is of general use for the compact frequency encoding of any genus-0 surface.}, bibtype = {article}, author = {Agus, Marco and Gobbetti, Enrico and Pintore, Giovanni and Cali, Corrado and Schneider, Jens}, doi = {10.1109/CVPRW50498.2020.00494}, journal = {IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops} }
@article{ title = {On the Universality of Rotation Equivariant Point Cloud Networks}, type = {article}, year = {2020}, pages = {1-20}, websites = {http://arxiv.org/abs/2010.02449}, id = {320a47fa-ae13-37c8-b883-c38a6570ce87}, created = {2023-05-03T13:16:40.579Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:25.470Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Dym2020}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Learning functions on point clouds has applications in many fields, including computer vision, computer graphics, physics, and chemistry. Recently, there has been a growing interest in neural architectures that are invariant or equivariant to all three shape-preserving transformations of point clouds: translation, rotation, and permutation. In this paper, we present a first study of the approximation power of these architectures. We first derive two sufficient conditions for an equivariant architecture to have the universal approximation property, based on a novel characterization of the space of equivariant polynomials. We then use these conditions to show that two recently suggested models are universal, and for devising two other novel universal architectures.}, bibtype = {article}, author = {Dym, Nadav and Maron, Haggai} }
@article{ title = {Local rotation invariance in 3D CNNs}, type = {article}, year = {2020}, keywords = {3D Texture,Convolutional neural network,Local rotation invariance,Steerable filters}, volume = {65}, id = {559906f1-af6f-3999-993e-0bf926f73dec}, created = {2023-05-03T13:16:40.825Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.440Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Andrearczyk2020}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Locally Rotation Invariant (LRI) image analysis was shown to be fundamental in many applications and in particular in medical imaging where local structures of tissues occur at arbitrary rotations. LRI constituted the cornerstone of several breakthroughs in texture analysis, including Local Binary Patterns (LBP), Maximum Response 8 (MR8) and steerable filterbanks. Whereas globally rotation invariant Convolutional Neural Networks (CNN) were recently proposed, LRI was very little investigated in the context of deep learning. LRI designs allow learning filters accounting for all orientations, which enables a drastic reduction of trainable parameters and training data when compared to standard 3D CNNs. In this paper, we propose and compare several methods to obtain LRI CNNs with directional sensitivity. Two methods use orientation channels (responses to rotated kernels), either by explicitly rotating the kernels or using steerable filters. These orientation channels constitute a locally rotation equivariant representation of the data. Local pooling across orientations yields LRI image analysis. Steerable filters are used to achieve a fine and efficient sampling of 3D rotations as well as a reduction of trainable parameters and operations, thanks to a parametric representation involving solid Spherical Harmonics (SH), which are products of SH with associated learned radial profiles. Finally, we investigate a third strategy to obtain LRI based on rotational invariants calculated from responses to a learned set of solid SHs. The proposed methods are evaluated and compared to standard CNNs on 3D datasets including synthetic textured volumes composed of rotated patterns, and pulmonary nodule classification in CT. The results show the importance of LRI image analysis while resulting in a drastic reduction of trainable parameters, outperforming standard 3D CNNs trained with rotational data augmentation.}, bibtype = {article}, author = {Andrearczyk, Vincent and Fageot, Julien and Oreiller, Valentin and Montet, Xavier and Depeursinge, Adrien}, doi = {10.1016/j.media.2020.101756}, journal = {Medical Image Analysis} }
@article{ title = {PointAR: Efficient Lighting Estimation for Mobile Augmented Reality}, type = {article}, year = {2020}, keywords = {Deep learning,Lighting estimation,Mobile AR}, pages = {678-693}, volume = {12368 LNCS}, id = {e4d429b9-2db4-3b1b-8c90-497fde4ec1d0}, created = {2023-05-03T13:16:40.876Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-15T08:10:14.774Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Zhao2020}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {We propose an efficient lighting estimation pipeline that is suitable to run on modern mobile devices, with comparable resource complexities to state-of-the-art mobile deep learning models. Our pipeline, PointAR, takes a single RGB-D image captured from the mobile camera and a 2D location in that image, and estimates 2nd degree spherical harmonics coefficients. These estimated spherical harmonics coefficients can be directly utilized by rendering engines for supporting spatially variant indoor lighting, in the context of augmented reality. Our key insight is to formulate the lighting estimation as a point cloud-based learning problem, which is in part inspired by the Monte Carlo integration leveraged by real-time spherical harmonics lighting. While existing approaches estimate lighting information with complex deep learning pipelines, our method focuses on reducing the computational complexity. Through both quantitative and qualitative experiments, we demonstrate that PointAR achieves lower lighting estimation errors compared to state-of-the-art methods. Further, our method requires an order of magnitude fewer resources, comparable to those of mobile-specific DNNs.}, bibtype = {article}, author = {Zhao, Yiqin and Guo, Tian}, doi = {10.1007/978-3-030-58592-1_40}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@inproceedings{ title = {Beyond Peak Performance: Comparing the Real Performance of AI-Optimized FPGAs and GPUs}, type = {inproceedings}, year = {2020}, keywords = {Deep Learning,FPGA,GPU,Neural Networks}, pages = {10-19}, month = {12}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, day = {1}, id = {a249cfcd-8d30-3272-9526-6c5dcdc23628}, created = {2023-11-07T10:04:22.591Z}, file_attached = {true}, profile_id = {78e67dcc-28e6-3300-a4ed-85434b13f01f}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-12-06T13:15:55.611Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {1bffc8fa-4e6e-48c8-b694-323af7fbd0e8}, private_publication = {false}, abstract = {The growing importance and compute demands of artificial intelligence (AI) have led to the emergence of domain-optimized hardware platforms. For example, Nvidia GPUs introduced specialized tensor cores for matrix operations to speed up deep learning (DL) computation, resulting in very high peak throughput up to 130 int8 TOPS in the T4 GPU. Recently, Intel introduced its first AI-optimized 14nm FPGA, the Stratix 10 NX, with in-fabric AI tensor blocks that offer estimated peak performance up to 143 int8 TOPS, comparable to 12nm GPUs. However, what matters in practice is not the peak performance but the actual achievable performance on target workloads. This depends mainly on the utilization of the tensor units, and the system-level overheads to send data to/from the accelerator. This paper presents the first performance evaluation of Intel's AI-optimized FPGA, the Stratix 10 NX, in comparison to the latest accessible AI-optimized GPUs, the Nvidia T4 and V100, on a large suite of real-Time DL inference workloads. We enhance a re-implementation of the Brainwave NPU overlay architecture to utilize the FPGA's AI tensor blocks, and develop toolchain support that allows users to program tensor blocks purely through software, without FPGA EDA tools in the loop. We first compare the Stratix 10 NX NPU against Stratix 10 GX/MX versions with no tensor blocks, and then present detailed core compute and system-level performance comparisons to the T4 and V100 GPUs. We show that our enhanced NPU on Stratix 10 NX achieves better tensor block utilization than GPUs, resulting in 24× and 12× average compute speedups over the T4 and V100 GPUs at batch-6. Even with relaxed latency constraints that allow a batch size of 32, we still achieve average speedups of 5× and 2× against T4 and V100 GPUs, respectively. On a system-level, the FPGA's fine-grained flexibility with its integrated 100 Gbps Ethernet allows for remote access at 10× and 2× less system overhead latency than local access to a V100 GPU via 128 Gbps PCIe for short and long sequence RNNs, respectively.}, bibtype = {inproceedings}, author = {Boutros, Andrew and Nurvitadhi, Eriko and Ma, Rui and Gribok, Sergey and Zhao, Zhipeng and Hoe, James C. and Betz, Vaughn and Langhammer, Martin}, doi = {10.1109/ICFPT51103.2020.00011}, booktitle = {Proceedings - 2020 International Conference on Field-Programmable Technology, ICFPT 2020} }
@book{ title = {2020 IEEE Canadian Conference on Electrical and Computer Engineering (CCECE)}, type = {book}, year = {2020}, publisher = {IEEE}, id = {b4a6165c-0e5e-367d-b186-cc5aeac7715f}, created = {2023-11-07T10:04:29.141Z}, file_attached = {true}, profile_id = {78e67dcc-28e6-3300-a4ed-85434b13f01f}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2024-01-09T14:34:42.396Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, notes = {<b>Novel Case Study and Benchmarking of AlexNet for</b><br/><b>Edge AI: From CPU and GPU to FPGA</b><br/><br/>CNNs require parallelism – traditionally run on GPUs<br/>FPGAs are better suited for edge computing – low power consumption, high throughput and low latency<br/><br/>Uses AlexNet, an early CNN model, and explains the basics of CNNs<br/>GPUs are great for training the model – fastest, but less suited to inference on edge devices<br/><br/>Theano and Lasagne libraries used for running inference<br/>Comparison between an Intel Core i5-6400 and an Nvidia GeForce GTX 960<br/>FPGA used for testing: Xilinx PYNQ-Z1; Vivado used for building the IP<br/>Found the FPGA to run inference slightly faster than the Nvidia GPU and with a much smaller power draw}, folder_uuids = {1bffc8fa-4e6e-48c8-b694-323af7fbd0e8}, private_publication = {false}, abstract = {Title from content provider.}, bibtype = {book}, author = {} }
@article{ title = {Supervised fitting of geometric primitives to 3D point clouds}, type = {article}, year = {2019}, keywords = {Deep Learning,Grouping and Shape,Segmentation,Vision + Graphics}, pages = {2647-2655}, volume = {2019-June}, id = {a021728c-ff35-3aef-8710-e98e425ae596}, created = {2020-09-14T08:14:53.491Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-28T07:56:39.136Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f,a89f4866-a7e8-4ea9-aa98-e3f470892f7c,2a0475f2-facb-4360-917f-00c5f8541f47,471f331d-8560-4b9e-b910-e5f849b6fcfd}, private_publication = {false}, abstract = {Fitting geometric primitives to 3D point cloud data bridges a gap between low-level digitized 3D data and high-level structural information on the underlying 3D shapes. As such, it enables many downstream applications in 3D data processing. For a long time, RANSAC-based methods have been the gold standard for such primitive fitting problems, but they require careful per-input parameter tuning and thus do not scale well for large datasets with diverse shapes. In this work, we introduce Supervised Primitive Fitting Network (SPFN), an end-to-end neural network that can robustly detect a varying number of primitives at different scales without any user control. The network is supervised using ground truth primitive surfaces and primitive membership for the input points. Instead of directly predicting the primitives, our architecture first predicts per-point properties and then uses a differential model estimation module to compute the primitive type and parameters. We evaluate our approach on a novel benchmark of ANSI 3D mechanical component models and demonstrate a significant improvement over both the state-of-the-art RANSAC-based methods and the direct neural prediction.}, bibtype = {article}, author = {Li, Lingxiao and Sung, Minhyuk and Dubrovina, Anastasia and Yi, Li and Guibas, Leonidas J.}, doi = {10.1109/CVPR.2019.00276}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition}, number = {Figure 1} }
@article{ title = {Unsupervised primitive discovery for improved 3D generative modeling}, type = {article}, year = {2019}, keywords = {3D from Single Image,Deep Learning,Image and Video Synthesis}, pages = {9731-9740}, volume = {2019-June}, id = {3b989a3c-f24b-31fc-97e1-ec18a8ce1496}, created = {2020-09-14T08:14:53.497Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-09-14T08:34:43.410Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47,8d18e62e-6e66-4acb-ae6a-b470435041d8}, private_publication = {false}, abstract = {3D shape generation is a challenging problem due to the high-dimensional output space and complex part configurations of real-world objects. As a result, existing algorithms experience difficulties in accurate generative modeling of 3D shapes. Here, we propose a novel factorized generative model for 3D shape generation that sequentially transitions from coarse to fine scale shape generation. To this end, we introduce an unsupervised primitive discovery algorithm based on a higher-order conditional random field model. Using the primitive parts for shapes as attributes, a parameterized 3D representation is modeled in the first stage. This representation is further refined in the next stage by adding fine scale details to shape. Our results demonstrate improved representation ability of the generative model and better quality samples of newly generated 3D shapes. Further, our primitive generation approach can accurately parse common objects into a simplified representation.}, bibtype = {article}, author = {Khan, Salman H. and Guo, Yulan and Hayat, Munawar and Barnes, Nick}, doi = {10.1109/CVPR.2019.00997}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Portable system for box volume measurement based on line-structured light vision and deep learning}, type = {article}, year = {2019}, keywords = {Deep learning,Edge detection,Line-structured light,Volume measurement}, volume = {19}, id = {6ddbab96-e087-370c-a852-eb6601608247}, created = {2020-09-14T08:14:53.838Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-25T14:53:37.141Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f,16688d52-1757-4ef4-badb-f53b700252a9,591145a4-49d3-4baf-a2cc-a1f3832f0e3e}, private_publication = {false}, abstract = {Portable box volume measurement has always been a popular issue in the intelligent logistic industry. This work presents a portable system for box volume measurement that is based on line-structured light vision and deep learning. This system consists of a novel 2 × 2 laser line grid projector, a sensor, and software modules, with which only two laser-modulated images of boxes are required for volume measurement. For laser-modulated images, a novel end-to-end deep learning model is proposed by using an improved holistically nested edge detection network to extract edges. Furthermore, an automatic one-step calibration method for the line-structured light projector is designed for fast calibration. The experimental results show that the measuring range of our proposed system is 100–1800 mm, with errors less than ±5.0 mm. Theoretical analysis indicates that within the measuring range of the system, the measurement uncertainty of the measuring device is ±0.52 mm to ±4.0 mm, which is consistent with the experimental results. The device size is 140 mm × 35 mm × 35 mm and the weight is 110 g, thus the system is suitable for portable automatic box volume measurement.}, bibtype = {article}, author = {Peng, Tao and Zhang, Zhijiang and Song, Yingjie and Chen, Fansheng and Zeng, Dan}, doi = {10.3390/s19183921}, journal = {Sensors (Switzerland)}, number = {18} }
@article{ title = {R2D2: Repeatable and Reliable Detector and Descriptor}, type = {article}, year = {2019}, websites = {http://arxiv.org/abs/1906.06195}, id = {6da400c6-a5c5-3055-8140-768b155273ef}, created = {2020-09-14T08:34:41.629Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-09-14T08:34:55.924Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47}, private_publication = {false}, abstract = {Interest point detection and local feature description are fundamental steps in many computer vision applications. Classical methods for these tasks are based on a detect-then-describe paradigm where separate handcrafted methods are used to first identify repeatable keypoints and then represent them with a local descriptor. Neural networks trained with metric learning losses have recently caught up with these techniques, focusing on learning repeatable saliency maps for keypoint detection and learning descriptors at the detected keypoint locations. In this work, we argue that salient regions are not necessarily discriminative, and therefore can harm the performance of the description. Furthermore, we claim that descriptors should be learned only in regions for which matching can be performed with high confidence. We thus propose to jointly learn keypoint detection and description together with a predictor of the local descriptor discriminativeness. This allows us to avoid ambiguous areas and leads to reliable keypoint detections and descriptions. Our detection-and-description approach, trained with self-supervision, can simultaneously output sparse, repeatable and reliable keypoints that outperforms state-of-the-art detectors and descriptors on the HPatches dataset. It also establishes a record on the recently released Aachen Day-Night localization dataset.}, bibtype = {article}, author = {Revaud, Jerome and Weinzaepfel, Philippe and De Souza, César and Pion, Noe and Csurka, Gabriela and Cabon, Yohann and Humenberger, Martin}, number = {NeurIPS} }
@article{ title = {Deep learning for multi-path error removal in tof sensors}, type = {article}, year = {2019}, keywords = {Convolutional Neural Networks,Denoising,Depth acquisition,Multi-path interference,ToF sensors}, pages = {410-426}, volume = {11131 LNCS}, id = {98e37398-82ad-3d40-b7eb-aa64b561e246}, created = {2020-09-14T10:49:26.365Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-09-15T13:17:04.519Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {6533efe4-7189-42a2-b4b6-a9f175595b19}, private_publication = {false}, abstract = {The removal of Multi-Path Interference (MPI) is one of the major open challenges in depth estimation with Time-of-Flight (ToF) cameras. In this paper we propose a novel method for MPI removal and depth refinement exploiting an ad-hoc deep learning architecture working on data from a multi-frequency ToF camera. In order to estimate the MPI we use a Convolutional Neural Network (CNN) made of two sub-networks: a coarse network analyzing the global structure of the data at a lower resolution and a fine one exploiting the output of the coarse network in order to remove the MPI while preserving the small details. The critical issue of the lack of ToF data with ground truth is solved by training the CNN with synthetic information. Finally, the residual zero-mean error is removed with an adaptive bilateral filter guided from a noise model for the camera. Experimental results prove the effectiveness of the proposed approach on both synthetic and real data.}, bibtype = {article}, author = {Agresti, Gianluca and Zanuttigh, Pietro}, doi = {10.1007/978-3-030-11015-4_30}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@inproceedings{ title = {Deep learning for multi-path error removal in tof sensors}, type = {inproceedings}, year = {2019}, keywords = {Convolutional Neural Networks,Denoising,Depth acquisition,Multi-path interference,ToF sensors}, id = {1de77a33-e79e-3007-b2e8-6a0b7451c9dc}, created = {2020-10-01T06:44:41.663Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-10T07:17:51.561Z}, read = {true}, starred = {true}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47,019ca26f-d15a-40ee-aa8d-7a0fbf949316}, private_publication = {false}, abstract = {The removal of Multi-Path Interference (MPI) is one of the major open challenges in depth estimation with Time-of-Flight (ToF) cameras. In this paper we propose a novel method for MPI removal and depth refinement exploiting an ad-hoc deep learning architecture working on data from a multi-frequency ToF camera. In order to estimate the MPI we use a Convolutional Neural Network (CNN) made of two sub-networks: a coarse network analyzing the global structure of the data at a lower resolution and a fine one exploiting the output of the coarse network in order to remove the MPI while preserving the small details. The critical issue of the lack of ToF data with ground truth is solved by training the CNN with synthetic information. Finally, the residual zero-mean error is removed with an adaptive bilateral filter guided from a noise model for the camera. Experimental results prove the effectiveness of the proposed approach on both synthetic and real data.}, bibtype = {inproceedings}, author = {Agresti, Gianluca and Zanuttigh, Pietro}, doi = {10.1007/978-3-030-11015-4_30}, booktitle = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {Deep Learning for 3D Point Clouds: A Survey}, type = {article}, year = {2019}, pages = {1-24}, websites = {http://arxiv.org/abs/1912.12033}, id = {b64df2c1-d81c-3874-a6e5-a7829cfca659}, created = {2020-10-01T13:19:39.072Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-25T09:18:47.003Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c,6533efe4-7189-42a2-b4b6-a9f175595b19}, private_publication = {false}, abstract = {Point cloud learning has lately attracted increasing attention due to its wide applications in many areas, such as computer vision, autonomous driving, and robotics. As a dominating technique in AI, deep learning has been successfully used to solve various 2D vision problems. However, deep learning on point clouds is still in its infancy due to the unique challenges faced by the processing of point clouds with deep neural networks. Recently, deep learning on point clouds has become even thriving, with numerous methods being proposed to address different problems in this area. To stimulate future research, this paper presents a comprehensive review of recent progress in deep learning methods for point clouds. It covers three major tasks, including 3D shape classification, 3D object detection and tracking, and 3D point cloud segmentation. It also presents comparative results on several publicly available datasets, together with insightful observations and inspiring future research directions.}, bibtype = {article}, author = {Guo, Yulan and Wang, Hanyun and Hu, Qingyong and Liu, Hao and Liu, Li and Bennamoun, Mohammed} }
@article{ title = {PV-RCNN: Point-Voxel Feature Set Abstraction for 3D Object Detection}, type = {article}, year = {2019}, websites = {http://arxiv.org/abs/1912.13192}, id = {b74da4d0-777e-3cde-946d-e387c025516f}, created = {2020-10-01T13:48:35.424Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-10-02T09:06:26.288Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Shi2019}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {We present a novel and high-performance 3D object detection framework, named PointVoxel-RCNN (PV-RCNN), for accurate 3D object detection from point clouds. Our proposed method deeply integrates both 3D voxel Convolutional Neural Network (CNN) and PointNet-based set abstraction to learn more discriminative point cloud features. It takes advantages of efficient learning and high-quality proposals of the 3D voxel CNN and the flexible receptive fields of the PointNet-based networks. Specifically, the proposed framework summarizes the 3D scene with a 3D voxel CNN into a small set of keypoints via a novel voxel set abstraction module to save follow-up computations and also to encode representative scene features. Given the high-quality 3D proposals generated by the voxel CNN, the RoI-grid pooling is proposed to abstract proposal-specific features from the keypoints to the RoI-grid points via keypoint set abstraction with multiple receptive fields. Compared with conventional pooling operations, the RoI-grid feature points encode much richer context information for accurately estimating object confidences and locations. Extensive experiments on both the KITTI dataset and the Waymo Open dataset show that our proposed PV-RCNN surpasses state-of-the-art 3D detection methods with remarkable margins by using only point clouds.}, bibtype = {article}, author = {Shi, Shaoshuai and Guo, Chaoxu and Jiang, Li and Wang, Zhe and Shi, Jianping and Wang, Xiaogang and Li, Hongsheng} }
@article{ title = {Lossy Point Cloud Geometry Compression via End-to-End Learning}, type = {article}, year = {2019}, pages = {1-13}, websites = {http://arxiv.org/abs/1909.12037}, id = {c6785f94-996b-3081-afcc-9d7863f5bfdd}, created = {2020-10-06T08:41:12.197Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-12T07:47:40.901Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Wang2019}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a,1853f94b-7af1-40fa-b068-4758e9a02bc4,8d18e62e-6e66-4acb-ae6a-b470435041d8,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {This paper presents a novel end-to-end Learned Point Cloud Geometry Compression (a.k.a., Learned-PCGC) framework, to efficiently compress the point cloud geometry (PCG) using deep neural networks (DNN) based variational autoencoders (VAE). In our approach, PCG is first voxelized, scaled and partitioned into non-overlapped 3D cubes, which is then fed into stacked 3D convolutions for compact latent feature and hyperprior generation. Hyperpriors are used to improve the conditional probability modeling of latent features. A weighted binary cross-entropy (WBCE) loss is applied in training while an adaptive thresholding is used in inference to remove unnecessary voxels and reduce the distortion. Objectively, our method exceeds the geometry-based point cloud compression (G-PCC) algorithm standardized by well-known Moving Picture Experts Group (MPEG) with a significant performance margin, e.g., at least 60% BD-Rate (Bjontegaard Delta Rate) gains, using common test datasets. Subjectively, our method has presented better visual quality with smoother surface reconstruction and appealing details, in comparison to all existing MPEG standard compliant PCC methods. Our method requires about 2.5MB parameters in total, which is a fairly small size for practical implementation, even on embedded platform. Additional ablation studies analyze a variety of aspects (e.g., cube size, kernels, etc) to explore the application potentials of our learned-PCGC.}, bibtype = {article}, author = {Wang, Jianqiang and Zhu, Hao and Ma, Zhan and Chen, Tong and Liu, Haojie and Shen, Qiu} }
@article{ title = {Dynamic graph CNN for learning on point clouds}, type = {article}, year = {2019}, keywords = {Classification,Point cloud,Segmentation}, volume = {38}, id = {651a7f3a-723e-35d6-82ac-ace575d379b3}, created = {2020-10-15T09:39:12.549Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T10:19:39.451Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c,20ccb950-fef9-4ee1-800c-a60ba9f1df16}, private_publication = {false}, abstract = {Point clouds provide a flexible geometric representation suitable for countless applications in computer graphics; they also comprise the raw output of most 3D data acquisition devices. While hand-designed features on point clouds have long been proposed in graphics and vision, however, the recent overwhelming success of convolutional neural networks (CNNs) for image analysis suggests the value of adapting insight from CNN to the point cloud world. Point clouds inherently lack topological information, so designing a model to recover topology can enrich the representation power of point clouds. To this end, we propose a new neural network module dubbed EdgeConv suitable for CNN-based high-level tasks on point clouds, including classification and segmentation. EdgeConv acts on graphs dynamically computed in each layer of the network. It is differentiable and can be plugged into existing architectures. Compared to existing modules operating in extrinsic space or treating each point independently, EdgeConv has several appealing properties: It incorporates local neighborhood information; it can be stacked and applied to learn global shape properties; and in multi-layer systems affinity in feature space captures semantic characteristics over potentially long distances in the original embedding. We show the performance of our model on standard benchmarks, including ModelNet40, ShapeNetPart, and S3DIS.}, bibtype = {article}, author = {Wang, Yue and Sun, Yongbin and Liu, Ziwei and Sarma, Sanjay E. and Bronstein, Michael M. and Solomon, Justin M.}, doi = {10.1145/3326362}, journal = {ACM Transactions on Graphics}, number = {5} }
@article{ title = {MeshCNN: A Network with an Edge}, type = {article}, year = {2019}, volume = {38}, id = {17322063-e03a-3140-b8c4-520799e17a58}, created = {2020-10-20T09:48:06.313Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-27T08:16:07.027Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, bibtype = {article}, author = {Hanocka, Rana and Hertz, Amir and Fish, Noa and Giryes, Raja and Fleishman, Shachar and Cohen-Or, Daniel}, journal = {ACM Transactions on Graphics}, number = {4} }
@inproceedings{ title = {Deep end-to-end alignment and refinement for time-of-flight RGB-D module}, type = {inproceedings}, year = {2019}, pages = {9993-10002}, volume = {2019-Octob}, id = {6b87a0e4-40bd-3eee-b3e5-96ed49714212}, created = {2020-10-23T05:18:20.985Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-10-23T05:19:06.382Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47}, private_publication = {false}, abstract = {Recently, it is increasingly popular to equip mobile RGB cameras with Time-of-Flight (ToF) sensors for active depth sensing. However, for off-the-shelf ToF sensors, one must tackle two problems in order to obtain high-quality depth with respect to the RGB camera, namely 1) online calibration and alignment; and 2) complicated error correction for ToF depth sensing. In this work, we propose a framework for jointly alignment and refinement via deep learning. First, a cross-modal optical flow between the RGB image and the ToF amplitude image is estimated for alignment. The aligned depth is then refined via an improved kernel predicting network that performs kernel normalization and applies the bias prior to the dynamic convolution. To enrich our data for end-to-end training, we have also synthesized a dataset using tools from computer graphics. Experimental results demonstrate the effectiveness of our approach, achieving state-of-the-art for ToF refinement.}, bibtype = {inproceedings}, author = {Qiu, Di and Pang, Jiahao and Sun, Wenxiu and Yang, Chengxi}, doi = {10.1109/ICCV.2019.01009}, booktitle = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {Robust normal estimation for 3D LiDAR point clouds in urban environments}, type = {article}, year = {2019}, keywords = {LiDAR point cloud,Robust normal estimation,Segmentation,Urban environments}, pages = {1-17}, volume = {19}, id = {c98ec297-6379-3956-a0e9-1144ed5d9997}, created = {2020-11-03T13:16:20.006Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-03T13:17:25.149Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {Normal estimation is a crucial first step for numerous light detection and ranging (LiDAR) data-processing algorithms, from building reconstruction, road extraction, and ground-cover classification to scene rendering. For LiDAR point clouds in urban environments, this paper presents a robust method to estimate normals by constructing an octree-based hierarchical representation for the data and detecting a group of large enough consistent neighborhoods at multiscales. Consistent neighborhoods are mainly determined based on the observation that an urban environment is typically comprised of regular objects, e.g., buildings, roads, and the ground surface, and irregular objects, e.g., trees and shrubs; the surfaces of most regular objects can be approximatively represented by a group of local planes. Even in the frequent presence of heavy noise and anisotropic point samplings in LiDAR data, our method is capable of estimating robust normals for kinds of objects in urban environments, and the estimated normals are beneficial to more accurately segment and identify the objects, as well as preserving their sharp features and complete outlines. The proposed method was experimentally validated both on synthetic and real urban LiDAR datasets, and was compared to state-of-the-art methods.}, bibtype = {article}, author = {Zhao, Ruibin and Pang, Mingyong and Liu, Caixia and Zhang, Yanling}, doi = {10.3390/s19051248}, journal = {Sensors (Switzerland)}, number = {5} }
@article{ title = {Nesti-net: Normal estimation for unstructured 3D point clouds using convolutional neural networks}, type = {article}, year = {2019}, keywords = {Deep Learning,Vision + Graphics}, pages = {10104-10112}, volume = {2019-June}, id = {fbff6a63-bd73-3fc2-a6f8-9080279be3ee}, created = {2020-11-03T13:16:20.066Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-09T09:00:51.702Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {In this paper, we propose a normal estimation method for unstructured 3D point clouds. This method, called Nesti-Net, builds on a new local point cloud representation which consists of multi-scale point statistics (MuPS), estimated on a local coarse Gaussian grid. This representation is a suitable input to a CNN architecture. The normals are estimated using a mixture-of-experts (MoE) architecture, which relies on a data-driven approach for selecting the optimal scale around each point and encourages sub-network specialization. Interesting insights into the network's resource distribution are provided. The scale prediction significantly improves robustness to different noise levels, point density variations and different levels of detail. We achieve state-of-the-art results on a benchmark synthetic dataset and present qualitative results on real scanned scenes.}, bibtype = {article}, author = {Ben-Shabat, Yizhak and Lindenbaum, Michael and Fischer, Anath}, doi = {10.1109/CVPR.2019.01035}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Unsupervised domain adaptation for TOF data denoising with adversarial learning}, type = {article}, year = {2019}, keywords = {3D from Multiview and Sensors,Datasets and Evaluation,Deep Learning,RGBD sensors and analytics}, pages = {5579-5586}, volume = {2019-June}, id = {d40340d8-6493-3ef9-a045-04a035f6f397}, created = {2020-11-03T13:16:20.290Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-11T06:38:12.936Z}, read = {true}, starred = {true}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47}, private_publication = {false}, abstract = {Time-of-Flight data is typically affected by a high level of noise and by artifacts due to Multi-Path Interference (MPI). While various traditional approaches for ToF data improvement have been proposed, machine learning techniques have seldom been applied to this task, mostly due to the limited availability of real world training data with depth ground truth. In this paper, we avoid to rely on labeled real data in the learning framework. A Coarse-Fine CNN, able to exploit multi-frequency ToF data for MPI correction, is trained on synthetic data with ground truth in a supervised way. In parallel, an adversarial learning strategy, based on the Generative Adversarial Networks (GAN) framework, is used to perform an unsupervised pixel-level domain adaptation from synthetic to real world data, exploiting unlabeled real world acquisitions. Experimental results demonstrate that the proposed approach is able to effectively denoise real world data and to outperform state-of-the-art techniques.}, bibtype = {article}, author = {Agresti, Gianluca and Schaefer, Henrik and Sartor, Piergiorgio and Zanuttigh, Pietro}, doi = {10.1109/CVPR.2019.00573}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Ground-aware point cloud semantic segmentation for autonomous driving}, type = {article}, year = {2019}, keywords = {Autonomous driving,Point clouds,Semantic segmentation,Sparse LiDAR}, pages = {971-979}, id = {eff4ca24-3521-342e-8d67-4bf8ff9399fc}, created = {2020-11-03T13:35:21.010Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-03T13:35:28.245Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {Semantic understanding of 3D scenes is essential for autonomous driving. Although a number of efforts have been devoted to semantic segmentation of dense point clouds, the great sparsity of 3D LiDAR data poses significant challenges in autonomous driving. In this paper, we work on the semantic segmentation problem of extremely sparse LiDAR point clouds with specific consideration of the ground as reference. In particular, we propose a ground-aware framework that well solves the ambiguity caused by data sparsity. We employ a multi-section plane fitting approach to roughly extract ground points to assist segmentation of objects on the ground. Based on the roughly extracted ground points, our approach implicitly integrates the ground information in a weakly-supervised manner and utilizes ground-aware features with a new ground-aware attention module. The proposed ground-aware attention module captures long-range dependence between ground and objects, which significantly facilitates the segmentation of small objects that only consist of a few points in extremely sparse point clouds. Extensive experiments on two large-scale LiDAR point cloud datasets for autonomous driving demonstrate that the proposed method achieves state-of-the-art performance both quantitatively and qualitatively. The project and dataset are available at www.moonx.ai/#/open.}, bibtype = {article}, author = {Wu, Jian and Jiao, Jianbo and Yang, Qingxiong and Zha, Zheng Jun and Chen, Xuejin}, doi = {10.1145/3343031.3351076}, journal = {MM 2019 - Proceedings of the 27th ACM International Conference on Multimedia} }
@article{ title = {NormNet: Point-wise normal estimation network for three-dimensional point cloud data}, type = {article}, year = {2019}, keywords = {3-D deep learning,3-D indoor LiDAR data set,3-D sensor system,Normal estimation,robustness}, pages = {1-11}, volume = {16}, id = {1a464760-9c54-354d-9923-621f9a30fea3}, created = {2020-11-13T11:34:37.182Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-13T11:34:55.318Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {In this article, a point-wise normal estimation network for three-dimensional point cloud data called NormNet is proposed. We propose the multiscale K-nearest neighbor convolution module for strengthened local feature extraction. With the multiscale K-nearest neighbor convolution module and PointNet-like architecture, we achieved a hybrid of three features: a global feature, a semantic feature from the segmentation network, and a local feature from the multiscale K-nearest neighbor convolution module. Those features, by mutually supporting each other, not only increase the normal estimation performance but also enable the estimation to be robust under severe noise perturbations or point deficiencies. The performance was validated in three different data sets: Synthetic CAD data (ModelNet), RGB-D sensor-based real 3D PCD (S3DIS), and LiDAR sensor-based real 3D PCD that we built and shared.}, bibtype = {article}, author = {Hyeon, Janghun and Lee, Weonsuk and Kim, Joo Hyung and Doh, Nakju}, doi = {10.1177/1729881419857532}, journal = {International Journal of Advanced Robotic Systems}, number = {4} }
@article{ title = {RGB-D-Based Object Recognition Using Multimodal Convolutional Neural Networks: A Survey}, type = {article}, year = {2019}, keywords = {Convolutional neural network,RGB-D,multimodal fusion,object recognition,survey}, pages = {43110-43136}, volume = {7}, publisher = {IEEE}, id = {3dbabd47-6c64-3879-8810-29799b75c2e5}, created = {2020-11-16T11:56:20.670Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-03-04T15:41:02.760Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Gao2019}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c,f3937f49-b3bb-4fad-a335-4fb2125beca8,71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {Object recognition in real-world environments is one of the fundamental and key tasks in computer vision and robotics communities. With the advanced sensing technologies and low-cost depth sensors, the high-quality RGB and depth images can be recorded synchronously, and the object recognition performance can be improved by jointly exploiting them. RGB-D-based object recognition has evolved from early methods that using hand-crafted representations to the current state-of-the-art deep learning-based methods. With the undeniable success of deep learning, especially convolutional neural networks (CNNs) in the visual domain, the natural progression of deep learning research points to problems involving larger and more complex multimodal data. In this paper, we provide a comprehensive survey of recent multimodal CNNs (MMCNNs)-based approaches that have demonstrated significant improvements over previous methods. We highlight two key issues, namely, training data deficiency and multimodal fusion. In addition, we summarize and discuss the publicly available RGB-D object recognition datasets and present a comparative performance evaluation of the proposed methods on these benchmark datasets. Finally, we identify promising avenues of research in this rapidly evolving field. This survey will not only enable researchers to get a good overview of the state-of-the-art methods for RGB-D-based object recognition but also provide a reference for other multimodal machine learning applications, e.g., multimodal medical image fusion, audio-visual speech recognition, and multimedia retrieval and generation.}, bibtype = {article}, author = {Gao, Mingliang and Jiang, Jun and Zou, Guofeng and John, Vijay and Liu, Zheng}, doi = {10.1109/ACCESS.2019.2907071}, journal = {IEEE Access} }
@article{ title = {Floors are flat: Leveraging semantics for real-time surface normal prediction}, type = {article}, year = {2019}, keywords = {3d deep learning,3d prediction,Applications,Deep learning,Geometry,Mobile applications,Semantic labeling,Shape,Surface normals}, pages = {4065-4074}, id = {06fa73b0-0b63-387f-8d5c-881f3525578a}, created = {2020-12-04T10:44:07.991Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-10T07:17:53.207Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47,019ca26f-d15a-40ee-aa8d-7a0fbf949316,71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {We propose 4 insights that help to significantly improve the performance of deep learning models that predict surface normals and semantic labels from a single RGB image. These insights are: (1) denoise the 'ground truth' surface normals in the training set to ensure consistency with the semantic labels; (2) concurrently train on a mix of real and synthetic data, instead of pretraining on synthetic and finetuning on real; (3) jointly predict normals and semantics using a shared model, but only backpropagate errors on pixels that have valid training labels; (4) slim down the model and use grayscale instead of color inputs. Despite the simplicity of these steps, we demonstrate consistently improved state of the art results on several datasets, using a model that runs at 12 fps on a standard mobile phone.}, bibtype = {article}, author = {Hickson, Steven and Raveendran, Karthik and Fathi, Alireza and Murphy, Kevin and Essa, Irfan}, doi = {10.1109/ICCVW.2019.00501}, journal = {Proceedings - 2019 International Conference on Computer Vision Workshop, ICCVW 2019} }
@article{ title = {Uncertainty-aware occupancy map prediction using generative networks for robot navigation}, type = {article}, year = {2019}, pages = {5453-5459}, volume = {2019-May}, id = {08747f31-5373-3f5d-ba74-bb50b77d503c}, created = {2021-01-25T14:53:33.514Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T08:36:14.223Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f,520159f7-1eb4-4d90-925c-ce42ce7fb9d4,13d43b82-d9b4-40a8-9031-8e926a718ef0}, private_publication = {false}, abstract = {Efficient exploration through unknown environments remains a challenging problem for robotic systems. In these situations, the robot's ability to reason about its future motion is often severely limited by sensor field of view (FOV). By contrast, biological systems routinely make decisions by taking into consideration what might exist beyond their FOV based on prior experience. We present an approach for predicting occupancy map representations of sensor data for future robot motions using deep neural networks. We develop a custom loss function used to make accurate prediction while emphasizing physical boundaries. We further study extensions to our neural network architecture to account for uncertainty and ambiguity inherent in mapping and exploration. Finally, we demonstrate a combined map prediction and information-theoretic exploration strategy using the variance of the generated hypotheses as the heuristic for efficient exploration of unknown environments.}, bibtype = {article}, author = {Katyal, Kapil and Popek, Katie and Paxton, Chris and Burlina, Phil and Hager, Gregory D.}, doi = {10.1109/ICRA.2019.8793500}, journal = {Proceedings - IEEE International Conference on Robotics and Automation} }
@inproceedings{ title = {Deep end-to-end alignment and refinement for time-of-flight RGB-D module}, type = {inproceedings}, year = {2019}, pages = {9993-10002}, volume = {2019-Octob}, websites = {http://arxiv.org/abs/1909.07623}, month = {10}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, day = {1}, id = {e5c6d9e4-4b16-3372-a929-fccb8818f5bd}, created = {2021-01-27T12:45:43.298Z}, accessed = {2021-01-27}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-28T08:25:31.084Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {Recently, it is increasingly popular to equip mobile RGB cameras with Time-of-Flight (ToF) sensors for active depth sensing. However, for off-the-shelf ToF sensors, one must tackle two problems in order to obtain high-quality depth with respect to the RGB camera, namely 1) online calibration and alignment; and 2) complicated error correction for ToF depth sensing. In this work, we propose a framework for jointly alignment and refinement via deep learning. First, a cross-modal optical flow between the RGB image and the ToF amplitude image is estimated for alignment. The aligned depth is then refined via an improved kernel predicting network that performs kernel normalization and applies the bias prior to the dynamic convolution. To enrich our data for end-to-end training, we have also synthesized a dataset using tools from computer graphics. Experimental results demonstrate the effectiveness of our approach, achieving state-of-the-art for ToF refinement.}, bibtype = {inproceedings}, author = {Qiu, Di and Pang, Jiahao and Sun, Wenxiu and Yang, Chengxi}, doi = {10.1109/ICCV.2019.01009}, booktitle = {Proceedings of the IEEE International Conference on Computer Vision} }
@inproceedings{ title = {Searching for mobileNetV3}, type = {inproceedings}, year = {2019}, pages = {1314-1324}, volume = {2019-October}, websites = {http://arxiv.org/abs/1905.02244}, month = {10}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, day = {1}, id = {e12bc69a-6961-3901-b8b3-5a1936ee2494}, created = {2021-02-09T07:29:08.288Z}, accessed = {2021-02-09}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T07:38:54.880Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, abstract = {We present the next generation of MobileNets based on a combination of complementary search techniques as well as a novel architecture design. MobileNetV3 is tuned to mobile phone CPUs through a combination of hardware-aware network architecture search (NAS) complemented by the NetAdapt algorithm and then subsequently improved through novel architecture advances. This paper starts the exploration of how automated search algorithms and network design can work together to harness complementary approaches improving the overall state of the art. Through this process we create two new MobileNet models for release: MobileNetV3-Large and MobileNetV3-Small which are targeted for high and low resource use cases. These models are then adapted and applied to the tasks of object detection and semantic segmentation. For the task of semantic segmentation (or any dense pixel prediction), we propose a new efficient segmentation decoder Lite Reduced Atrous Spatial Pyramid Pooling (LR-ASPP). We achieve new state of the art results for mobile classification, detection and segmentation. MobileNetV3-Large is 3.2% more accurate on ImageNet classification while reducing latency by 20% compared to MobileNetV2. MobileNetV3-Small is 6.6% more accurate compared to a MobileNetV2 model with comparable latency. MobileNetV3-Large detection is over 25% faster at roughly the same accuracy as MobileNetV2 on COCO detection. MobileNetV3-Large LR-ASPP is 34% faster than MobileNetV2 R-ASPP at similar accuracy for Cityscapes segmentation.}, bibtype = {inproceedings}, author = {Howard, Andrew and Sandler, Mark and Chen, Bo and Wang, Weijun and Chen, Liang Chieh and Tan, Mingxing and Chu, Grace and Vasudevan, Vijay and Zhu, Yukun and Pang, Ruoming and Le, Quoc and Adam, Hartwig}, doi = {10.1109/ICCV.2019.00140}, booktitle = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {Point-Voxel CNN for Efficient 3D Deep Learning}, type = {article}, year = {2019}, websites = {http://arxiv.org/abs/1907.03739}, month = {7}, publisher = {arXiv}, day = {8}, id = {e09f2027-4299-33d9-bfb4-dd125c445650}, created = {2021-02-09T07:49:16.863Z}, accessed = {2021-02-09}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-05-05T08:36:17.529Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Liu2019}, folder_uuids = {bc1835e2-32e3-4f2a-b03c-9540bbbd02e0}, private_publication = {false}, abstract = {We present Point-Voxel CNN (PVCNN) for efficient, fast 3D deep learning. Previous work processes 3D data using either voxel-based or point-based NN models. However, both approaches are computationally inefficient. The computation cost and memory footprints of the voxel-based models grow cubically with the input resolution, making it memory-prohibitive to scale up the resolution. As for point-based networks, up to 80% of the time is wasted on structuring the sparse data which have rather poor memory locality, not on the actual feature extraction. In this paper, we propose PVCNN that represents the 3D input data in points to reduce the memory consumption, while performing the convolutions in voxels to reduce the irregular, sparse data access and improve the locality. Our PVCNN model is both memory and computation efficient. Evaluated on semantic and part segmentation datasets, it achieves much higher accuracy than the voxel-based baseline with 10x GPU memory reduction; it also outperforms the state-of-the-art point-based models with 7x measured speedup on average. Remarkably, the narrower version of PVCNN achieves 2x speedup over PointNet (an extremely efficient model) on part and scene segmentation benchmarks with much higher accuracy. We validate the general effectiveness of PVCNN on 3D object detection: by replacing the primitives in Frustrum PointNet with PVConv, it outperforms Frustrum PointNet++ by 2.4% mAP on average with 1.5x measured speedup and GPU memory reduction.}, bibtype = {article}, author = {Liu, Zhijian and Tang, Haotian and Lin, Yujun and Han, Song}, journal = {arXiv} }
@article{ title = {A 3D Robot Self Filter for Next Best View Planning}, type = {article}, year = {2019}, keywords = {Motion planning,Next best view,Range sensing,Scene interpretation}, pages = {117-124}, publisher = {IEEE}, id = {338c3f96-5209-3213-ae7a-056bcd9ca0f2}, created = {2021-02-09T08:36:10.711Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T08:36:35.190Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {520159f7-1eb4-4d90-925c-ce42ce7fb9d4}, private_publication = {false}, abstract = {This paper investigates the use of a real-time self filter for a robot manipulator in next best view planning tasks. The robot is equipped with a depth sensor in eye-in-hand configuration. The goal of the next best view algorithm is to select at each iteration an optimal view pose for the sensor in order to optimize information gain to perform 3D reconstruction of a region of interest. An OpenGL-based filter was adopted, that is able to determine which pixels of the depth image are due to robot self observations. The filter was adapted to work with KinectFusion volumetric based 3D reconstruction. Experiments have been performed in a real scenario. Results indicate that removal of robot self observations prevents artifacts in the final 3D representation of the environment. Moreover, view poses where the robot would occlude the target regions can be successfully avoided. Finally, it is shown that a convex-hull robot model is preferable to a tight 3D CAD model, and that the filter can be integrated with a surfel-based next best view planner with negligible overhead.}, bibtype = {article}, author = {Monica, Riccardo and Aleotti, Jacopo}, doi = {10.1109/IRC.2019.00025}, journal = {Proceedings - 3rd IEEE International Conference on Robotic Computing, IRC 2019} }
@article{ title = {Volumetric next best view by 3d occupancy mapping using markov chain gibbs sampler for precise manufacturing}, type = {article}, year = {2019}, keywords = {3D reconstruction,Active vision,Markov chain Monte Carlo,Occupancy mapping,Viewpoint planning}, pages = {121949-121960}, volume = {7}, publisher = {IEEE}, id = {39e27072-ec0e-3a0c-8a70-475c9c1fd166}, created = {2021-02-09T08:36:10.875Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T08:36:37.841Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {520159f7-1eb4-4d90-925c-ce42ce7fb9d4}, private_publication = {false}, abstract = {In this paper, we propose a model-free volumetric Next Best View (NBV) algorithm for accurate 3D reconstruction using a Markov Chain Monte Carlo method for high-mix-low-volume objects in manufacturing. The volumetric information gain based Next Best View algorithm can in real-time select the next optimal view that reveals the maximum uncertainty of the scanning environment with respect to a partially reconstructed 3D Occupancy map, without any priori knowledge of the target. Traditional Occupancy grid maps make two independence assumptions for computational tractability but suffer from the overconfident estimation of the occupancy probability for each voxel leading to less precise surface reconstructions. This paper proposes a special case of the Markov Chain Monte Carlo (MCMC) method, the Gibbs sampler, to accurately estimate the posterior occupancy probability of a voxel by randomly sampling from its high-dimensional full posterior occupancy probability given the entire volumetric map with respect to the forward sensor model with a Gaussian distribution. Numerical experiments validate the performance of the MCMC Gibbs sampler algorithm under the ROS-Industry framework to prove the accuracy of the reconstructed Occupancy map and the completeness of the registered point cloud. The proposed MCMC Occupancy mapping could be used to optimise the tuning parameters of the online NBV algorithms via the inverse sensor model to realise industry automation.}, bibtype = {article}, author = {Hou, Lei and Chen, Xiaopeng and Lan, Kunyan and Rasmussen, Rune and Roberts, Jonathan}, doi = {10.1109/ACCESS.2019.2935547}, journal = {IEEE Access}, number = {Mcmc} }
@article{ title = {Fitting cylindrical objects in 3-d point cloud using contextual and geometrical constraints}, type = {article}, year = {2019}, keywords = {3D point cloud,Cylinder fitting for finding,Primitive shape estimation,Quality of samples,RANSAC variations}, pages = {41-60}, volume = {35}, id = {7df8173f-0123-35a0-8b60-9077c8e1eb2b}, created = {2021-02-09T17:05:46.613Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T17:13:15.578Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {beecb55d-84d0-48a2-a344-e50cfe559467,642b5766-9420-4264-86e7-5aeb1c875ade}, private_publication = {false}, abstract = {In this paper, we propose a framework for fitting cylindrical objects toward deploying an object-finding-aided system for visually impaired people. The proposed framework consists of a RANSAC-based algorithm and a model verification scheme. The proposed robust estimator named GCSAC (Geometrical Constraint SAmple Consensus) avoids expensive computation of the RANSAC-based algorithms due to its random drawing of samples. To do this, GCSAC utilizes some geometrical constraints for selecting good samples. These constraints are raised from real scenarios or practical applications. First, the samples must ensure being consistent with the estimated model; second, the selected samples must satisfy explicit geometrical constraints of the interested objects. In addition, the estimated model is verified by using contextual constraints, which could be raised from a certain scene such as object standing on a table plane, size of object, and so on. GCSAC's implementations are carried out for various estimation problems on the synthesized dataset. The comparisons between GCSAC and MLESAC algorithm are implemented on three public datasets in terms of accuracy of the estimated model and the computational time. Details of algorithm implementation and evaluation datasets are publicly available.}, bibtype = {article}, author = {Vu, Hai and Le, Van Hung and Nguyen, Thi Thuy and Le, Thi Lan and Tran, Thanh Hai}, doi = {10.6688/JISE.201901_35(1).0003}, journal = {Journal of Information Science and Engineering}, number = {1} }
@article{ title = {Point Pair Feature Matching: Evaluating Methods to Detect Simple Shapes}, type = {article}, year = {2019}, keywords = {Object detection,Point Pair Features,Pose estimation}, pages = {445-456}, volume = {11754 LNCS}, id = {f9767e6c-afee-3a32-8495-58887ca2d3c1}, created = {2021-02-09T17:13:15.137Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T17:13:18.992Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {642b5766-9420-4264-86e7-5aeb1c875ade}, private_publication = {false}, abstract = {A recent benchmark for 3D object detection and 6D pose estimation from RGB-D images shows the dominance of methods based on Point Pair Feature Matching (PPFM). Since its invention in 2010 several modifications have been proposed to cope with its weaknesses, which are computational complexity, sensitivity to noise, and difficulties in the detection of geometrically simple objects with planar surfaces and rotational symmetries. In this work we focus on the latter. We present a novel approach to automatically detect rotational symmetries by matching the object model to itself. Furthermore, we adapt methods for pose verification and use more discriminative features which incorporate global information into the Point Pair Feature. We also examine the effects of other, already existing extensions by testing them on our specialized dataset for geometrically primitive objects. Results show that particularly our handling of symmetries and the augmented features are able to boost recognition rates.}, bibtype = {article}, author = {Ziegler, Markus and Rudorfer, Martin and Kroischke, Xaver and Krone, Sebastian and Krüger, Jörg}, doi = {10.1007/978-3-030-34995-0_40}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {Self-attention generative adversarial networks}, type = {article}, year = {2019}, pages = {12744-12753}, volume = {2019-June}, id = {247a22bb-d75f-3063-982f-db8d5efed35d}, created = {2021-02-15T14:14:03.608Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-16T07:08:08.390Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2f2b519d-56f0-4e04-b335-d8e25f087073}, private_publication = {false}, abstract = {In this paper, we propose the Self-Attention Generative Adversarial Network (SAGAN) which allows attention-driven, long-range dependency modeling for image generation tasks. Traditional convolutional GANs generate high-resolution details as a function of only spatially local points in lower-resolution feature maps. In SAGAN, details can be generated using cues from all feature locations. Moreover, the discriminator can check that highly detailed features in distant portions of the image are consistent with each other. Furthermore, recent work has shown that generator conditioning affects GAN performance. Leveraging this insight, we apply spectral normalization to the GAN generator and find that this improves training dynamics. The proposed SAGAN performs better than prior work, boosting the best published Inception score from 36.8 to 52.52 and reducing Fréchet Inception distance from 27.62 to 18.65 on the challenging ImageNet dataset. Visualization of the attention layers shows that the generator leverages neighborhoods that correspond to object shapes rather than local regions of fixed shape.}, bibtype = {article}, author = {Zhang, Han and Goodfellow, Ian and Metaxas, Dimitris and Odena, Augustus}, journal = {36th International Conference on Machine Learning, ICML 2019} }
@article{ title = {A new fast filtering algorithm for a 3D point cloud based on RGB-D information}, type = {article}, year = {2019}, pages = {1-21}, volume = {14}, id = {6b3bc417-57e1-31d3-94f2-edf8fee3a768}, created = {2021-03-08T09:43:04.113Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-03-19T07:57:50.869Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Jia2019}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {A point cloud that is obtained by an RGB-D camera will inevitably be affected by outliers that do not belong to the surface of the object, which is due to the different viewing angles, light intensities, and reflective characteristics of the object surface and the limitations of the sensors. An effective and fast outlier removal method based on RGB-D information is proposed in this paper. This method aligns the color image to the depth image, and the color mapping image is converted to an HSV image. Then, the optimal segmentation threshold of the V image that is calculated by using the Otsu algorithm is applied to segment the color mapping image into a binary image, which is used to extract the valid point cloud from the original point cloud with outliers. The robustness of the proposed method to the noise types, light intensity and contrast is evaluated by using several experiments; additionally, the method is compared with other filtering methods and applied to independently developed foot scanning equipment. The experimental results show that the proposed method can remove all type of outliers quickly and effectively.}, bibtype = {article}, author = {Jia, Chaochuan and Yang, Ting and Wang, Chuanjiang and Fan, Binghui and He, Fugui}, doi = {10.1371/journal.pone.0220253}, journal = {PLoS ONE}, number = {8} }
@article{ title = {Total denoising: Unsupervised learning of 3D point cloud cleaning}, type = {article}, year = {2019}, pages = {52-60}, volume = {2019-Octob}, id = {c8121fbb-adb2-388e-b04d-48d000375bf6}, created = {2021-03-09T06:55:59.696Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-03-09T06:56:06.628Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {We show that denoising of 3D point clouds can be learned unsupervised, directly from noisy 3D point cloud data only. This is achieved by extending recent ideas from learning of unsupervised image denoisers to unstructured 3D point clouds. Unsupervised image denoisers operate under the assumption that a noisy pixel observation is a random realization of a distribution around a clean pixel value, which allows appropriate learning on this distribution to eventually converge to the correct value. Regrettably, this assumption is not valid for unstructured points: 3D point clouds are subject to total noise, i.e. deviations in all coordinates, with no reliable pixel grid. Thus, an observation can be the realization of an entire manifold of clean 3D points, which makes the quality of a naïve extension of unsupervised image denoisers to 3D point clouds unfortunately only little better than mean filtering. To overcome this, and to enable effective and unsupervised 3D point cloud denoising, we introduce a spatial prior term, that steers converges to the unique closest out of the many possible modes on the manifold. Our results demonstrate unsupervised denoising performance similar to that of supervised learning with clean data when given enough training examples - whereby we do not need any pairs of noisy and clean training data.}, bibtype = {article}, author = {Casajus, Pedro Hermosilla and Ritschel, Tobias and Ropinski, Timo}, doi = {10.1109/ICCV.2019.00014}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {CNN-based Lidar Point Cloud De-Noising in Adverse Weather}, type = {article}, year = {2019}, pages = {2514-2521}, volume = {5}, id = {536e17e6-4de3-3825-9b11-4eae52b63dcc}, created = {2021-04-15T14:18:52.485Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-15T14:19:05.353Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {Lidar sensors are frequently used in environment perception for autonomous vehicles and mobile robotics to complement camera, radar, and ultrasonic sensors. Adverse weather conditions are significantly impacting the performance of lidar-based scene understanding by causing undesired measurement points that in turn effect missing detections and false positives. In heavy rain or dense fog, water drops could be misinterpreted as objects in front of the vehicle which brings a mobile robot to a full stop. In this paper, we present the first CNN-based approach to understand and filter out such adverse weather effects in point cloud data. Using a large data set obtained in controlled weather environments, we demonstrate a significant performance improvement of our method over state-of-the-art involving geometric filtering. Data is available at https://github.com/rheinzler/PointCloudDeNoising.}, bibtype = {article}, author = {Heinzler, Robin and Piewak, Florian and Schindler, Philipp and Stork, Wilhelm}, journal = {arXiv}, number = {2} }
@article{ title = {Self-supervised deep depth denoising}, type = {article}, year = {2019}, pages = {1242-1251}, volume = {2019-Octob}, id = {85a894b5-3c3c-3e68-be40-e634a9f9bca6}, created = {2021-04-16T09:17:51.101Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-16T09:17:54.223Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {Depth perception is considered an invaluable source of information for various vision tasks. However, depth maps acquired using consumer-level sensors still suffer from non-negligible noise. This fact has recently motivated researchers to exploit traditional filters, as well as the deep learning paradigm, in order to suppress the aforementioned non-uniform noise, while preserving geometric details. Despite the effort, deep depth denoising is still an open challenge mainly due to the lack of clean data that could be used as ground truth. In this paper, we propose a fully convolutional deep autoencoder that learns to denoise depth maps, surpassing the lack of ground truth data. Specifically, the proposed autoencoder exploits multiple views of the same scene from different points of view in order to learn to suppress noise in a self-supervised end-to-end manner using depth and color information during training, yet only depth during inference. To enforce self-supervision, we leverage a differentiable rendering technique to exploit photometric supervision, which is further regularized using geometric and surface priors. As the proposed approach relies on raw data acquisition, a large RGB-D corpus is collected using Intel RealSense sensors. Complementary to a quantitative evaluation, we demonstrate the effectiveness of the proposed self-supervised denoising approach on established 3D reconstruction applications. Code is available at https://github.com/VCL3D/DeepDepthDenoising.}, bibtype = {article}, author = {Sterzentsenko, Vladimiros and Saroglou, Leonidas and Chatzitofis, Anargyros and Thermos, Spiros and Zioulis, Nikolaos and Doumanoglou, Alexandros and Zarpalas, Dimitrios and Daras, Petros}, doi = {10.1109/ICCV.2019.00133}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {Simplifying Graph Convolutional Networks}, type = {article}, year = {2019}, id = {88c441f7-d78c-3144-bdca-8835045b4d7e}, created = {2021-07-12T09:25:31.899Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T10:19:37.885Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {20ccb950-fef9-4ee1-800c-a60ba9f1df16,70eb910f-9399-46d8-a4d0-ade5435237b7}, private_publication = {false}, bibtype = {article}, author = {Wu, Felix and Zhang, Tianyi and de Souza, Amauri Holanda and Fifty, Christopher and Yu, Tao and Weinberger, Kilian Q.}, journal = {36th International Conference on Machine Learning, ICML 2019} }
@article{ title = {Janossy pooling: Learning deep permutation-invariant functions for variable-size inputs}, type = {article}, year = {2019}, pages = {1-21}, id = {53ec0ee4-26ee-361b-81a7-fae17120a47d}, created = {2021-07-12T09:42:27.132Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T10:19:40.893Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {20ccb950-fef9-4ee1-800c-a60ba9f1df16,70eb910f-9399-46d8-a4d0-ade5435237b7}, private_publication = {false}, abstract = {We consider a simple and overarching representation for permutation-invariant functions of sequences (or multiset functions). Our approach, which we call Janossy pooling, expresses a permutation-invariant function as the average of a permutation-sensitive function applied to all reorderings of the input sequence. This allows us to leverage the rich and mature literature on permutation-sensitive functions to construct novel and flexible permutation-invariant functions. If carried out naively, Janossy pooling can be computationally prohibitive. To allow computational tractability, we consider three kinds of approximations: canonical orderings of sequences, functions with k-order interactions, and stochastic optimization algorithms with random permutations. Our framework unifies a variety of existing work in the literature, and suggests possible modeling and algorithmic extensions. We explore a few in our experiments, which demonstrate improved performance over current state-of-the-art methods.}, bibtype = {article}, author = {Murphy, Ryan L. and Srinivasan, Balasubramaniam and Ribeiro, Bruno and Rao, Vinayak}, journal = {7th International Conference on Learning Representations, ICLR 2019} }
@article{ title = {Relational pooling for graph representations}, type = {article}, year = {2019}, pages = {8192-8202}, volume = {2019-June}, id = {42192898-6e83-3a44-9075-e659cba7a255}, created = {2021-07-12T10:19:36.744Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T10:19:57.962Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {20ccb950-fef9-4ee1-800c-a60ba9f1df16}, private_publication = {false}, abstract = {This work generalizes graph neural networks (GNNs) beyond those based on the Weisfeiler-Lehman (WL) algorithm, graph Laplacians, and diffusions. Our approach, denoted Relational Pooling (RP), draws from the theory of finite partial exchangeability to provide a framework with maximal representation power for graphs. RP can work with existing graph representation models and, somewhat counterintuitively, can make them even more powerful than the original WL isomorphism test. Additionally, RP allows architectures like Recurrent Neural Networks and Convolutional Neural Networks to be used in a theoretically sound approach for graph classification. We demonstrate improved performance of RP-based graph representations over state-of-the-art methods on a number of tasks.}, bibtype = {article}, author = {Murphy, Ryan L. and Srinivasan, Balasubramaniam and Rao, Vinayak and Ribeiro, Bruno}, journal = {36th International Conference on Machine Learning, ICML 2019} }
@article{ title = {Position-aware graph neural networks}, type = {article}, year = {2019}, pages = {12372-12381}, volume = {2019-June}, id = {664d121b-d50a-3b8a-979e-c7f7f3b426f5}, created = {2021-07-12T10:19:36.761Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T10:19:58.935Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {20ccb950-fef9-4ee1-800c-a60ba9f1df16}, private_publication = {false}, abstract = {Learning node embeddings that capture a node's position within the broader graph structure is crucial for many prediction tasks on graphs. However, existing Graph Neural Network (GNN) architectures have limited power in capturing the position/location of a given node with respect to all other nodes of the graph. Here we propose Position-aware Graph Neural Networks (P-GNNs), a new class of GNNs for computing position-aware node embeddings. P-GNN first samples sets of anchor nodes, computes the distance of a given target node to each anchor-set, and then learns a non-linear distance-weighted aggregation scheme over the anchor-sets. This way P-GNNs can capture positions/locations of nodes with respect to the anchor nodes. P-GNNs have several advantages: they are inductive, scalable, and can incorporate node feature information. We apply P-GNNs to multiple prediction tasks including link prediction and community detection. We show that P-GNNs consistently outperform state of the art GNNs, with up to 66% improvement in terms of the ROC AUC score.}, bibtype = {article}, author = {You, Jiaxuan and Ying, Rex and Leskovec, Jure}, journal = {36th International Conference on Machine Learning, ICML 2019} }
@article{ title = {Weisfeiler and leman go neural: Higher-order graph neural networks}, type = {article}, year = {2019}, pages = {4602-4609}, id = {8c7ef763-64c1-3cda-900a-0d36d508e894}, created = {2021-07-12T10:19:36.814Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T10:20:04.991Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {20ccb950-fef9-4ee1-800c-a60ba9f1df16}, private_publication = {false}, abstract = {In recent years, graph neural networks (GNNs) have emerged as a powerful neural architecture to learn vector representations of nodes and graphs in a supervised, end-to-end fashion. Up to now, GNNs have only been evaluated empirically-showing promising results. The following work investigates GNNs from a theoretical point of view and relates them to the 1-dimensional Weisfeiler-Leman graph isomorphism heuristic (1-WL). We show that GNNs have the same expressiveness as the 1-WL in terms of distinguishing non-isomorphic (sub-)graphs. Hence, both algorithms also have the same shortcomings. Based on this, we propose a generalization of GNNs, so-called k-dimensional GNNs (k-GNNs), which can take higher-order graph structures at multiple scales into account. These higher-order structures play an essential role in the characterization of social networks and molecule graphs. Our experimental evaluation confirms our theoretical findings as well as confirms that higher-order information is useful in the task of graph classification and regression.}, bibtype = {article}, author = {Morris, Christopher and Ritzert, Martin and Fey, Matthias and Hamilton, William L. and Lenssen, Jan Eric and Rattan, Gaurav and Grohe, Martin}, doi = {10.1609/aaai.v33i01.33014602}, journal = {33rd AAAI Conference on Artificial Intelligence, AAAI 2019, 31st Innovative Applications of Artificial Intelligence Conference, IAAI 2019 and the 9th AAAI Symposium on Educational Advances in Artificial Intelligence, EAAI 2019} }
@article{ title = {The lottery ticket hypothesis: Finding sparse, trainable neural networks}, type = {article}, year = {2019}, pages = {1-42}, id = {039537a4-3a38-3976-b510-3947cf030d5e}, created = {2021-07-12T14:15:35.235Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:17:06.331Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {Neural network pruning techniques can reduce the parameter counts of trained networks by over 90%, decreasing storage requirements and improving computational performance of inference without compromising accuracy. However, contemporary experience is that the sparse architectures produced by pruning are difficult to train from the start, which would similarly improve training performance. We find that a standard pruning technique naturally uncovers subnetworks whose initializations made them capable of training effectively. Based on these results, we articulate the lottery ticket hypothesis: dense, randomly-initialized, feed-forward networks contain subnetworks (winning tickets) that-when trained in isolation-reach test accuracy comparable to the original network in a similar number of iterations. The winning tickets we find have won the initialization lottery: their connections have initial weights that make training particularly effective. We present an algorithm to identify winning tickets and a series of experiments that support the lottery ticket hypothesis and the importance of these fortuitous initializations. We consistently find winning tickets that are less than 10-20% of the size of several fully-connected and convolutional feed-forward architectures for MNIST and CIFAR10. Above this size, the winning tickets that we find learn faster than the original network and reach higher test accuracy.}, bibtype = {article}, author = {Frankle, Jonathan and Carbin, Michael}, journal = {7th International Conference on Learning Representations, ICLR 2019} }
@article{ title = {On Warm-Starting Neural Network Training}, type = {article}, year = {2019}, pages = {1-21}, websites = {http://arxiv.org/abs/1910.08475}, id = {ab470935-de99-3aed-9452-3ac28e1c70a0}, created = {2021-07-12T14:15:35.250Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:17:08.266Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {In many real-world deployments of machine learning systems, data arrive piecemeal. These learning scenarios may be passive, where data arrive incrementally due to structural properties of the problem (e.g., daily financial data) or active, where samples are selected according to a measure of their quality (e.g., experimental design). In both of these cases, we are building a sequence of models that incorporate an increasing amount of data. We would like each of these models in the sequence to be performant and take advantage of all the data that are available to that point. Conventional intuition suggests that when solving a sequence of related optimization problems of this form, it should be possible to initialize using the solution of the previous iterate—to “warm start” the optimization rather than initialize from scratch—and see reductions in wall-clock time. However, in practice this warm-starting seems to yield poorer generalization performance than models that have fresh random initializations, even though the final training losses are similar. While it appears that some hyperparameter settings allow a practitioner to close this generalization gap, they seem to only do so in regimes that damage the wall-clock gains of the warm start. Nevertheless, it is highly desirable to be able to warm-start neural network training, as it would dramatically reduce the resource usage associated with the construction of performant deep learning systems. In this work, we take a closer look at this empirical phenomenon and try to understand when and how it occurs. We also provide a surprisingly simple trick that overcomes this pathology in several important situations, and present experiments that elucidate some of its properties.}, bibtype = {article}, author = {Ash, Jordan T. and Adams, Ryan P.}, number = {NeurIPS} }
@article{ title = {Ray Interference: a Source of Plateaus in Deep Reinforcement Learning}, type = {article}, year = {2019}, pages = {1-17}, websites = {http://arxiv.org/abs/1904.11455}, id = {d1a6c993-edcc-3dcf-b2a9-fffcb67d1e25}, created = {2021-07-12T14:15:35.627Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:16:42.509Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {Rather than proposing a new method, this paper investigates an issue present in existing learning algorithms. We study the learning dynamics of reinforcement learning (RL), specifically a characteristic coupling between learning and data generation that arises because RL agents control their future data distribution. In the presence of function approximation, this coupling can lead to a problematic type of 'ray interference', characterized by learning dynamics that sequentially traverse a number of performance plateaus, effectively constraining the agent to learn one thing at a time even when learning in parallel is better. We establish the conditions under which ray interference occurs, show its relation to saddle points and obtain the exact learning dynamics in a restricted setting. We characterize a number of its properties and discuss possible remedies.}, bibtype = {article}, author = {Schaul, Tom and Borsa, Diana and Modayil, Joseph and Pascanu, Razvan} }
@article{ title = {A Deep Neural Network's Loss Surface Contains Every Low-dimensional Pattern}, type = {article}, year = {2019}, websites = {http://arxiv.org/abs/1912.07559}, id = {92c12fb0-ac5d-3a4e-ae67-c8a751756931}, created = {2021-07-12T14:15:35.732Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:16:52.335Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {The work "Loss Landscape Sightseeing with Multi-Point Optimization" (Skorokhodov and Burtsev, 2019) demonstrated that one can empirically find arbitrary 2D binary patterns inside loss surfaces of popular neural networks. In this paper we prove that: (i) this is a general property of deep universal approximators; and (ii) this property holds for arbitrary smooth patterns, for other dimensionalities, for every dataset, and any neural network that is sufficiently deep and wide. Our analysis predicts not only the existence of all such low-dimensional patterns, but also two other properties that were observed empirically: (i) that it is easy to find these patterns; and (ii) that they transfer to other data-sets (e.g. a test-set).}, bibtype = {article}, author = {Czarnecki, Wojciech Marian and Osindero, Simon and Pascanu, Razvan and Jaderberg, Max} }
@article{ title = {Reconciling modern machine-learning practice and the classical bias–variance trade-off}, type = {article}, year = {2019}, keywords = {Bias–variance trade-off,Machine learning,Neural networks}, pages = {15849-15854}, volume = {116}, id = {eb513a7d-cef8-3994-bd3c-7b62fa9f2886}, created = {2021-07-12T14:15:35.908Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:16:57.595Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {Breakthroughs in machine learning are rapidly changing science and society, yet our fundamental understanding of this technology has lagged far behind. Indeed, one of the central tenets of the field, the bias–variance trade-off, appears to be at odds with the observed behavior of methods used in modern machine-learning practice. The bias–variance trade-off implies that a model should balance underfitting and overfitting: Rich enough to express underlying structure in data and simple enough to avoid fitting spurious patterns. However, in modern practice, very rich models such as neural networks are trained to exactly fit (i.e., interpolate) the data. Classically, such models would be considered overfitted, and yet they often obtain high accuracy on test data. This apparent contradiction has raised questions about the mathematical foundations of machine learning and their relevance to practitioners. In this paper, we reconcile the classical understanding and the modern practice within a unified performance curve. This “double-descent” curve subsumes the textbook U-shaped bias–variance trade-off curve by showing how increasing model capacity beyond the point of interpolation results in improved performance. We provide evidence for the existence and ubiquity of double descent for a wide spectrum of models and datasets, and we posit a mechanism for its emergence. This connection between the performance and the structure of machine-learning models delineates the limits of classical analyses and has implications for both the theory and the practice of machine learning.}, bibtype = {article}, author = {Belkin, Mikhail and Hsu, Daniel and Ma, Siyuan and Mandal, Soumik}, doi = {10.1073/pnas.1903070116}, journal = {Proceedings of the National Academy of Sciences of the United States of America}, number = {32} }
@article{ title = {Geometry of Deep Convolutional Networks}, type = {article}, year = {2019}, pages = {1-15}, websites = {http://arxiv.org/abs/1905.08922}, id = {838657a4-9834-3313-a23f-07258c75df6f}, created = {2021-07-12T14:15:36.019Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:17:01.218Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {We give a formal procedure for computing preimages of convolutional network outputs using the dual basis defined from the set of hyperplanes associated with the layers of the network. We point out the special symmetry associated with arrangements of hyperplanes of convolutional networks that take the form of regular multidimensional polyhedral cones. We discuss the efficiency of large number of layers of nested cones that result from incremental small size convolutions in order to give a good compromise between efficient contraction of data to low dimensions and shaping of preimage manifolds. We demonstrate how a specific network flattens a non linear input manifold to an affine output manifold and discuss its relevance to understanding classification properties of deep networks.}, bibtype = {article}, author = {Carlsson, Stefan} }
@article{ title = {Exploring RGB+depth fusion for real-time object detection}, type = {article}, year = {2019}, keywords = {Depth,Neural networks,Object detection,RGB,RGBD,Sensor fusion,Single-shot}, volume = {19}, id = {e38c2729-e97d-363a-94fb-51322fb74e8a}, created = {2021-07-20T14:11:55.202Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-17T08:32:39.163Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Ophoff2019}, folder_uuids = {5cd4d7ce-c2fb-4e91-ab80-35deeb123df5}, private_publication = {false}, abstract = {In this paper, we investigate whether fusing depth information on top of normal RGB data for camera-based object detection can help to increase the performance of current state-of-the-art single-shot detection networks. Indeed, depth sensing is easily acquired using depth cameras such as a Kinect or stereo setups. We investigate the optimal manner to perform this sensor fusion with a special focus on lightweight single-pass convolutional neural network (CNN) architectures, enabling real-time processing on limited hardware. For this, we implement a network architecture allowing us to parameterize at which network layer both information sources are fused together. We performed exhaustive experiments to determine the optimal fusion point in the network, from which we can conclude that fusing towards the mid to late layers provides the best results. Our best fusion models significantly outperform the baseline RGB network in both accuracy and localization of the detections.}, bibtype = {article}, author = {Ophoff, Tanguy and Van Beeck, Kristof and Goedemé, Toon}, doi = {10.3390/s19040866}, journal = {Sensors (Switzerland)}, number = {4} }
@article{ title = {PU-GAN: A point cloud upsampling adversarial network}, type = {article}, year = {2019}, pages = {7202-7211}, volume = {2019-Octob}, id = {15fe8f1f-7dde-31ee-a010-557b4f36e3a6}, created = {2021-07-21T12:34:44.848Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-28T06:07:38.143Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Li2019}, folder_uuids = {4f36a0a5-b08a-4f70-b020-4daf83cb0507}, private_publication = {false}, abstract = {Point clouds acquired from range scans are often sparse, noisy, and non-uniform. This paper presents a new point cloud upsampling network called PU-GAN, which is formulated based on a generative adversarial network (GAN), to learn a rich variety of point distributions from the latent space and upsample points over patches on object surfaces. To realize a working GAN network, we construct an up-down-up expansion unit in the generator for upsampling point features with error feedback and self-correction, and formulate a self-attention unit to enhance the feature integration. Further, we design a compound loss with adversarial, uniform and reconstruction terms, to encourage the discriminator to learn more latent patterns and enhance the output point distribution uniformity. Qualitative and quantitative evaluations demonstrate the quality of our results over the state-of-the-arts in terms of distribution uniformity, proximity-to-surface, and 3D reconstruction quality.}, bibtype = {article}, author = {Li, Ruihui and Li, Xianzhi and Fu, Chi Wing and Cohen-Or, Daniel and Heng, Pheng Ann}, doi = {10.1109/ICCV.2019.00730}, journal = {Proceedings of the IEEE International Conference on Computer Vision}, number = {c} }
@article{ title = {RealPoint3D: An Efficient Generation Network for 3D Object Reconstruction from a Single Image}, type = {article}, year = {2019}, keywords = {3D reconstruction,nearest shape retrieval,point cloud generation,projection,single image}, pages = {57539-57549}, volume = {7}, publisher = {IEEE}, id = {901da53b-9b68-3d0f-9182-274ba7cc6a64}, created = {2021-07-21T12:55:19.301Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-26T12:19:40.239Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Zhang2019}, folder_uuids = {4f36a0a5-b08a-4f70-b020-4daf83cb0507}, private_publication = {false}, abstract = {The generation of 3D models from a single image has recently received much attention, based on which point cloud generation methods have been developed. However, most current 3D reconstruction methods only work for relatively pure backgrounds, which limit their applications on real images. Meanwhile, more fine-grained details are required to provide finer models. This paper proposes an end-to-end efficient generation network, which is composed of an encoder, a 2D-3D fusion module, and a decoder. First, a single-object image and a nearest-shape retrieval from ShapeNet are fed into the network; then, the two encoders are integrated adaptively according to their information integrity, followed by the decoder to obtain fine-grained point clouds. The point cloud from the nearest shape effectively instructs the generation of finer point clouds. To have a consistent spatial distribution from multi-view observations, our algorithm adopts projection loss as an additional supervisor. The experiments on complex and pure background images show that our method attains state-of-the-art accuracy compared with volumetric and point set generation methods, particularly toward fine-grained details, and it works well for both complex backgrounds and multiple view angles.}, bibtype = {article}, author = {Zhang, Yang and Liu, Zhen and Liu, Tianpeng and Peng, Bo and Li, Xiang}, doi = {10.1109/ACCESS.2019.2914150}, journal = {IEEE Access} }
@article{ title = {Patch-based progressive 3D point set upsampling}, type = {article}, year = {2019}, keywords = {3D from Multiview and Sensors,Deep Learning,Vision + Graphics}, pages = {5951-5960}, volume = {2019-June}, id = {ad576187-6bef-353f-9c56-3a9c7bf4d0a5}, created = {2021-07-21T12:55:19.321Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-26T12:19:40.307Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Yifan2019}, folder_uuids = {4f36a0a5-b08a-4f70-b020-4daf83cb0507}, private_publication = {false}, abstract = {We present a detail-driven deep neural network for point set upsampling. A high-resolution point set is essential for point-based rendering and surface reconstruction. Inspired by the recent success of neural image super-resolution techniques, we progressively train a cascade of patch-based upsampling networks on different levels of detail end-to-end. We propose a series of architectural design contributions that lead to a substantial performance boost. The effect of each technical contribution is demonstrated in an ablation study. Qualitative and quantitative experiments show that our method significantly outperforms the state-of-the-art learning-based and optimazation-based approaches, both in terms of handling low-resolution inputs and revealing high-fidelity details.}, bibtype = {article}, author = {Yifan, Wang and Wu, Shihao and Huang, Hui and Cohen-Or, Daniel and Sorkine-Hornung, Olga}, doi = {10.1109/CVPR.2019.00611}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Plant phenotyping by deep-learning-based planner for multi-robots}, type = {article}, year = {2019}, keywords = {Agricultural automation,Computer vision for automation,Multi-robot systems}, pages = {3113-3120}, volume = {4}, publisher = {IEEE}, id = {cf65eba5-875a-3636-9bb4-a5312644bdef}, created = {2021-07-26T12:19:39.679Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-03T10:14:34.062Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {5439d198-93d5-4603-a7ce-201d423f231e,520159f7-1eb4-4d90-925c-ce42ce7fb9d4}, private_publication = {false}, abstract = {Manual plant phenotyping is slow, error prone, and labor intensive. In this letter, we present an automated robotic system for fast, precise, and noninvasive measurements using a new deep-learning-based next-best view planning pipeline. Specifically, we first use a deep neural network to estimate a set of candidate voxels for the next scanning. Next, we cast rays from these voxels to determine the optimal viewpoints. We empirically evaluate our method in simulations and real-world robotic experiments with up to three robotic arms to demonstrate its efficiency and effectiveness. One advantage of our new pipeline is that it can be easily extended to a multi-robot system where multiple robots move simultaneously according to the planned motions. Our system significantly outperforms the single robot in flexibility and planning time. High-throughput phenotyping can be made practically.}, bibtype = {article}, author = {Wu, Chenming and Zeng, Rui and Pan, Jia and Wang, Charlie C.L. and Liu, Yong Jin}, doi = {10.1109/LRA.2019.2924125}, journal = {IEEE Robotics and Automation Letters}, number = {4} }
@article{ title = {Transferability of Spectral Graph Convolutional Neural Networks}, type = {article}, year = {2019}, pages = {1-45}, websites = {http://arxiv.org/abs/1907.12972}, id = {de664cd5-47aa-39d1-a8f8-e8fafa765a53}, created = {2021-08-04T09:51:19.876Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-21T13:25:20.720Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Levie2019}, private_publication = {false}, abstract = {This paper focuses on spectral graph convolutional neural networks (ConvNets), where filters are defined as elementwise multiplication in the frequency domain of a graph. In machine learning settings where the dataset consists of signals defined on many different graphs, the trained ConvNet should generalize to signals on graphs unseen in the training set. It is thus important to transfer ConvNets between graphs. Transferability, which is a certain type of generalization capability, can be loosely defined as follows: if two graphs describe the same phenomenon, then a single filter or ConvNet should have similar repercussions on both graphs. This paper aims at debunking the common misconception that spectral filters are not transferable. We show that if two graphs discretize the same "continuous" space, then a spectral filter or ConvNet has approximately the same repercussion on both graphs. Our analysis is more permissive than the standard analysis. Transferability is typically described as the robustness of the filter to small graph perturbations and re-indexing of the vertices. Our analysis accounts also for large graph perturbations. We prove transferability between graphs that can have completely different dimensions and topologies, only requiring that both graphs discretize the same underlying space in some generic sense.}, bibtype = {article}, author = {Levie, Ron and Huang, Wei and Bucci, Lorenzo and Bronstein, Michael M. and Kutyniok, Gitta} }
@article{ title = {CayleyNets: Graph Convolutional Neural Networks with Complex Rational Spectral Filters}, type = {article}, year = {2019}, keywords = {Geometric deep learning,graph convolution neural networks,graph giltering,spectral approaches}, pages = {97-109}, volume = {67}, publisher = {IEEE}, id = {4511bc8f-690b-3428-880a-97f39c7d9ceb}, created = {2021-08-04T09:51:19.994Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-21T13:25:20.718Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Levie2019a}, private_publication = {false}, abstract = {The rise of graph-structured data such as social networks, regulatory networks, citation graphs, and functional brain networks, in combination with resounding success of deep learning in various applications, has brought the interest in generalizing deep learning models to non-Euclidean domains. In this paper, we introduce a new spectral domain convolutional architecture for deep learning on graphs. The core ingredient of our model is a new class of parametric rational complex functions (Cayley polynomials) allowing to efficiently compute spectral filters on graphs that specialize on frequency bands of interest. Our model generates rich spectral filters that are localized in space, scales linearly with the size of the input data for sparsely connected graphs, and can handle different constructions of Laplacian operators. Extensive experimental results show the superior performance of our approach, in comparison to other spectral domain convolutional architectures, on spectral image classification, community detection, vertex classification, and matrix completion tasks.}, bibtype = {article}, author = {Levie, Ron and Monti, Federico and Bresson, Xavier and Bronstein, Michael M.}, doi = {10.1109/TSP.2018.2879624}, journal = {IEEE Transactions on Signal Processing}, number = {1} }
@article{ title = {Octree guided CNN with spherical kernels for 3D point clouds}, type = {article}, year = {2019}, keywords = {3D from Multiview and Sensors,Categorization,Deep Learning,Grouping and Shape,Recognition: Detection,Retrieval,Segmentation}, pages = {9623-9632}, volume = {2019-June}, id = {0d3a65bd-724e-3cb3-9f91-0a9d2048fc96}, created = {2021-08-04T13:05:07.914Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:19.637Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {We propose an octree guided neural network architecture and spherical convolutional kernel for machine learning from arbitrary 3D point clouds. The network architecture capitalizes on the sparse nature of irregular point clouds,and hierarchically coarsens the data representation with space partitioning. At the same time, the proposed spherical kernels systematically quantize point neighborhoods to identify local geometric structures in the data, while maintaining the properties of translation-invariance and asymmetry. We specify spherical kernels with the help of network neurons that in turn are associated with spatial locations.We exploit this association to avert dynamic kernel generation during network training that enables efficient learning with high resolution point clouds. The effectiveness of the proposed technique is established on the benchmark tasks of 3D object classification and segmentation, achieving competitive performance on ShapeNet and RueMonge2014 datasets.}, bibtype = {article}, author = {Lei, Huan and Akhtar, Naveed and Mian, Ajmal}, doi = {10.1109/CVPR.2019.00986}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Spectral-based Graph Convolutional Network for Directed Graphs}, type = {article}, year = {2019}, websites = {http://arxiv.org/abs/1907.08990}, id = {ad62470e-af5e-3e9c-b180-252d7294ae64}, created = {2021-08-04T13:05:08.033Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-14T20:02:09.832Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Ma2019}, private_publication = {false}, abstract = {Graph convolutional networks(GCNs) have become the most popular approaches for graph data in these days because of their powerful ability to extract features from graph. GCNs approaches are divided into two categories, spectral-based and spatial-based. As the earliest convolutional networks for graph data, spectral-based GCNs have achieved impressive results in many graph related analytics tasks. However, spectral-based models cannot directly work on directed graphs. In this paper, we propose an improved spectral-based GCN for the directed graph by leveraging redefined Laplacians to improve its propagation model. Our approach can work directly on directed graph data in semi-supervised nodes classification tasks. Experiments on a number of directed graph datasets demonstrate that our approach outperforms the state-of-the-art methods.}, bibtype = {article}, author = {Ma, Yi and Hao, Jianye and Yang, Yaodong and Li, Han and Jin, Junqi and Chen, Guangyong} }
@article{ title = {Learning localized generative models for 3D point clouds via graph convolution}, type = {article}, year = {2019}, pages = {1-15}, id = {10c22f7a-14cb-36d9-a7f2-37fdffda7b69}, created = {2021-08-04T13:05:08.167Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T17:25:36.525Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,8315fdc0-e3a9-47f0-9186-21b3433d86d2,ad97b399-756a-4674-becd-d6455b989a39}, private_publication = {false}, abstract = {Point clouds are an important type of geometric data and have widespread use in computer graphics and vision. However, learning representations for point clouds is particularly challenging due to their nature as being an unordered collection of points irregularly distributed in 3D space. Graph convolution, a generalization of the convolution operation for data defined over graphs, has been recently shown to be very successful at extracting localized features from point clouds in supervised or semi-supervised tasks such as classification or segmentation. This paper studies the unsupervised problem of a generative model exploiting graph convolution. We focus on the generator of a GAN and define methods for graph convolution when the graph is not known in advance as it is the very output of the generator. The proposed architecture learns to generate localized features that approximate graph embeddings of the output geometry. We also study the problem of defining an upsampling layer in the graph-convolutional generator, such that it learns to exploit a self-similarity prior on the data distribution to sample more effectively.}, bibtype = {article}, author = {Valsesia, Diego and Fracastoro, Giulia and Magli, Enrico}, journal = {7th International Conference on Learning Representations, ICLR 2019} }
@article{ title = {Hierarchical depthwise graph convolutional neural network for 3D semantic segmentation of point clouds}, type = {article}, year = {2019}, pages = {8152-8158}, volume = {2019-May}, id = {e53cf6d7-8a62-3d8f-93f7-2ad3d79d57bd}, created = {2021-08-04T13:05:08.171Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:20.458Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {This paper proposes a hierarchical depthwise graph convolutional neural network (HDGCN) for point cloud semantic segmentation. The main chanllenge for learning on point clouds is to capture local structures or relationships. Graph convolution has the strong ability to extract local shape information from neighbors. Inspired by depthwise convolution, we propose a depthwise graph convolution which requires less memory consumption compared with the previous graph convolution. While depthwise graph convolution aggregates features channel-wisely, pointwise convolution is used to learn features across different channels. A customized block called DGConv is specially designed for local feature extraction based on depthwise graph convolution and pointwise convolution. The DGConv block can extract features from points and transfer features to neighbors while being invariant to different point orders. HDGCN is constructed by a series of DGConv blocks using a hierarchical structure which can extract both local and global features of point clouds. Experiments show that HDGCN achieves the state-of-the-art performance in the indoor dataset S3DIS and the outdoor dataset Paris-Lille-3D.}, bibtype = {article}, author = {Liang, Zhidong and Yang, Ming and Deng, Liuyuan and Wang, Chunxiang and Wang, Bing}, doi = {10.1109/ICRA.2019.8794052}, journal = {Proceedings - IEEE International Conference on Robotics and Automation} }
@article{ title = {How powerful are graph neural networks?}, type = {article}, year = {2019}, pages = {1-17}, id = {88358f50-7fe6-3130-9c2c-675b70ce4c9c}, created = {2021-08-20T07:55:13.973Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:19.962Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Graph Neural Networks (GNNs) are an effective framework for representation learning of graphs. GNNs follow a neighborhood aggregation scheme, where the representation vector of a node is computed by recursively aggregating and transforming representation vectors of its neighboring nodes. Many GNN variants have been proposed and have achieved state-of-the-art results on both node and graph classification tasks. However, despite GNNs revolutionizing graph representation learning, there is limited understanding of their representational properties and limitations. Here, we present a theoretical framework for analyzing the expressive power of GNNs to capture different graph structures. Our results characterize the discriminative power of popular GNN variants, such as Graph Convolutional Networks and GraphSAGE, and show that they cannot learn to distinguish certain simple graph structures. We then develop a simple architecture that is provably the most expressive among the class of GNNs and is as powerful as the Weisfeiler-Lehman graph isomorphism test. We empirically validate our theoretical findings on a number of graph classification benchmarks, and demonstrate that our model achieves state-of-the-art performance.}, bibtype = {article}, author = {Xu, Keyulu and Jegelka, Stefanie and Hu, Weihua and Leskovec, Jure}, journal = {7th International Conference on Learning Representations, ICLR 2019} }
@article{ title = {Strategies for Pre-training Graph Neural Networks}, type = {article}, year = {2019}, pages = {1-22}, websites = {http://arxiv.org/abs/1905.12265}, id = {84c526b3-dd3f-3956-bec2-b380e5680a6a}, created = {2021-08-20T10:21:49.218Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-20T10:21:54.979Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Many applications of machine learning require a model to make accurate pre-dictions on test examples that are distributionally different from training ones, while task-specific labels are scarce during training. An effective approach to this challenge is to pre-train a model on related tasks where data is abundant, and then fine-tune it on a downstream task of interest. While pre-training has been effective in many language and vision domains, it remains an open question how to effectively use pre-training on graph datasets. In this paper, we develop a new strategy and self-supervised methods for pre-training Graph Neural Networks (GNNs). The key to the success of our strategy is to pre-train an expressive GNN at the level of individual nodes as well as entire graphs so that the GNN can learn useful local and global representations simultaneously. We systematically study pre-training on multiple graph classification datasets. We find that naive strategies, which pre-train GNNs at the level of either entire graphs or individual nodes, give limited improvement and can even lead to negative transfer on many downstream tasks. In contrast, our strategy avoids negative transfer and improves generalization significantly across downstream tasks, leading up to 9.4% absolute improvements in ROC-AUC over non-pre-trained models and achieving state-of-the-art performance for molecular property prediction and protein function prediction.}, bibtype = {article}, author = {Hu, Weihua and Liu, Bowen and Gomes, Joseph and Zitnik, Marinka and Liang, Percy and Pande, Vijay and Leskovec, Jure} }
@article{ title = {Exploiting edge features for graph neural networks}, type = {article}, year = {2019}, keywords = {Deep Learning,Others}, pages = {9203-9211}, volume = {2019-June}, id = {aa8190c7-052c-3f99-9231-0966a5e29e40}, created = {2021-08-20T10:21:49.323Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-12T13:34:19.459Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Edge features contain important information about graphs. However, current state-of-the-art neural network models designed for graph learning, eg, graph convolutional networks (GCN) and graph attention networks (GAT), inadequately utilize edge features, especially multi-dimensional edge features. In this paper, we build a new framework for a family of new graph neural network models that can more sufficiently exploit edge features, including those of undirected or multi-dimensional edges. The proposed framework can consolidate current graph neural network models, e.g., GCN and GAT. The proposed framework and new models have the following novelties: First, we propose to use doubly stochastic normalization of graph edge features instead of the commonly used row or symmetric normalization approaches used in current graph neural networks. Second, we construct new formulas for the operations in each individual layer so that they can handle multi-dimensional edge features. Third, for the proposed new framework, edge features are adaptive across network layers. As a result, our proposed new framework and new models are able to exploit a rich source of graph edge information. We apply our new models to graph node classification on several citation networks, whole graph classification, and regression on several molecular datasets. Compared with the current state-of-the-art methods, i.e., GCNs and GAT, our models obtain better performance, which testify to the importance of exploiting edge features in graph neural networks.}, bibtype = {article}, author = {Gong, Liyu and Cheng, Qiang}, doi = {10.1109/CVPR.2019.00943}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Pay Less Attention with Lightweight and Dynamic Convolutions}, type = {article}, year = {2019}, websites = {https://arxiv.org/abs/1901.10430v1}, month = {1}, publisher = {International Conference on Learning Representations, ICLR}, day = {29}, id = {a4bf96df-3972-3780-baf4-0bce55c21b88}, created = {2021-08-24T07:15:01.585Z}, accessed = {2021-08-24}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-24T07:15:04.707Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {c509f25c-b687-4ab5-8859-72131b6658d3}, private_publication = {false}, abstract = {Self-attention is a useful mechanism to build generative models for language and images. It determines the importance of context elements by comparing each element to the current time step. In this paper, we show that a very lightweight convolution can perform competitively to the best reported self-attention results. Next, we introduce dynamic convolutions which are simpler and more efficient than self-attention. We predict separate convolution kernels based solely on the current time-step in order to determine the importance of context elements. The number of operations required by this approach scales linearly in the input length, whereas self-attention is quadratic. Experiments on large-scale machine translation, language modeling and abstractive summarization show that dynamic convolutions improve over strong self-attention models. On the WMT'14 English-German test set dynamic convolutions achieve a new state of the art of 29.7 BLEU.}, bibtype = {article}, author = {Wu, Felix and Fan, Angela and Baevski, Alexei and Dauphin, Yann N. and Auli, Michael}, journal = {7th International Conference on Learning Representations, ICLR 2019} }
@article{ title = {Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context}, type = {article}, year = {2019}, pages = {2978-2988}, websites = {https://arxiv.org/abs/1901.02860v3}, month = {1}, publisher = {Association for Computational Linguistics (ACL)}, day = {9}, id = {472812df-b1d0-39ea-ab85-0550ea0cefe0}, created = {2021-08-24T07:15:44.709Z}, accessed = {2021-08-24}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-24T07:15:48.618Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {c509f25c-b687-4ab5-8859-72131b6658d3}, private_publication = {false}, abstract = {Transformers have a potential of learning longer-term dependency, but are limited by a fixed-length context in the setting of language modeling. We propose a novel neural architecture Transformer-XL that enables learning dependency beyond a fixed length without disrupting temporal coherence. It consists of a segment-level recurrence mechanism and a novel positional encoding scheme. Our method not only enables capturing longer-term dependency, but also resolves the context fragmentation problem. As a result, Transformer-XL learns dependency that is 80% longer than RNNs and 450% longer than vanilla Transformers, achieves better performance on both short and long sequences, and is up to 1,800+ times faster than vanilla Transformers during evaluation. Notably, we improve the state-of-the-art results of bpc/perplexity to 0.99 on enwiki8, 1.08 on text8, 18.3 on WikiText-103, 21.8 on One Billion Word, and 54.5 on Penn Treebank (without finetuning). When trained only on WikiText-103, Transformer-XL manages to generate reasonably coherent, novel text articles with thousands of tokens. Our code, pretrained models, and hyperparameters are available in both Tensorflow and PyTorch.}, bibtype = {article}, author = {Dai, Zihang and Yang, Zhilin and Yang, Yiming and Carbonell, Jaime and Le, Quoc V. and Salakhutdinov, Ruslan}, journal = {ACL 2019 - 57th Annual Meeting of the Association for Computational Linguistics, Proceedings of the Conference} }
@article{ title = {XLNet: Generalized Autoregressive Pretraining for Language Understanding}, type = {article}, year = {2019}, volume = {32}, websites = {https://arxiv.org/abs/1906.08237v2}, month = {6}, publisher = {Neural information processing systems foundation}, day = {19}, id = {ed18f9c6-a45b-38ef-9dd5-2538d69b3220}, created = {2021-08-24T07:17:41.452Z}, accessed = {2021-08-24}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-24T07:17:44.148Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {c509f25c-b687-4ab5-8859-72131b6658d3}, private_publication = {false}, abstract = {With the capability of modeling bidirectional contexts, denoising autoencoding based pretraining like BERT achieves better performance than pretraining approaches based on autoregressive language modeling. However, relying on corrupting the input with masks, BERT neglects dependency between the masked positions and suffers from a pretrain-finetune discrepancy. In light of these pros and cons, we propose XLNet, a generalized autoregressive pretraining method that (1) enables learning bidirectional contexts by maximizing the expected likelihood over all permutations of the factorization order and (2) overcomes the limitations of BERT thanks to its autoregressive formulation. Furthermore, XLNet integrates ideas from Transformer-XL, the state-of-the-art autoregressive model, into pretraining. Empirically, under comparable experiment settings, XLNet outperforms BERT on 20 tasks, often by a large margin, including question answering, natural language inference, sentiment analysis, and document ranking.}, bibtype = {article}, author = {Yang, Zhilin and Dai, Zihang and Yang, Yiming and Carbonell, Jaime and Salakhutdinov, Ruslan and Le, Quoc V.}, journal = {Advances in Neural Information Processing Systems} }
@article{ title = {Spherical fractal convolutional neural networks for point cloud recognition}, type = {article}, year = {2019}, keywords = {3D from Multiview and Sensors,Categorization,Deep Learning,Recognition: Detection,Representation Lea,Retrieval}, pages = {452-460}, volume = {2019-June}, id = {ac12dd47-6201-347e-800b-142848c871d4}, created = {2021-08-29T22:27:22.077Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:22.937Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {We present a generic, flexible and 3D rotation invariant framework based on spherical symmetry for point cloud recognition. By introducing regular icosahedral lattice and its fractals to approximate and discretize sphere, convolution can be easily implemented to process 3D points. Based on the fractal structure, a hierarchical feature learning framework together with an adaptive sphere projection module is proposed to learn deep feature in an end-to-end manner. Our framework not only inherits the strong representation power and generalization capability from convolutional neural networks for image recognition, but also extends CNN to learn robust feature resistant to rotations and perturbations. The proposed model is effective yet robust. Comprehensive experimental study demonstrates that our approach can achieve competitive performance compared to state-of-the-art techniques on both 3D object classification and part segmentation tasks, meanwhile, outperform other rotation invariant models on rotated 3D object classification and retrieval tasks by a large margin.}, bibtype = {article}, author = {Rao, Yongming and Lu, Jiwen and Zhou, Jie}, doi = {10.1109/CVPR.2019.00054}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Dynamic Graph CNN for Learning on Point Clouds}, type = {article}, year = {2019}, keywords = {Classification,Point cloud,Segmentation}, pages = {Article 146}, volume = {38}, id = {3427af29-b336-3e97-9ca1-d01b134e8e5b}, created = {2021-08-30T07:09:34.460Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:19.813Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a6fefa10-ad39-4ee5-850c-dcbd4fed6307}, private_publication = {false}, abstract = {Point clouds provide a flexible geometric representation suitable for countless applications in computer graphics; they also comprise the raw output of most 3D data acquisition devices. While hand-designed features on point clouds have long been proposed in graphics and vision, however, the recent overwhelming success of convolutional neural networks (CNNs) for image analysis suggests the value of adapting insight from CNN to the point cloud world. Point clouds inherently lack topological information, so designing a model to recover topology can enrich the representation power of point clouds. To this end, we propose a new neural network module dubbed EdgeConv suitable for CNN-based high-level tasks on point clouds, including classification and segmentation. EdgeConv acts on graphs dynamically computed in each layer of the network. It is differentiable and can be plugged into existing architectures. Compared to existing modules operating in extrinsic space or treating each point independently, EdgeConv has several appealing properties: It incorporates local neighborhood information; it can be stacked applied to learn global shape properties; and in multi-layer systems affinity in feature space captures semantic characteristics over potentially long distances in the original embedding. We show the performance of our model on standard benchmarks, including ModelNet40, ShapeNetPart, and S3DIS.}, bibtype = {article}, author = {Wang, Yue and Sun, Yongbin and Liu, Ziwei and Sarma, Sanjay E. and Bronstein, Michael M. and Solomon, Justin M}, journal = {ACM Transactions on Graphics}, number = {5} }
@article{ title = {An Introduction to Variational Autoencoders}, type = {article}, year = {2019}, id = {9afdb9ca-c587-36a7-8b22-7d28ab2049ed}, created = {2021-08-30T18:48:39.023Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-07T06:10:54.790Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Welling2019}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,8efc2fe0-ed07-4348-a865-9f1a22b45934}, private_publication = {false}, bibtype = {article}, author = {Kingma, Diederik P. and Welling, Max}, journal = {Foundations and Trends in Machine Learning}, doi = {10.1561/2200000056} }
@article{ title = {DeepGCNs: Can GCNs go as deep as CNNs?}, type = {article}, year = {2019}, pages = {9266-9275}, volume = {2019-Octob}, id = {98ec6080-2231-3862-a7a8-1f4b7cde2672}, created = {2021-09-01T07:41:55.428Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-12-09T14:38:39.293Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {dbd9a6d6-88f6-4a62-9acd-402fb473145a,a6fefa10-ad39-4ee5-850c-dcbd4fed6307,13f2c27e-5827-43b2-8a2b-d62c62bc0ecc}, private_publication = {false}, abstract = {Convolutional Neural Networks (CNNs) achieve impressive performance in a wide variety of fields. Their success benefited from a massive boost when very deep CNN models were able to be reliably trained. Despite their merits, CNNs fail to properly address problems with non-Euclidean data. To overcome this challenge, Graph Convolutional Networks (GCNs) build graphs to represent non-Euclidean data, borrow concepts from CNNs, and apply them in training. GCNs show promising results, but they are usually limited to very shallow models due to the vanishing gradient problem. As a result, most state-of-the-art GCN models are no deeper than 3 or 4 layers. In this work, we present new ways to successfully train very deep GCNs. We do this by borrowing concepts from CNNs, specifically residual/dense connections and dilated convolutions, and adapting them to GCN architectures. Extensive experiments show the positive effect of these deep GCN frameworks. Finally, we use these new concepts to build a very deep 56-layer GCN, and show how it significantly boosts performance (+3.7% mIoU over state-of-the-art) in the task of point cloud semantic segmentation. We believe that the community can greatly benefit from this work, as it opens up many opportunities for advancing GCN-based research.}, bibtype = {article}, author = {Li, Guohao and Muller, Matthias and Thabet, Ali and Ghanem, Bernard}, doi = {10.1109/ICCV.2019.00936}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {The Evolved Transformer}, type = {article}, year = {2019}, pages = {10315-10328}, volume = {2019-June}, websites = {https://arxiv.org/abs/1901.11117v4}, month = {1}, publisher = {International Machine Learning Society (IMLS)}, day = {30}, id = {9f0f1caf-b8f7-3cf1-b14e-6f71ebf080b7}, created = {2021-09-01T08:08:58.587Z}, accessed = {2021-09-01}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-01T08:09:01.177Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {Recent works have highlighted the strength of the Transformer architecture on sequence tasks while, at the same time, neural architecture search (NAS) has begun to outperform human-designed models. Our goal is to apply NAS to search for a better alternative to the Transformer. We first construct a large search space inspired by the recent advances in feed-forward sequence models and then run evolutionary architecture search with warm starting by seeding our initial population with the Transformer. To directly search on the computationally expensive WMT 2014 English-German translation task, we develop the Progressive Dynamic Hurdles method, which allows us to dynamically allocate more resources to more promising candidate models. The architecture found in our experiments -- the Evolved Transformer -- demonstrates consistent improvement over the Transformer on four well-established language tasks: WMT 2014 English-German, WMT 2014 English-French, WMT 2014 English-Czech and LM1B. At a big model size, the Evolved Transformer establishes a new state-of-the-art BLEU score of 29.8 on WMT'14 English-German; at smaller sizes, it achieves the same quality as the original "big" Transformer with 37.6% less parameters and outperforms the Transformer by 0.7 BLEU at a mobile-friendly model size of 7M parameters.}, bibtype = {article}, author = {So, David R. and Liang, Chen and Le, Quoc V.}, journal = {36th International Conference on Machine Learning, ICML 2019} }
@article{ title = {Hypergraph neural networks}, type = {article}, year = {2019}, pages = {3558-3565}, id = {a942c4ca-9be3-3041-9787-ad006e039718}, created = {2021-09-02T06:33:40.755Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:23.108Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a6fefa10-ad39-4ee5-850c-dcbd4fed6307}, private_publication = {false}, abstract = {In this paper, we present a hypergraph neural network (HGNN) framework for data representation learning, which can encode high-order data correlation in a hypergraph structure. Confronting the challenges of learning representation for complex data in real practice, we propose to incorporate such data structure in a hypergraph, which is more flexible on data modeling, especially when dealing with complex data. In this method, a hyperedge convolution operation is designed to handle the data correlation during representation learning. In this way, the traditional hypergraph learning procedure can be conducted using hyperedge convolution operations efficiently. HGNN is able to learn the hidden layer representation considering the high-order data structure, which is a general framework considering the complex data correlations. We have conducted experiments on citation network classification and visual object recognition tasks and compared HGNN with graph convolutional networks and other traditional methods. Experimental results demonstrate that the proposed HGNN method outperforms recent state-of-the-art methods. We can also reveal from the results that the proposed HGNN is superior when dealing with multi-modal data compared with existing methods.}, bibtype = {article}, author = {Feng, Yifan and You, Haoxuan and Zhang, Zizhao and Ji, Rongrong and Gao, Yue}, doi = {10.1609/aaai.v33i01.33013558}, journal = {33rd AAAI Conference on Artificial Intelligence, AAAI 2019, 31st Innovative Applications of Artificial Intelligence Conference, IAAI 2019 and the 9th AAAI Symposium on Educational Advances in Artificial Intelligence, EAAI 2019} }
@article{ title = {Unsupervised multi-task feature learning on point clouds}, type = {article}, year = {2019}, pages = {8159-8170}, volume = {2019-Octob}, id = {212348d5-64b7-3316-a2a6-a964b244e80a}, created = {2021-09-02T06:33:40.868Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:24.217Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {We introduce an unsupervised multi-task model to jointly learn point and shape features on point clouds. We define three unsupervised tasks including clustering, reconstruction, and self-supervised classification to train a multi-scale graph-based encoder. We evaluate our model on shape classification and segmentation benchmarks. The results suggest that it outperforms prior state-of-the-art unsupervised models: In the ModelNet40 classification task, it achieves an accuracy of 89.1% and in ShapeNet segmentation task, it achieves an mIoU of 68.2 and accuracy of 88.6%.}, bibtype = {article}, author = {Hassani, Kaveh and Haley, Mike}, doi = {10.1109/ICCV.2019.00825}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {Clusternet: Deep hierarchical cluster network with rigorously rotation-invariant representation for point cloud analysis}, type = {article}, year = {2019}, keywords = {Categorization,Grouping and Shape,Recognition: Detection,Retrieval,Segmentation}, pages = {4989-4997}, volume = {2019-June}, id = {9488e0f0-9d4e-3cdb-8a03-3dd2d9b7d396}, created = {2021-09-02T06:33:40.871Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-07-28T12:39:25.920Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {353ce2e2-5e70-48e5-951f-78dc31fa40d2}, private_publication = {false}, abstract = {Current neural networks for 3D object recognition are vulnerable to 3D rotation. Existing works mostly rely on massive amounts of rotation-augmented data to alleviate the problem, which lacks solid guarantee of the 3D rotation invariance. In this paper, we address the issue by introducing a novel point cloud representation that can be mathematically proved rigorously rotation-invariant, i.e., identical point clouds in different orientations are unified as a unique and consistent representation. Moreover, the proposed representation is conditional information-lossless, because it retains all necessary information of point cloud except for orientation information. In addition, the proposed representation is complementary with existing network architectures for point cloud and fundamentally improves their robustness against rotation transformation. Finally, we propose a deep hierarchical cluster network called ClusterNet to better adapt to the proposed representation. We employ hierarchical clustering to explore and exploit the geometric structure of point cloud, which is embedded in a hierarchical structure tree. Extensive experimental results have shown that our proposed method greatly outperforms the state-of-the-arts in rotation robustness on rotation-augmented 3D object classification benchmarks.}, bibtype = {article}, author = {Chen, Chao and Li, Guanbin and Xu, Ruijia and Chen, Tianshui and Wang, Meng and Lin, Liang}, doi = {10.1109/CVPR.2019.00513}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition}, number = {61602533} }
@article{ title = {Linked Dynamic Graph CNN: Learning on Point Cloud via Linking Hierarchical Features}, type = {article}, year = {2019}, pages = {1-8}, websites = {http://arxiv.org/abs/1904.10014}, id = {18add943-40d1-3d93-b6cc-e8391d2e447c}, created = {2021-09-02T06:33:40.874Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:22.078Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Learning on point cloud is eagerly in demand because the point cloud is a common type of geometric data and can aid robots to understand environments robustly. However, the point cloud is sparse, unstructured, and unordered, which cannot be recognized accurately by a traditional convolutional neural network (CNN) nor a recurrent neural network (RNN). Fortunately, a graph convolutional neural network (Graph CNN) can process sparse and unordered data. Hence, we propose a linked dynamic graph CNN (LDGCNN) to classify and segment point cloud directly in this paper. We remove the transformation network, link hierarchical features from dynamic graphs, freeze feature extractor, and retrain the classifier to increase the performance of LDGCNN. We explain our network using theoretical analysis and visualization. Through experiments, we show that the proposed LDGCNN achieves state-of-art performance on two standard datasets: ModelNet40 and ShapeNet.}, bibtype = {article}, author = {Zhang, Kuangen and Hao, Ming and Wang, Jing and de Silva, Clarence W. and Fu, Chenglong} }
@article{ title = {Dynamic points agglomeration for hierarchical point sets learning}, type = {article}, year = {2019}, pages = {7545-7554}, volume = {2019-Octob}, id = {d2dc7c98-df4d-32d6-bebe-0d4309ac1cf9}, created = {2021-09-02T06:33:40.982Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:21.144Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {07e07de9-bcac-4934-a82b-d0aff540e56d}, private_publication = {false}, abstract = {Many previous works on point sets learning achieve excellent performance with hierarchical architecture. Their strategies towards points agglomeration, however, only perform points sampling and grouping in original Euclidean space in a fixed way. These heuristic and task-irrelevant strategies severely limit their ability to adapt to more varied scenarios. To this end, we develop a novel hierarchical point sets learning architecture, with dynamic points agglomeration. By exploiting the relation of points in semantic space, a module based on graph convolution network is designed to learn a soft points cluster agglomeration. We construct a hierarchical architecture that gradually agglomerates points by stacking this learnable and lightweight module. In contrast to fixed points agglomeration strategy, our method can handle more diverse situations robustly and efficiently. Moreover, we propose a parameter sharing scheme for reducing memory usage and computational burden induced by the agglomeration module. Extensive experimental results on several point cloud analytic tasks, including classification and segmentation, well demonstrate the superior performance of our dynamic hierarchical learning framework over current state-of-the-art methods.}, bibtype = {article}, author = {Liu, Jinxian and Ni, Bingbing and Li, Caiyuan and Yang, Jiancheng and Tian, Qi}, doi = {10.1109/ICCV.2019.00764}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {Generating Long Sequences with Sparse Transformers}, type = {article}, year = {2019}, websites = {https://arxiv.org/abs/1904.10509v1}, month = {4}, day = {23}, id = {454dd0b7-73d2-3c25-8e02-db7e7e07de48}, created = {2021-09-03T07:12:51.573Z}, accessed = {2021-09-03}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-03T07:12:55.333Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {Transformers are powerful sequence models, but require time and memory that grows quadratically with the sequence length. In this paper we introduce sparse factorizations of the attention matrix which reduce this to $O(n \sqrt{n})$. We also introduce a) a variation on architecture and initialization to train deeper networks, b) the recomputation of attention matrices to save memory, and c) fast attention kernels for training. We call networks with these changes Sparse Transformers, and show they can model sequences tens of thousands of timesteps long using hundreds of layers. We use the same architecture to model images, audio, and text from raw bytes, setting a new state of the art for density modeling of Enwik8, CIFAR-10, and ImageNet-64. We generate unconditional samples that demonstrate global coherence and great diversity, and show it is possible in principle to use self-attention to model sequences of length one million or more.}, bibtype = {article}, author = {Child, Rewon and Gray, Scott and Radford, Alec and Sutskever, Ilya} }
@article{ title = {On the Relationship between Self-Attention and Convolutional Layers}, type = {article}, year = {2019}, websites = {https://arxiv.org/abs/1911.03584v2}, month = {11}, day = {8}, id = {bcc7dd5f-36a3-3457-91d2-7045ed31e25b}, created = {2021-09-06T08:38:40.551Z}, accessed = {2021-09-06}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-06T08:38:43.064Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {Recent trends of incorporating attention mechanisms in vision have led researchers to reconsider the supremacy of convolutional layers as a primary building block. Beyond helping CNNs to handle long-range dependencies, Ramachandran et al. (2019) showed that attention can completely replace convolution and achieve state-of-the-art performance on vision tasks. This raises the question: do learned attention layers operate similarly to convolutional layers? This work provides evidence that attention layers can perform convolution and, indeed, they often learn to do so in practice. Specifically, we prove that a multi-head self-attention layer with sufficient number of heads is at least as expressive as any convolutional layer. Our numerical experiments then show that self-attention layers attend to pixel-grid patterns similarly to CNN layers, corroborating our analysis. Our code is publicly available.}, bibtype = {article}, author = {Cordonnier, Jean-Baptiste and Loukas, Andreas and Jaggi, Martin} }
@article{ title = {Stand-Alone Self-Attention in Vision Models}, type = {article}, year = {2019}, volume = {32}, websites = {https://arxiv.org/abs/1906.05909v1}, month = {6}, publisher = {Neural information processing systems foundation}, day = {13}, id = {21cef052-1891-3705-89c8-87b950dfb12b}, created = {2021-09-06T08:39:26.035Z}, accessed = {2021-09-06}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-06T08:39:28.315Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {Convolutions are a fundamental building block of modern computer vision systems. Recent approaches have argued for going beyond convolutions in order to capture long-range dependencies. These efforts focus on augmenting convolutional models with content-based interactions, such as self-attention and non-local means, to achieve gains on a number of vision tasks. The natural question that arises is whether attention can be a stand-alone primitive for vision models instead of serving as just an augmentation on top of convolutions. In developing and testing a pure self-attention vision model, we verify that self-attention can indeed be an effective stand-alone layer. A simple procedure of replacing all instances of spatial convolutions with a form of self-attention applied to ResNet model produces a fully self-attentional model that outperforms the baseline on ImageNet classification with 12% fewer FLOPS and 29% fewer parameters. On COCO object detection, a pure self-attention model matches the mAP of a baseline RetinaNet while having 39% fewer FLOPS and 34% fewer parameters. Detailed ablation studies demonstrate that self-attention is especially impactful when used in later layers. These results establish that stand-alone self-attention is an important addition to the vision practitioner's toolbox.}, bibtype = {article}, author = {Ramachandran, Prajit and Parmar, Niki and Vaswani, Ashish and Bello, Irwan and Levskaya, Anselm and Shlens, Jonathon}, journal = {Advances in Neural Information Processing Systems} }
@article{ title = {3D high-resolution cardiac segmentation reconstruction from 2D views using conditional variational autoencoders}, type = {article}, year = {2019}, keywords = {3d segmentation reconstruction,Cardiac mr,Deep learning,Variational autoencoder}, pages = {1643-1646}, volume = {2019-April}, publisher = {IEEE}, id = {db650296-0d94-3f09-881e-6cfb488a50fb}, created = {2021-09-09T14:35:21.245Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.894Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Biffi2019}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {Accurate segmentation of heart structures imaged by cardiac MR is key for the quantitative analysis of pathology. High-resolution 3D MR sequences enable whole-heart structural imaging but are time-consuming, expensive to acquire and they often require long breath holds that are not suitable for patients. Consequently, multiplanar breath-hold 2D cine sequences are standard practice but are disadvantaged by lack of whole-heart coverage and low through-plane resolution. To address this, we propose a conditional variational autoencoder architecture able to learn a generative model of 3D high-resolution left ventricular (LV) segmentations which is conditioned on three 2D LV segmentations of one short-axis and two long-axis images. By only employing these three 2D segmentations, our model can efficiently reconstruct the 3D high-resolution LV segmentation of a subject. When evaluated on 400 unseen healthy volunteers, our model yielded an average Dice score of $87.92 \pm 0.15$ and outperformed competing architectures (TL-net, Dice score $= 82.60 \pm 0.23$, $p = 2.2 \cdot 10^{-16}$).}, bibtype = {article}, author = {Biffi, Carlo and Cerrolaza, Juan J. and Tarroni, Giacomo and De Marvao, Antonio and Cook, Stuart A. and O'Regan, Declan P. and Rueckert, Daniel}, doi = {10.1109/ISBI.2019.8759328}, journal = {Proceedings - International Symposium on Biomedical Imaging}, number = {Isbi} }
@article{ title = {VV-net: Voxel VAE net with group convolutions for point cloud segmentation}, type = {article}, year = {2019}, pages = {8499-8507}, volume = {2019-Octob}, id = {dfed8f0c-9c57-35cb-9f79-77863f619082}, created = {2021-09-09T14:35:21.352Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.869Z}, read = {true}, starred = {true}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Meng2019}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,8efc2fe0-ed07-4348-a865-9f1a22b45934,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {We present a novel algorithm for point cloud segmentation. Our approach transforms unstructured point clouds into regular voxel grids, and further uses a kernel-based interpolated variational autoencoder (VAE) architecture to encode the local geometry within each voxel. Traditionally, the voxel representation only comprises Boolean occupancy information, which fails to capture the sparsely distributed points within voxels in a compact manner. In order to handle sparse distributions of points, we further employ radial basis functions (RBF) to compute a local, continuous representation within each voxel. Our approach results in a good volumetric representation that effectively tackles noisy point cloud datasets and is more robust for learning. Moreover, we further introduce group equivariant CNN to 3D, by defining the convolution operator on a symmetry group acting on $\mathbb{Z}^3$ and its isomorphic sets. This improves the expressive capacity without increasing parameters, leading to more robust segmentation results. We highlight the performance on standard benchmarks and show that our approach outperforms state-of-the-art segmentation algorithms on the ShapeNet and S3DIS datasets.}, bibtype = {article}, author = {Meng, Hsien Yu and Gao, Lin and Lai, Yu Kun and Manocha, Dinesh}, doi = {10.1109/ICCV.2019.00859}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {Multi-angle point cloud-vae: Unsupervised feature learning for 3D point clouds from multiple angles by joint self-reconstruction and half-to-half prediction}, type = {article}, year = {2019}, pages = {10441-10450}, volume = {2019-Octob}, id = {3f4248ee-3739-3bd1-8ad9-2ba731f02f12}, created = {2021-09-09T14:35:21.376Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.943Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Han2019}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a,1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {Unsupervised feature learning for point clouds has been vital for large-scale point cloud understanding. Recent deep learning based methods depend on learning global geometry from self-reconstruction. However, these methods are still suffering from ineffective learning of local geometry, which significantly limits the discriminability of learned features. To resolve this issue, we propose MAP-VAE to enable the learning of global and local geometry by jointly leveraging global and local self-supervision. To enable effective local self-supervision, we introduce multi-angle analysis for point clouds. In a multi-angle scenario, we first split a point cloud into a front half and a back half from each angle, and then, train MAP-VAE to learn to predict a back half sequence from the corresponding front half sequence. MAP-VAE performs this half-to-half prediction using RNN to simultaneously learn each local geometry and the spatial relationship among them. In addition, MAP-VAE also learns global geometry via self-reconstruction, where we employ a variational constraint to facilitate novel shape generation. The outperforming results in four shape analysis tasks show that MAP-VAE can learn more discriminative global or local features than the state-of-the-art methods.}, bibtype = {article}, author = {Han, Zhizhong and Wang, Xiyang and Liu, Yu Shen and Zwicker, Matthias}, doi = {10.1109/ICCV.2019.01054}, journal = {Proceedings of the IEEE International Conference on Computer Vision}, number = {Iccv} }
@article{ title = {Generating diverse high-fidelity images with VQ-VAE-2}, type = {article}, year = {2019}, pages = {1-11}, volume = {32}, id = {3673411c-710c-30ef-9eb6-5470d9890c8e}, created = {2021-09-14T08:42:48.722Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.691Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Razavi2019}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, abstract = {We explore the use of Vector Quantized Variational AutoEncoder (VQ-VAE) models for large scale image generation. To this end, we scale and enhance the autoregressive priors used in VQ-VAE to generate synthetic samples of much higher coherence and fidelity than possible before. We use simple feed-forward encoder and decoder networks, making our model an attractive candidate for applications where the encoding and/or decoding speed is critical. Additionally, VQ-VAE requires sampling an autoregressive model only in the compressed latent space, which is an order of magnitude faster than sampling in the pixel space, especially for large images. We demonstrate that a multi-scale hierarchical organization of VQ-VAE, augmented with powerful priors over the latent codes, is able to generate samples with quality that rivals that of state of the art Generative Adversarial Networks on multifaceted datasets such as ImageNet, while not suffering from GAN's known shortcomings such as mode collapse and lack of diversity.}, bibtype = {article}, author = {Razavi, Ali and van den Oord, Aäron and Vinyals, Oriol}, journal = {Advances in Neural Information Processing Systems}, number = {NeurIPS 2019} }
@article{ title = {Spatial Transformer for 3D Point Clouds}, type = {article}, year = {2019}, keywords = {3D detection,Convolution,Feature extraction,Measurement,Semantics,Shape,Task analysis,Three-dimensional displays,deformable,point cloud,segmentation,transformation}, websites = {https://arxiv.org/abs/1906.10887v4}, month = {6}, publisher = {IEEE Computer Society}, day = {26}, id = {87b47b02-ebb4-3785-aca5-6a6df1d6adc1}, created = {2021-09-15T11:24:33.381Z}, accessed = {2021-09-15}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-12T08:29:57.242Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4,11276190-b8fe-4c3a-a42f-f604438ad4db}, private_publication = {false}, abstract = {Deep neural networks are widely used for understanding 3D point clouds. At each point convolution layer, features are computed from local neighborhoods of 3D points and combined for subsequent processing in order to extract semantic information. Existing methods adopt the same individual point neighborhoods throughout the network layers, defined by the same metric on the fixed input point coordinates. This common practice is easy to implement but not necessarily optimal. Ideally, local neighborhoods should be different at different layers, as more latent information is extracted at deeper layers. We propose a novel end-to-end approach to learn different non-rigid transformations of the input point cloud so that optimal local neighborhoods can be adopted at each layer. We propose both linear (affine) and non-linear (projective and deformable) spatial transformers for 3D point clouds. With spatial transformers on the ShapeNet part segmentation dataset, the network achieves higher accuracy for all categories, with 8\% gain on earphones and rockets in particular. Our method also outperforms the state-of-the-art on other point cloud tasks such as classification, detection, and semantic segmentation. Visualizations show that spatial transformers can learn features more efficiently by dynamically altering local neighborhoods according to the geometry and semantics of 3D shapes in spite of their within-category variations. Our code is publicly available at https://github.com/samaonline/spatial-transformer-for-3d-point-clouds.}, bibtype = {article}, author = {Wang, Jiayun and Chakraborty, Rudrasis and Yu, Stella X.}, doi = {10.1109/tpami.2021.3070341}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence} }
@article{ title = {Deep closest point: Learning representations for point cloud registration}, type = {article}, year = {2019}, pages = {3522-3531}, volume = {2019-Octob}, id = {3a54d08e-17d3-3cb9-912d-d7f00c3b42eb}, created = {2021-09-16T07:10:13.949Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-28T07:20:31.412Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48e9a80d-67a5-450d-9b08-c7bc934154e8}, private_publication = {false}, abstract = {Point cloud registration is a key problem for computer vision applied to robotics, medical imaging, and other applications. This problem involves finding a rigid transformation from one point cloud into another so that they align. Iterative Closest Point (ICP) and its variants provide simple and easily-implemented iterative methods for this task, but these algorithms can converge to spurious local optima. To address local optima and other difficulties in the ICP pipeline, we propose a learning-based method, titled Deep Closest Point (DCP), inspired by recent techniques in computer vision and natural language processing. Our model consists of three parts: A point cloud embedding network, an attention-based module combined with a pointer generation layer to approximate combinatorial matching, and a differentiable singular value decomposition (SVD) layer to extract the final rigid transformation. We train our model end-to-end on the ModelNet40 dataset and show in several settings that it performs better than ICP, its variants (e.g., Go-ICP, FGR), and the recently-proposed learning-based method PointNetLK. Beyond providing a state-of-the-art registration technique, we evaluate the suitability of our learned features transferred to unseen objects. We also provide preliminary analysis of our learned model to help understand whether domain-specific and/or global features facilitate rigid registration.}, bibtype = {article}, author = {Wang, Yue and Solomon, Justin}, doi = {10.1109/ICCV.2019.00362}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {Graph attention convolution for point cloud semantic segmentation}, type = {article}, year = {2019}, keywords = {3D from Multiview and Sensors,Deep Learning,Grouping and Shape,Scene Analysis and Understanding,Segmentation}, pages = {10288-10297}, volume = {2019-June}, id = {c0f13d2b-5c5e-37b1-8fb3-965fc0d313a6}, created = {2021-09-16T07:10:14.124Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-21T05:20:39.274Z}, read = {true}, starred = {true}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48e9a80d-67a5-450d-9b08-c7bc934154e8,f8d4d36f-8136-4a85-8d1a-ceaffb92ddf1}, private_publication = {false}, abstract = {Standard convolution is inherently limited for semantic segmentation of point cloud due to its isotropy about features. It neglects the structure of an object, results in poor object delineation and small spurious regions in the segmentation result. This paper proposes a novel graph attention convolution (GAC), whose kernels can be dynamically carved into specific shapes to adapt to the structure of an object. Specifically, by assigning proper attentional weights to different neighboring points, GAC is designed to selectively focus on the most relevant part of them according to their dynamically learned features. The shape of the convolution kernel is then determined by the learned distribution of the attentional weights. Though simple, GAC can capture the structured features of point clouds for fine-grained segmentation and avoid feature contamination between objects. Theoretically, we provided a thorough analysis on the expressive capabilities of GAC to show how it can learn about the features of point clouds. Empirically, we evaluated the proposed GAC on challenging indoor and outdoor datasets and achieved the state-of-the-art results in both scenarios.}, bibtype = {article}, author = {Wang, Lei and Huang, Yuchun and Hou, Yaolin and Zhang, Shenman and Shan, Jie}, doi = {10.1109/CVPR.2019.01054}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Improving Graph Attention Networks with Large Margin-based Constraints}, type = {article}, year = {2019}, websites = {http://arxiv.org/abs/1910.11945}, id = {fbe1412c-aa8f-32f6-8af8-037295a6611d}, created = {2021-09-17T05:16:59.816Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-17T10:42:57.292Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48e9a80d-67a5-450d-9b08-c7bc934154e8}, private_publication = {false}, abstract = {Graph Attention Networks (GATs) are the state-of-the-art neural architecture for representation learning with graphs. GATs learn attention functions that assign weights to nodes so that different nodes have different influences in the feature aggregation steps. In practice, however, induced attention functions are prone to over-fitting due to the increasing number of parameters and the lack of direct supervision on attention weights. GATs also suffer from over-smoothing at the decision boundary of nodes. Here we propose a framework to address their weaknesses via margin-based constraints on attention during training. We first theoretically demonstrate the over-smoothing behavior of GATs and then develop an approach using constraint on the attention weights according to the class boundary and feature aggregation pattern. Furthermore, to alleviate the over-fitting problem, we propose additional constraints on the graph structure. Extensive experiments and ablation studies on common benchmark datasets demonstrate the effectiveness of our method, which leads to significant improvements over the previous state-of-the-art graph attention methods on all datasets.}, bibtype = {article}, author = {Wang, Guangtao and Ying, Rex and Huang, Jing and Leskovec, Jure} }
@article{ title = {Aligning latent spaces for 3D hand pose estimation}, type = {article}, year = {2019}, pages = {2335-2343}, volume = {2019-Octob}, id = {53b3616e-0b1d-3564-8df3-da39dc7989fb}, created = {2021-09-29T10:16:08.709Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.001Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Yang2019}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,a6db5ca6-7f95-48a4-bc40-9e41eea78434,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {Hand pose estimation from monocular RGB inputs is a highly challenging task. Many previous works for monocular settings only used RGB information for training despite the availability of corresponding data in other modalities such as depth maps. In this work, we propose to learn a joint latent representation that leverages other modalities as weak labels to boost the RGB-based hand pose estimator. By design, our architecture is highly flexible in embedding various diverse modalities such as heat maps, depth maps and point clouds. In particular, we find that encoding and decoding the point cloud of the hand surface can improve the quality of the joint latent representation. Experiments show that with the aid of other modalities during training, our proposed method boosts the accuracy of RGB-based hand pose estimation systems and significantly outperforms state-of-the-art on two public benchmarks.}, bibtype = {article}, author = {Yang, Linlin and Li, Shile and Lee, Dongheui and Yao, Angela}, doi = {10.1109/ICCV.2019.00242}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {End-to-end CAD model retrieval and 9DoF alignment in 3D scans}, type = {article}, year = {2019}, pages = {2551-2560}, volume = {2019-Octob}, id = {50c30274-ffcf-36d3-a4b4-621088d7ad7e}, created = {2021-09-29T10:16:08.860Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-01T09:16:02.625Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Avetisyan2019}, folder_uuids = {a6db5ca6-7f95-48a4-bc40-9e41eea78434}, private_publication = {false}, abstract = {We present a novel, end-to-end approach to align CAD models to a 3D scan of a scene, enabling transformation of a noisy, incomplete 3D scan to a compact CAD reconstruction with clean, complete object geometry. Our main contribution lies in formulating a differentiable Procrustes alignment that is paired with a symmetry-aware dense object correspondence prediction. To simultaneously align CAD models to all the objects of a scanned scene, our approach detects object locations, then predicts symmetry-aware dense object correspondences between scan and CAD geometry in a unified object space, as well as a nearest neighbor CAD model, both of which are then used to inform a differentiable Procrustes alignment. Our approach operates in a fully-convolutional fashion, enabling alignment of CAD models to the objects of a scan in a single forward pass. This enables our method to outperform state-of-the-art approaches by 19.04% for CAD model alignment to scans, with approximately 250x faster runtime than previous data-driven approaches.}, bibtype = {article}, author = {Avetisyan, Armen and Dai, Angela and Niessner, Matthias}, doi = {10.1109/ICCV.2019.00264}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {SCAN2CAD: Learning cad model alignment in rgb-d scans}, type = {article}, year = {2019}, keywords = {Categorization,Recognition: Detection,Retrieval,Scene Analysis and Understanding,Vision + Graphics}, pages = {2609-2618}, volume = {2019-June}, id = {2f7fa2da-4f5f-3178-91b4-b88d307aba9a}, created = {2021-09-29T10:16:08.862Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-01T09:16:02.806Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Avetisyan2019a}, folder_uuids = {a6db5ca6-7f95-48a4-bc40-9e41eea78434}, private_publication = {false}, abstract = {We present Scan2CAD, a novel data-driven method that learns to align clean 3D CAD models from a shape database to the noisy and incomplete geometry of a commodity RGB-D scan. For a 3D reconstruction of an indoor scene, our method takes as input a set of CAD models, and predicts a 9DoF pose that aligns each model to the underlying scan geometry. To tackle this problem, we create a new scan-to-CAD alignment dataset based on 1506 ScanNet scans with 97607 annotated keypoint pairs between 14225 CAD models from ShapeNet and their counterpart objects in the scans. Our method selects a set of representative keypoints in a 3D scan for which we find correspondences to the CAD geometry. To this end, we design a novel 3D CNN architecture that learns a joint embedding between real and synthetic objects, and from this predicts a correspondence heatmap. Based on these correspondence heatmaps, we formulate a variational energy minimization that aligns a given set of CAD models to the reconstruction. We evaluate our approach on our newly introduced Scan2CAD benchmark where we outperform both handcrafted feature descriptor as well as state-of-the-art CNN based methods by 21.39%.}, bibtype = {article}, author = {Avetisyan, Armen and Dahnert, Manuel and Dai, Angela and Savva, Manolis and Chang, Angel X. and Niebner, Matthias}, doi = {10.1109/CVPR.2019.00272}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {PU-GCN: Point Cloud Upsampling using Graph Convolutional Networks}, type = {article}, year = {2019}, websites = {http://arxiv.org/abs/1912.03264}, id = {b1ebdaaf-6b1d-3d28-a97a-3422d8a9ea5f}, created = {2021-09-30T06:29:37.867Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-12-10T07:00:30.221Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {dbd9a6d6-88f6-4a62-9acd-402fb473145a,c3a38ded-ec49-4494-8518-35cbd444f0c8,b09853a1-601b-4dfc-9176-c3c7d469342b,13f2c27e-5827-43b2-8a2b-d62c62bc0ecc}, private_publication = {false}, abstract = {The effectiveness of learning-based point cloud upsampling pipelines heavily relies on the upsampling modules and feature extractors used therein. For the point upsampling module, we propose a novel model called NodeShuffle, which uses a Graph Convolutional Network (GCN) to better encode local point information from point neighborhoods. NodeShuffle is versatile and can be incorporated into any point cloud upsampling pipeline. Extensive experiments show how NodeShuffle consistently improves state-of-the-art upsampling methods. For feature extraction, we also propose a new multi-scale point feature extractor, called Inception DenseGCN. By aggregating features at multiple scales, this feature extractor enables further performance gain in the final upsampled point clouds. We combine Inception DenseGCN with NodeShuffle into a new point upsampling pipeline called PU-GCN. PU-GCN sets new state-of-art performance with much fewer parameters and more efficient inference.}, bibtype = {article}, author = {Qian, Guocheng and Abualshour, Abdulellah and Li, Guohao and Thabet, Ali and Ghanem, Bernard} }
@article{ title = {KPConv: Flexible and deformable convolution for point clouds}, type = {article}, year = {2019}, pages = {6410-6419}, volume = {2019-Octob}, id = {ff44a2e2-934f-35d6-8bc3-a5bb7ddce9fb}, created = {2021-10-12T07:10:34.223Z}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-14T06:49:26.172Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, notes = {KPConv applies local deformations in the neighborhood of the point convolution to enhance its learning capacity. Introduces a new point convolution operator named Kernel Point Convolution (KPConv). Uses radius neighborhoods instead of KNN.}, folder_uuids = {d54ba66b-a8cf-41de-8e2d-c3256f322e07}, private_publication = {false}, abstract = {We present Kernel Point Convolution (KPConv), a new design of point convolution, i.e. that operates on point clouds without any intermediate representation. The convolution weights of KPConv are located in Euclidean space by kernel points, and applied to the input points close to them. Its capacity to use any number of kernel points gives KPConv more flexibility than fixed grid convolutions. Furthermore, these locations are continuous in space and can be learned by the network. Therefore, KPConv can be extended to deformable convolutions that learn to adapt kernel points to local geometry. Thanks to a regular subsampling strategy, KPConv is also efficient and robust to varying densities. Whether they use deformable KPConv for complex tasks, or rigid KPConv for simpler tasks, our networks outperform state-of-the-art classification and segmentation approaches on several datasets. We also offer ablation studies and visualizations to provide understanding of what has been learned by KPConv and to validate the descriptive power of deformable KPConv.}, bibtype = {article}, author = {Thomas, Hugues and Qi, Charles R. and Deschaud, Jean Emmanuel and Marcotegui, Beatriz and Goulette, Francois and Guibas, Leonidas}, doi = {10.1109/ICCV.2019.00651}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {Do Image Classifiers Generalize Across Time?}, type = {article}, year = {2019}, websites = {http://arxiv.org/abs/1906.02168}, id = {9d4f6431-41cd-3c64-9215-963d7a976389}, created = {2021-10-13T14:40:11.856Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:21.086Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {We study the robustness of image classifiers to temporal perturbations derived from videos. As part of this study, we construct two datasets, ImageNet-Vid-Robust and YTBB-Robust , containing a total 57,897 images grouped into 3,139 sets of perceptually similar images. Our datasets were derived from ImageNet-Vid and Youtube-BB respectively and thoroughly re-annotated by human experts for image similarity. We evaluate a diverse array of classifiers pre-trained on ImageNet and show a median classification accuracy drop of 16 and 10 on our two datasets. Additionally, we evaluate three detection models and show that natural perturbations induce both classification as well as localization errors, leading to a median drop in detection mAP of 14 points. Our analysis demonstrates that perturbations occurring naturally in videos pose a substantial and realistic challenge to deploying convolutional neural networks in environments that require both reliable and low-latency predictions}, bibtype = {article}, author = {Shankar, Vaishaal and Dave, Achal and Roelofs, Rebecca and Ramanan, Deva and Recht, Benjamin and Schmidt, Ludwig} }
@article{ title = {Graph convolutional networks: a comprehensive review}, type = {article}, year = {2019}, keywords = {Aggregation mechanism,Deep learning,Graph convolutional networks,Graph representation learning,Spatial methods,Spectral methods}, volume = {6}, websites = {https://doi.org/10.1186/s40649-019-0069-y}, publisher = {Springer International Publishing}, id = {6c21d9d6-c646-3429-beb9-2cd4b05ac45a}, created = {2021-10-19T06:31:27.286Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-03T06:15:12.519Z}, read = {true}, starred = {true}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {dbd9a6d6-88f6-4a62-9acd-402fb473145a}, private_publication = {false}, abstract = {Graphs naturally appear in numerous application domains, ranging from social analysis, bioinformatics to computer vision. The unique capability of graphs enables capturing the structural relations among data, and thus allows to harvest more insights compared to analyzing data in isolation. However, it is often very challenging to solve the learning problems on graphs, because (1) many types of data are not originally structured as graphs, such as images and text data, and (2) for graph-structured data, the underlying connectivity patterns are often complex and diverse. On the other hand, the representation learning has achieved great successes in many areas. Thereby, a potential solution is to learn the representation of graphs in a low-dimensional Euclidean space, such that the graph properties can be preserved. Although tremendous efforts have been made to address the graph representation learning problem, many of them still suffer from their shallow learning mechanisms. Deep learning models on graphs (e.g., graph neural networks) have recently emerged in machine learning and other related areas, and demonstrated the superior performance in various problems. In this survey, despite numerous types of graph neural networks, we conduct a comprehensive review specifically on the emerging field of graph convolutional networks, which is one of the most prominent graph deep learning models. First, we group the existing graph convolutional network models into two categories based on the types of convolutions and highlight some graph convolutional network models in details. Then, we categorize different graph convolutional networks according to the areas of their applications. Finally, we present several open challenges in this area and discuss potential directions for future research.}, bibtype = {article}, author = {Zhang, Si and Tong, Hanghang and Xu, Jiejun and Maciejewski, Ross}, doi = {10.1186/s40649-019-0069-y}, journal = {Computational Social Networks}, number = {1} }
@article{ title = {Eigenvalue and Generalized Eigenvalue Problems: Tutorial}, type = {article}, year = {2019}, pages = {1-8}, websites = {http://arxiv.org/abs/1903.11240}, id = {1a658c91-038a-3af4-ba9a-2416e80ddc50}, created = {2021-11-14T20:02:09.472Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-14T20:02:19.525Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Ghojogh2019}, private_publication = {false}, abstract = {This paper is a tutorial for eigenvalue and generalized eigenvalue problems. We first introduce the eigenvalue problem, eigen-decomposition (spectral decomposition), and the generalized eigenvalue problem. Then, we discuss the optimization problems which lead to eigenvalue and generalized eigenvalue problems. We also provide examples from machine learning, including principal component analysis, kernel supervised principal component analysis, and Fisher discriminant analysis, which result in eigenvalue and generalized eigenvalue problems. Finally, we introduce the solutions to both eigenvalue and generalized eigenvalue problems.}, bibtype = {article}, author = {Ghojogh, Benyamin and Karray, Fakhri and Crowley, Mark}, number = {2} }
@article{ title = {On the Efficiency of a Point Cloud Autoencoder as a Geometric Representation for Shape Optimization}, type = {article}, year = {2019}, keywords = {evolutionary design optimization,free form deformation,geometric representation,point cloud autoencoder}, pages = {791-798}, id = {8694a246-0f8d-3e66-b3c4-19bf398902a9}, created = {2021-11-26T10:09:16.297Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.390Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Rios2019}, folder_uuids = {cd02f564-0123-4236-a320-b339927f085a,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {A crucial step for optimizing a system is to formulate the objective function, and part of it concerns the selection of the design parameters. One of the major goals is to achieve a fair trade-off between exploring feasible solutions in the design space and maintaining admissible computational effort. In order to achieve such balance in optimization problems with Computer Aided Engineering (CAE) models, the conventional constructive geometric representations are substituted by deformation methods, e.g. free form deformation, where the position of a few control points might be capable of handling large scale shape modifications. In light of the recent developments in the field of geometric deep learning, autoencoders have risen as a promising alternative for efficiently condensing high-dimensional models into compact representations. In this paper, we present a novel perspective on geometric deep learning models by exploring the applicability of the latent space of a point cloud autoencoder in shape optimization problems with evolutionary algorithms. Focusing on engineering applications, a target shape matching optimization is used as a surrogate to the computationally expensive CAE simulations required in engineering optimizations. Through the quality assessment of the solutions achieved in the optimization and further aspects, such as shape feasibility, point cloud autoencoders proved to be consistent and suitable geometric representations for such problems, adding a new perspective on the approaches for handling high-dimensional models in optimization tasks.}, bibtype = {article}, author = {Rios, Thiago and Sendhoff, Bernhard and Menzel, Stefan and Back, Thomas and Van Stein, Bas}, doi = {10.1109/SSCI44817.2019.9003161}, journal = {2019 IEEE Symposium Series on Computational Intelligence, SSCI 2019} }
@article{ title = {Scalability of Learning Tasks on 3D CAE Models Using Point Cloud Autoencoders}, type = {article}, year = {2019}, keywords = {computer aided engineering,deep learning,dimensionality reduction,geometry,sampling methods}, pages = {1367-1374}, id = {0f136852-8497-3c44-af64-fc1ebe5474ac}, created = {2021-11-26T10:09:16.303Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.534Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Rios2019a}, folder_uuids = {cd02f564-0123-4236-a320-b339927f085a,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {Geometric Deep Learning (GDL) methods have recently gained interest as powerful, high-dimensional models for approaching various geometry processing tasks. However, training deep neural network models on geometric input requires considerable computational effort. Even more so, if one considers typical problem sizes found in application domains such as engineering tasks, where geometric data are often orders of magnitude larger than the inputs currently considered in GDL literature. Hence, an assessment of the scalability of the training task is necessary, where model and data set parameters can be mapped to the computational demand during training. The present paper therefore studies the effects of data set size and the number of free model parameters on the computational effort of training a Point Cloud Autoencoder (PC-AE). We further review pre-processing techniques to obtain efficient representations of high-dimensional inputs to the PC-AE and investigate the effects of these techniques on the information abstracted by the trained model. We perform these experiments on synthetic geometric data inspired by engineering applications using computing hardware with particularly recent graphics processing units (GPUs) with high memory specifications. The present study thus provides a comprehensive evaluation of how to scale geometric deep learning architectures to high-dimensional inputs to allow for an application of state-of-the-art deep learning methods in real-world tasks.}, bibtype = {article}, author = {Rios, Thiago and Wollstadt, Patricia and Stein, Bas Van and Back, Thomas and Xu, Zhao and Sendhoff, Bernhard and Menzel, Stefan}, doi = {10.1109/SSCI44817.2019.9002982}, journal = {2019 IEEE Symposium Series on Computational Intelligence, SSCI 2019} }
@article{ title = {CompoNet: Learning to generate the unseen by part synthesis and composition}, type = {article}, year = {2019}, pages = {8758-8767}, volume = {2019-Octob}, id = {d422bb1b-005b-3d54-9335-7a6270ad65cb}, created = {2021-12-14T07:19:58.943Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-07-21T08:21:35.727Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Schor2019}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,a6db5ca6-7f95-48a4-bc40-9e41eea78434,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {Data-driven generative modeling has made remarkable progress by leveraging the power of deep neural networks. A reoccurring challenge is how to enable a model to generate a rich variety of samples from the entire target distribution, rather than only from a distribution confined to the training data. In other words, we would like the generative model to go beyond the observed samples and learn to generate ''unseen'', yet still plausible, data. In our work, we present CompoNet, a generative neural network for 2D or 3D shapes that is based on a part-based prior, where the key idea is for the network to synthesize shapes by varying both the shape parts and their compositions. Treating a shape not as an unstructured whole, but as a (re-)composable set of deformable parts, adds a combinatorial dimension to the generative process to enrich the diversity of the output, encouraging the generator to venture more into the ''unseen''. We show that our part-based model generates richer variety of plausible shapes compared with baseline generative models. To this end, we introduce two quantitative metrics to evaluate the diversity of a generative model and assess how well the generated data covers both the training data and unseen data from the same target distribution.}, bibtype = {article}, author = {Schor, Nadav and Katzir, Oren and Zhang, Hao and Cohen-Or, Daniel}, doi = {10.1109/ICCV.2019.00885}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {Attacking Graph Convolutional Networks via Rewiring}, type = {article}, year = {2019}, websites = {http://arxiv.org/abs/1906.03750}, id = {eabce7c9-d6e1-31cf-87b5-8320d9b80fdd}, created = {2022-01-05T09:23:15.557Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:25.572Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Graph Neural Networks (GNNs) have boosted the performance of many graph related tasks such as node classification and graph classification. Recent researches show that graph neural networks are vulnerable to adversarial attacks, which deliberately add carefully created unnoticeable perturbation to the graph structure. The perturbation is usually created by adding/deleting a few edges, which might be noticeable even when the number of edges modified is small. In this paper, we propose a graph rewiring operation which affects the graph in a less noticeable way compared to adding/deleting edges. We then use reinforcement learning to learn the attack strategy based on the proposed rewiring operation. Experiments on real world graphs demonstrate the effectiveness of the proposed framework. To understand the proposed framework, we further analyze how its generated perturbation to the graph structure affects the output of the target model.}, bibtype = {article}, author = {Ma, Yao and Wang, Suhang and Derr, Tyler and Wu, Lingfei and Tang, Jiliang} }
@article{ title = {Can Adversarial Network Attack be Defended?}, type = {article}, year = {2019}, pages = {1-10}, websites = {http://arxiv.org/abs/1903.05994}, id = {7b3b4fc4-5cd0-3bd3-a969-a1787fd4d5f4}, created = {2022-01-05T09:23:15.830Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:31.809Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Machine learning has been successfully applied to complex network analysis in various areas, and graph neural network (GNN) based methods outperform others. Recently, adversarial attacks on networks have attracted special attention, since carefully crafted adversarial networks with slight perturbations on a clean network may invalidate many network applications, such as node classification, link prediction, and community detection. Such attacks are easy to construct and pose a serious security threat to various analysis methods, including traditional methods and deep models. To the best of our knowledge, this is the first time that defense methods against network adversarial attacks have been discussed. In this paper, we are interested in the possibility of defense against adversarial attacks on networks, and propose defense strategies for GNNs against attacks. First, we propose novel adversarial training strategies to improve GNNs' defensibility against attacks. Then, we analytically investigate the robustness properties granted to GNNs by the use of smooth defense, and propose two special smooth defense strategies: smoothing distillation and a smoothing cross-entropy loss function. Both are capable of smoothing the gradients of GNNs and consequently reduce the amplitude of adversarial gradients, which helps mask gradients from attackers. Comprehensive experiments show that our proposed strategies provide strong defense against different adversarial attacks on four real-world networks in different network analysis tasks.}, bibtype = {article}, author = {Chen, Jinyin and Wu, Yangyang and Lin, Xiang and Xuan, Qi} }
@article{ title = {Robust graph convolutional networks against adversarial attacks}, type = {article}, year = {2019}, keywords = {Adversarial Attacks,Deep Learning,Graph Convolutional Networks,Robustness}, pages = {1399-1407}, id = {16b24d05-f403-3367-9ad4-5e29a01399f0}, created = {2022-01-05T09:23:15.908Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:23:33.088Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Graph Convolutional Networks (GCNs) are an emerging type of neural network model on graphs which have achieved state-of-the-art performance in the task of node classification. However, recent studies show that GCNs are vulnerable to adversarial attacks, i.e. small deliberate perturbations in graph structures and node attributes, which poses great challenges for applying GCNs to real-world applications. How to enhance the robustness of GCNs remains a critical open problem. To address this problem, we propose Robust GCN (RGCN), a novel model that “fortifies” GCNs against adversarial attacks. Specifically, instead of representing nodes as vectors, our method adopts Gaussian distributions as the hidden representations of nodes in each convolutional layer. In this way, when the graph is attacked, our model can automatically absorb the effects of adversarial changes in the variances of the Gaussian distributions. Moreover, to remedy the propagation of adversarial attacks in GCNs, we propose a variance-based attention mechanism, i.e. assigning different weights to node neighborhoods according to their variances when performing convolutions. Extensive experimental results demonstrate that our proposed method can effectively improve the robustness of GCNs. On three benchmark graphs, our RGCN consistently shows a substantial gain in node classification accuracy compared with state-of-the-art GCNs against various adversarial attack strategies.}, bibtype = {article}, author = {Zhu, Dingyuan and Cui, Peng and Zhang, Ziwei and Zhu, Wenwu}, doi = {10.1145/3292500.3330851}, journal = {Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery and Data Mining} }
@article{ title = {Comparing and Detecting Adversarial Attacks for Graph Deep Learning}, type = {article}, year = {2019}, pages = {1-7}, websites = {https://www.kdd.in.tum.de/research/nettack/}, id = {13170769-6565-3b07-9b36-33d48968e46f}, created = {2022-01-05T09:23:16.205Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:23:54.286Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Deep learning models have achieved state-of-the-art performance in classifying nodes in graph-structured data. However, recent work has shown that these models are vulnerable to adversarial attacks. In particular, it is possible to adversarially perturb the graph structure and the node features in order to induce classification errors. In this paper, we study the effect of recently proposed attacks on graph models which incorporate structure exploration. We then propose a method for detecting attacks when they occur.}, bibtype = {article}, author = {Zhang, Yingxue and Hossain Khan, Sakif and Coates, Mark}, journal = {ICLR 2019 Workshop: Representation Learning on Graphs and Manifolds} }
@article{ title = {Indirect Adversarial Attacks via Poisoning Neighbors for Graph Convolutional Networks}, type = {article}, year = {2019}, keywords = {adversarial attack,data poisoning,graph convolutional neural network,node classification}, pages = {1395-1400}, publisher = {IEEE}, id = {f8a7df04-4efe-37b1-8a32-a5267497dc1e}, created = {2022-01-05T09:23:16.327Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:23:56.507Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Graph convolutional neural networks, which learn aggregations over neighbor nodes, have achieved great performance in node classification tasks. However, recent studies reported that such graph convolutional node classifiers can be deceived by adversarial perturbations on graphs. By abusing graph convolutions, a node's classification result can be influenced by poisoning its neighbors. Given an attributed graph and a node classifier, how can we evaluate robustness against such indirect adversarial attacks? Can we generate strong adversarial perturbations which are effective not only on one-hop neighbors, but also on nodes farther from the target? In this paper, we demonstrate that the node classifier can be deceived with high confidence by poisoning just a single node located two hops or more away from the target. To achieve the attack, we propose a new approach which searches for smaller perturbations on just a single node far from the target. In our experiments, our proposed method shows a 99% attack success rate within two hops of the target on two datasets. We also demonstrate that m-layer graph convolutional neural networks can be deceived by our indirect attack within their m-hop neighborhoods. The proposed attack can be used as a benchmark in future attempts to develop graph convolutional neural networks with adversarial robustness.}, bibtype = {article}, author = {Takahashi, Tsubasa}, doi = {10.1109/BigData47090.2019.9006004}, journal = {Proceedings - 2019 IEEE International Conference on Big Data, Big Data 2019} }
@article{ title = {Adversarial examples for graph data: Deep insights into attack and defense}, type = {article}, year = {2019}, pages = {4816-4823}, volume = {2019-Augus}, id = {4e1ab7b8-d6ba-3199-9e67-dd24f3726ed0}, created = {2022-01-05T09:23:16.473Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:03.980Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Graph deep learning models, such as graph convolutional networks (GCN) achieve state-of-the-art performance for tasks on graph data. However, similar to other deep learning models, graph deep learning models are susceptible to adversarial attacks. However, compared with non-graph data the discrete nature of the graph connections and features provide unique challenges and opportunities for adversarial attacks and defenses. In this paper, we propose techniques for both an adversarial attack and a defense against adversarial attacks. Firstly, we show that the problem of discrete graph connections and the discrete features of common datasets can be handled by using the integrated gradient technique that accurately determines the effect of changing selected features or edges while still benefiting from parallel computations. In addition, we show that an adversarially manipulated graph using a targeted attack statistically differs from un-manipulated graphs. Based on this observation, we propose a defense approach which can detect and recover a potential adversarial perturbation. Our experiments on a number of datasets show the effectiveness of the proposed techniques.}, bibtype = {article}, author = {Wu, Huijun and Wang, Chen and Tyshetskiy, Yuriy and Docherty, Andrew and Lu, Kai and Zhu, Liming}, doi = {10.24963/ijcai.2019/669}, journal = {IJCAI International Joint Conference on Artificial Intelligence} }
@article{ title = {Topology attack and defense for graph neural networks: An optimization perspective}, type = {article}, year = {2019}, pages = {3961-3967}, volume = {2019-Augus}, id = {c344ea77-3375-3238-8446-364bd94e4e0c}, created = {2022-01-05T09:23:16.636Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:01.954Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Graph neural networks (GNNs), which apply deep neural networks to graph data, have achieved significant performance for the task of semi-supervised node classification. However, only a few works have addressed the adversarial robustness of GNNs. In this paper, we first present a novel gradient-based attack method that eases the difficulty of tackling discrete graph data. Compared to current adversarial attacks on GNNs, the results show that by perturbing only a small number of edges, through additions and deletions, our optimization-based attack can lead to a noticeable decrease in classification performance. Moreover, leveraging our gradient-based attack, we propose the first optimization-based adversarial training for GNNs. Our method yields higher robustness against both gradient-based and greedy attack methods without sacrificing classification accuracy on the original graph.}, bibtype = {article}, author = {Xu, Kaidi and Chen, Hongge and Liu, Sijia and Chen, Pin Yu and Weng, Tsui Wei and Hong, Mingyi and Lin, Xue}, doi = {10.24963/ijcai.2019/550}, journal = {IJCAI International Joint Conference on Artificial Intelligence} }
@article{ title = {Latent Adversarial Training of Graph Convolution Networks}, type = {article}, year = {2019}, id = {2aa84025-611b-3c18-b72b-ee4155d68507}, created = {2022-01-05T09:23:16.770Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:10.274Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Despite the recent success of graph convolution networks (GCNs) in modeling graph structured data, their vulnerability to adversarial attacks has been revealed, and attacks on both node features and graph structure have been designed. Direct extension of adversarial-sample-based defense algorithms meets with an immediate challenge because computing the adversarial network requires substantial cost. We propose addressing this issue by perturbing the latent representations in GCNs, which not only dispenses with adversarial network generation, but also attains improved robustness and accuracy by respecting the latent manifold of the data. Experimental results confirm the superior performance over strong baselines.}, bibtype = {article}, author = {Jin, Hongwei and Zhang, Xinhua}, journal = {ICML 2019 Workshop: Learning and Reasoning with Graph-Structured Representations} }
@article{ title = {3D point cloud geometry compression on deep learning}, type = {article}, year = {2019}, keywords = {3D point cloud,Auto-encoder,Detail reconstruction,Geometry compression,Hierarchical structure}, pages = {890-898}, id = {b9fe0c82-81df-39bb-9be6-0c9c747572d5}, created = {2022-01-05T10:55:41.325Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:10.228Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Huang2019}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {3D point cloud representation has been widely used in computer vision, automatic driving, augmented reality, smart cities and virtual reality. A 3D point cloud compression method with a higher compression ratio and tiny loss is the key to improving data transportation efficiency. In this paper, we propose a new 3D point cloud geometry compression method based on deep learning, built on an auto-encoder that performs better than other networks at detail reconstruction. It can reach a much higher compression ratio than the state of the art while keeping the loss tolerable. It also supports compressing multiple models in parallel on the GPU, which greatly improves processing efficiency. The compression process is composed of two parts. First, raw data is compressed into a codeword by extracting features of the raw model with the encoder. Then, the codeword is further compressed with sparse coding. The decompression process is implemented in reverse order: the codeword is recovered and fed into the decoder to reconstruct the point cloud. Detail reconstruction ability is improved by a hierarchical structure in our decoder, where later outputs are grown from earlier, fuzzier outputs; in this way, details are added step by step to make a more precise prediction. We compare our method with PCL compression and Draco compression on the ShapeNet40 part dataset. Our method may be the first deep learning-based point cloud compression algorithm. The experiments demonstrate that it is superior to common compression algorithms, achieving a large compression ratio while preserving original shapes with tiny loss.}, bibtype = {article}, author = {Huang, Tianxin and Liu, Yong}, doi = {10.1145/3343031.3351061}, journal = {MM 2019 - Proceedings of the 27th ACM International Conference on Multimedia} }
@article{ title = {PointAE: Point auto-encoder for 3D statistical shape and texture modelling}, type = {article}, year = {2019}, pages = {5409-5418}, volume = {2019-Octob}, id = {70a5072e-f22b-3ba7-a164-b5629c69938c}, created = {2022-01-05T11:42:39.179Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:10.162Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Dai2019}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {The outcome of standard statistical shape modelling is a vector space representation of objects. Any convex combination of vectors of a set of object class examples generates a real and valid example. In this paper, we propose a Point Auto-Encoder (PointAE) with skip-connections and attention blocks for 3D statistical shape modelling directly on 3D points. The proposed PointAE is able to refine the correspondence with a correspondence refinement block. The data with refined correspondence can be fed to the PointAE again to bootstrap the constructed statistical models. Instead of two separate models, PointAE can simultaneously model the shape and texture variation. The extensive evaluation on three open-sourced datasets demonstrates that the proposed method achieves better performance in its ability to represent the shape variations.}, bibtype = {article}, author = {Dai, Hang and Shao, Ling}, doi = {10.1109/ICCV.2019.00551}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {Multi-angle point cloud-vae: Unsupervised feature learning for 3D point clouds from multiple angles by joint self-reconstruction and half-to-half prediction}, type = {article}, year = {2019}, pages = {10441-10450}, volume = {2019-Octob}, id = {33562a8f-2dff-3fae-8d16-753959ac0bfe}, created = {2022-01-05T11:42:39.179Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-08T11:04:44.358Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Unsupervised feature learning for point clouds has been vital for large-scale point cloud understanding. Recent deep learning based methods depend on learning global geometry from self-reconstruction. However, these methods are still suffering from ineffective learning of local geometry, which significantly limits the discriminability of learned features. To resolve this issue, we propose MAP-VAE to enable the learning of global and local geometry by jointly leveraging global and local self-supervision. To enable effective local self-supervision, we introduce multi-angle analysis for point clouds. In a multi-angle scenario, we first split a point cloud into a front half and a back half from each angle, and then, train MAP-VAE to learn to predict a back half sequence from the corresponding front half sequence. MAP-VAE performs this half-to-half prediction using RNN to simultaneously learn each local geometry and the spatial relationship among them. In addition, MAP-VAE also learns global geometry via self-reconstruction, where we employ a variational constraint to facilitate novel shape generation. The outperforming results in four shape analysis tasks show that MAP-VAE can learn more discriminative global or local features than the state-of-the-art methods.}, bibtype = {article}, author = {Han, Zhizhong and Wang, Xiyang and Liu, Yu Shen and Zwicker, Matthias}, doi = {10.1109/ICCV.2019.01054}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {NeuralSampler: Euclidean Point Cloud Auto-Encoder and Sampler}, type = {article}, year = {2019}, websites = {http://arxiv.org/abs/1901.09394}, id = {7e55d0e0-0c3d-337f-ae71-f9ebe85afd84}, created = {2022-01-05T11:42:39.196Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:10.081Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Remelli2019}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a}, private_publication = {false}, abstract = {Most algorithms that rely on deep learning-based approaches to generate 3D point sets can only produce clouds containing fixed number of points. Furthermore, they typically require large networks parameterized by many weights, which makes them hard to train. In this paper, we propose an auto-encoder architecture that can both encode and decode clouds of arbitrary size and demonstrate its effectiveness at upsampling sparse point clouds. Interestingly, we can do so using less than half as many parameters as state-of-the-art architectures while still delivering better performance. We will make our code base fully available.}, bibtype = {article}, author = {Remelli, Edoardo and Baque, Pierre and Fua, Pascal} }
@article{ title = {Pointflow: 3D point cloud generation with continuous normalizing flows}, type = {article}, year = {2019}, pages = {4540-4549}, volume = {2019-Octob}, id = {6ce46512-7fac-3ceb-8a85-434b8e71377a}, created = {2022-01-05T11:42:42.779Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:10.012Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Yang2019a}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {As 3D point clouds become the representation of choice for multiple vision and graphics applications, the ability to synthesize or reconstruct high-resolution, high-fidelity point clouds becomes crucial. Despite the recent success of deep learning models in discriminative tasks of point clouds, generating point clouds remains challenging. This paper proposes a principled probabilistic framework to generate 3D point clouds by modeling them as a distribution of distributions. Specifically, we learn a two-level hierarchy of distributions where the first level is the distribution of shapes and the second level is the distribution of points given a shape. This formulation allows us to both sample shapes and sample an arbitrary number of points from a shape. Our generative model, named PointFlow, learns each level of the distribution with a continuous normalizing flow. The invertibility of normalizing flows enables the computation of the likelihood during training and allows us to train our model in the variational inference framework. Empirically, we demonstrate that PointFlow achieves state-of-the-art performance in point cloud generation. We additionally show that our model can faithfully reconstruct point clouds and learn useful representations in an unsupervised manner. The code is available at https://github.com/stevenygd/PointFlow.}, bibtype = {article}, author = {Yang, Guandao and Huang, Xun and Hao, Zekun and Liu, Ming Yu and Belongie, Serge and Hariharan, Bharath}, doi = {10.1109/ICCV.2019.00464}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {Stability of graph scattering transforms}, type = {article}, year = {2019}, volume = {32}, id = {ca2b479d-936d-3ba9-ad1e-c3b1fead038f}, created = {2022-01-14T16:04:12.081Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:04:37.733Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Scattering transforms are non-trainable deep convolutional architectures that exploit the multi-scale resolution of a wavelet filter bank to obtain an appropriate representation of data. More importantly, they are proven invariant to translations, and stable to perturbations that are close to translations. This stability property provides the scattering transform with a robustness to small changes in the metric domain of the data. When considering network data, regular convolutions do not hold since the data domain presents an irregular structure given by the network topology. In this work, we extend scattering transforms to network data by using multiresolution graph wavelets, whose computation can be obtained by means of graph convolutions. Furthermore, we prove that the resulting graph scattering transforms are stable to metric perturbations of the underlying network. This renders graph scattering transforms robust to changes on the network topology, making it particularly useful for cases of transfer learning, topology estimation or time-varying graphs.}, bibtype = {article}, author = {Gama, Fernando and Bruna, Joan and Ribeiro, Alejandro}, journal = {Advances in Neural Information Processing Systems}, number = {NeurIPS} }
@article{ title = {Modelling Graph Errors: Towards Robust Graph Signal Processing}, type = {article}, year = {2019}, keywords = {erd}, pages = {1-34}, websites = {http://arxiv.org/abs/1903.08398}, id = {444211f1-bc0c-36fb-b845-6f3fed2591ab}, created = {2022-01-14T16:04:12.189Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:04:40.331Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {The first step for any graph signal processing (GSP) procedure is to learn the graph signal representation, i.e., to capture the dependence structure of the data in an adjacency matrix. Indeed, the adjacency matrix is typically not known a priori and has to be learned. However, it is learned with errors. Little attention has been paid to modelling such errors in the adjacency matrix and to studying their effects on GSP methods. However, modelling errors in the adjacency matrix will make it possible both to study graph error effects in GSP and to develop robust GSP algorithms. In this paper, we therefore introduce practically justifiable graph error models. We also study, both analytically when possible and numerically, the graph error effect on the performance of GSP methods in different types of problems, such as filtering of graph signals and independent component analysis of graph signals (graph decorrelation).}, bibtype = {article}, author = {Miettinen, Jari and Vorobyov, Sergiy A. and Ollila, Esa}, number = {299243} }
@article{ title = {Joint Graph and Feature Learning in Graph Convolutional Neural Networks}, type = {article}, year = {2019}, id = {b28c04da-7f35-3de4-89b1-1dabf2bff5b4}, created = {2022-01-14T16:04:12.341Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:56:59.272Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Graph Convolutional Neural Networks (GCNNs) extend classical CNNs to the graph data domain, such as brain networks, social networks and 3D point clouds. It is critical to identify an appropriate graph for the subsequent graph convolution. Existing methods manually construct or learn one fixed graph for all the layers of a GCNN. In order to adapt to the underlying structure of node features in different layers, we propose dynamic learning of graphs and node features jointly in GCNNs. In particular, we cast the graph optimization problem as distance metric learning to capture pairwise similarities of features in each layer. We deploy the Mahalanobis distance metric and further decompose the metric matrix into a low-dimensional matrix, which converts graph learning to the optimization of a low-dimensional matrix for efficient implementation. Extensive experiments on point clouds and citation network datasets demonstrate the superiority of the proposed method in terms of both accuracy and robustness.}, bibtype = {article}, author = {Tang, Jiaxiang and Hu, Wei and Gao, Xiang and Guo, Zongming}, journal = {arXiv} }
@article{ title = {A Review of Deep Learning-Based Semantic Segmentation for Point Cloud}, type = {article}, year = {2019}, keywords = {3D point clouds,deep learning,feature fusion,graph convolutional neural network,semantic segmentation}, pages = {179118-179133}, volume = {7}, publisher = {IEEE}, id = {c9462b24-7d27-396e-84fc-1dfa83b4ba6c}, created = {2022-01-14T16:04:12.341Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:56:59.277Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {In recent years, the popularity of depth sensors and 3D scanners has led to a rapid development of 3D point clouds. Semantic segmentation of point cloud, as a key step in understanding 3D scenes, has attracted extensive attention of researchers. Recent advances in this topic are dominantly led by deep learning-based methods. In this paper, we provide a survey covering various aspects ranging from indirect segmentation to direct segmentation. Firstly, we review methods of indirect segmentation based on multi-views and voxel grids, as well as direct segmentation methods from different perspectives including point ordering, multi-scale, feature fusion and fusion of graph convolutional neural network (GCNN). Then, the common datasets for point cloud segmentation are exposed to help researchers choose which one is the most suitable for their tasks. Following that, we devote a part of the paper to analyze the quantitative results of these methods. Finally, the development trend of point cloud semantic segmentation technology is prospected.}, bibtype = {article}, author = {Zhang, Jiaying and Zhao, Xiaoli and Chen, Zheng and Lu, Zhejun}, doi = {10.1109/ACCESS.2019.2958671}, journal = {IEEE Access} }
@article{ title = {SemanticKITTI: A Dataset for Semantic Scene Understanding of LiDAR Sequences}, type = {article}, year = {2019}, websites = {http://arxiv.org/abs/1904.01416}, id = {30e37df0-dd89-3fb4-bcbc-ee856b83c21e}, created = {2022-01-14T16:04:12.464Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:56:59.420Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Semantic scene understanding is important for various applications. In particular, self-driving cars need a fine-grained understanding of the surfaces and objects in their vicinity. Light detection and ranging (LiDAR) provides precise geometric information about the environment and is thus a part of the sensor suites of almost all self-driving cars. Despite the relevance of semantic scene understanding for this application, there is a lack of a large dataset for this task which is based on an automotive LiDAR. In this paper, we introduce a large dataset to propel research on laser-based semantic segmentation. We annotated all sequences of the KITTI Vision Odometry Benchmark and provide dense point-wise annotations for the complete 360-degree field-of-view of the employed automotive LiDAR. We propose three benchmark tasks based on this dataset: (i) semantic segmentation of point clouds using a single scan, (ii) semantic segmentation using multiple past scans, and (iii) semantic scene completion, which requires anticipating the semantic scene in the future. We provide baseline experiments and show that there is a need for more sophisticated models to efficiently tackle these tasks. Our dataset opens the door for the development of more advanced methods, but also provides plentiful data to investigate new research directions.}, bibtype = {article}, author = {Behley, Jens and Garbade, Martin and Milioto, Andres and Quenzel, Jan and Behnke, Sven and Stachniss, Cyrill and Gall, Juergen}, journal = {ICCV}, number = {iii} }
@article{ title = {Optimized skeleton-based action recognition via sparsified graph regression}, type = {article}, year = {2019}, keywords = {Graph convolutional networks,Graph regression, graph convolutional networks, sp,Skeleton-based action recognition,Spatio-temporal graph modeling}, pages = {601-610}, id = {a445b342-7a8f-36b9-a7d8-5a6668c223a1}, created = {2022-01-14T16:04:12.471Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:56:59.443Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {With the prevalence of accessible depth sensors, dynamic human body skeletons have attracted much attention as a robust modality for action recognition. Previous methods model skeletons based on RNN or CNN, which has limited expressive power for irregular skeleton joints. While graph convolutional networks (GCN) have been proposed to address irregular graph-structured data, the fundamental graph construction remains challenging. In this paper, we represent skeletons naturally on graphs, and propose a graph regression based GCN (GR-GCN) for skeleton-based action recognition, aiming to capture the spatio-temporal variation in the data. As the graph representation is crucial to graph convolution, we first propose graph regression to statistically learn the underlying graph from multiple observations. In particular, we provide spatiotemporal modeling of skeletons and pose an optimization problem on the graph structure over consecutive frames, which enforces the sparsity of the underlying graph for efficient representation. The optimized graph not only connects each joint to its neighboring joints in the same frame strongly or weakly, but also links with relevant joints in the previous and subsequent frames. We then feed the optimized graph into the GCN along with the coordinates of the skeleton sequence for feature learning, where we deploy high-order and fast Chebyshev approximation of spectral graph convolution. Further, we provide analysis of the variation characterization by the Chebyshev approximation. Experimental results validate the effectiveness of the proposed graph regression and show that the proposed GR-GCN achieves the state-of-the-art performance on the widely used NTU RGB+D, UT-Kinect and SYSU 3D datasets.}, bibtype = {article}, author = {Gao, Xiang and Hu, Wei and Tang, Jiaxiang and Liu, Jiaying and Guo, Zongming}, doi = {10.1145/3343031.3351170}, journal = {MM 2019 - Proceedings of the 27th ACM International Conference on Multimedia} }
@article{ title = {Accurate lane detection with atrous convolution and spatial pyramid pooling for autonomous driving}, type = {article}, year = {2019}, pages = {642-647}, id = {16e8b382-c9bb-3ff8-8220-b1b7369aa196}, created = {2022-01-19T09:08:51.195Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.238Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Sun2019}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a}, private_publication = {false}, abstract = {Lane detection is a fundamental capability for autonomous driving. Many effective lane detection algorithms based on traditional computer vision and recent deep learning technologies have been proposed. However, the current state-of-the-art lane detection accuracy is still not satisfactory for realizing fully autonomous driving. Thus, this paper proposes a new lane detection network using atrous convolution and spatial pyramid pooling techniques to improve the lane detection accuracy. We address the detection problem with pixel-wise semantic segmentation. Our network consists of one encoder and two decoders, which outputs a binary segmentation map and an embedded feature map, respectively. The embedded feature map is employed for clustering algorithms to separate segmented lane pixels into different lanes. The experimental results on the public Tusimple dataset show that our network outperforms the state-of-the-arts.}, bibtype = {article}, author = {Sun, Yuxiang and Wang, Lujia and Chen, Yongquan and Liu, Ming}, doi = {10.1109/ROBIO49542.2019.8961705}, journal = {IEEE International Conference on Robotics and Biomimetics, ROBIO 2019}, number = {December} }
@article{ title = {Fast Graph Representation Learning with PyTorch Geometric}, type = {article}, year = {2019}, pages = {1-9}, websites = {http://arxiv.org/abs/1903.02428}, id = {6950ec41-6cc4-37fa-b335-e6caff7eb11c}, created = {2022-02-21T08:54:45.173Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-21T08:54:47.353Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e523c16b-0594-4b52-9c4e-9052fcb9dbed}, private_publication = {false}, abstract = {We introduce PyTorch Geometric, a library for deep learning on irregularly structured input data such as graphs, point clouds and manifolds, built upon PyTorch. In addition to general graph data structures and processing methods, it contains a variety of recently published methods from the domains of relational learning and 3D data processing. PyTorch Geometric achieves high data throughput by leveraging sparse GPU acceleration, by providing dedicated CUDA kernels and by introducing efficient mini-batch handling for input examples of different size. In this work, we present the library in detail and perform a comprehensive comparative study of the implemented methods in homogeneous evaluation scenarios.}, bibtype = {article}, author = {Fey, Matthias and Lenssen, Jan Eric}, number = {1} }
@article{ title = {A Survey of Simple Geometric Primitives Detection Methods for Captured 3D Data}, type = {article}, year = {2019}, keywords = {3D data,I.3.5 [Computing Methodologies/Computer Graphics]:,computational geometry,data fitting,geometric primitives,shape abstraction,shape analysis,solid and object representations,surface}, pages = {167-196}, volume = {38}, id = {29ad0c6c-7545-3229-b627-6fc48fccfa7e}, created = {2022-02-24T07:10:46.972Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-28T07:56:34.417Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {1e7b477c-c241-48c3-a542-ad06e3d39dd5}, private_publication = {false}, abstract = {The amount of captured 3D data is continuously increasing, with the democratization of consumer depth cameras, the development of modern multi-view stereo capture setups and the rise of single-view 3D capture based on machine learning. The analysis and representation of this ever growing volume of 3D data, often corrupted with acquisition noise and reconstruction artefacts, is a serious challenge at the frontier between computer graphics and computer vision. To that end, segmentation and optimization are crucial analysis components of the shape abstraction process, which can themselves be greatly simplified when performed on lightened geometric formats. In this survey, we review the algorithms which extract simple geometric primitives from raw dense 3D data. After giving an introduction to these techniques, from the acquisition modality to the underlying theoretical concepts, we propose an application-oriented characterization, designed to help select an appropriate method based on one's application needs and compare recent approaches. We conclude by giving hints for how to evaluate these methods and a set of research challenges to be explored.}, bibtype = {article}, author = {Kaiser, Adrien and Ybanez Zepeda, Jose Alonso and Boubekeur, Tamy}, doi = {10.1111/cgf.13451}, journal = {Computer Graphics Forum}, number = {1} }
@article{ title = {Emerging MPEG Standards for Point Cloud Compression}, type = {article}, year = {2019}, keywords = {3D data coding,Point cloud coding,immersive video coding}, pages = {133-148}, volume = {9}, id = {228ff4a8-150c-3c92-b819-9624de2e9593}, created = {2022-03-02T09:31:16.387Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-02T09:31:18.865Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {84eaadea-8864-4baf-9a7a-b5a2f5b96449}, private_publication = {false}, abstract = {Due to the increased popularity of augmented and virtual reality experiences, the interest in capturing the real world in multiple dimensions and in presenting it to users in an immersible fashion has never been higher. Distributing such representations enables users to freely navigate in multi-sensory 3D media experiences. Unfortunately, such representations require a large amount of data, not feasible for transmission on today's networks. Efficient compression technologies well adopted in the content chain are in high demand and are key components to democratize augmented and virtual reality applications. Moving Picture Experts Group, as one of the main standardization groups dealing with multimedia, identified the trend and started recently the process of building an open standard for compactly representing 3D point clouds, which are the 3D equivalent of the very well-known 2D pixels. This paper introduces the main developments and technical aspects of this ongoing standardization effort.}, bibtype = {article}, author = {Schwarz, Sebastian and Preda, Marius and Baroncini, Vittorio and Budagavi, Madhukar and Cesar, Pablo and Chou, Philip A. and Cohen, Robert A. and Krivokuca, Maja and Lasserre, Sebastien and Li, Zhu and Llach, Joan and Mammou, Khaled and Mekuria, Rufael and Nakagami, Ohji and Siahaan, Ernestasia and Tabatabai, Ali and Tourapis, Alexis M. and Zakharchenko, Vladyslav}, doi = {10.1109/JETCAS.2018.2885981}, journal = {IEEE Journal on Emerging and Selected Topics in Circuits and Systems}, number = {1} }
@article{ title = {Point2Sequence: Learning the shape representation of 3D point clouds with an attention-based sequence to sequence network}, type = {article}, year = {2019}, pages = {8778-8785}, id = {f8283951-d3b9-320f-a825-8d0fe7351279}, created = {2022-03-18T06:58:19.066Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:10.322Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Liu2019}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, abstract = {Exploring contextual information in the local region is important for shape understanding and analysis. Existing studies often employ hand-crafted or explicit ways to encode contextual information of local regions. However, it is hard to capture fine-grained contextual information in hand-crafted or explicit manners, such as the correlation between different areas in a local region, which limits the discriminative ability of learned features. To resolve this issue, we propose a novel deep learning model for 3D point clouds, named Point2Sequence, to learn 3D shape features by capturing fine-grained contextual information in a novel implicit way. Point2Sequence employs a novel sequence learning model for point clouds to capture the correlations by aggregating multi-scale areas of each local region with attention. Specifically, Point2Sequence first learns the feature of each area scale in a local region. Then, it captures the correlation between area scales in the process of aggregating all area scales using a recurrent neural network (RNN) based encoder-decoder structure, where an attention mechanism is proposed to highlight the importance of different area scales. Experimental results show that Point2Sequence achieves state-of-the-art performance in shape classification and segmentation tasks.}, bibtype = {article}, author = {Liu, Xinhai and Han, Zhizhong and Liu, Yu Shen and Zwicker, Matthias}, doi = {10.1609/aaai.v33i01.33018778}, journal = {33rd AAAI Conference on Artificial Intelligence, AAAI 2019, 31st Innovative Applications of Artificial Intelligence Conference, IAAI 2019 and the 9th AAAI Symposium on Educational Advances in Artificial Intelligence, EAAI 2019} }
@article{ title = {Deep point-based scene labeling with depth mapping and geometric patch feature encoding}, type = {article}, year = {2019}, keywords = {CNN,Deep learning,Patch,Scene understanding,Segmentation,Semantics labeling}, pages = {101033}, volume = {104}, websites = {https://doi.org/10.1016/j.gmod.2019.101033}, publisher = {Elsevier Inc.}, id = {72a9a923-9bd5-35d1-b5b3-66e3d52bf41d}, created = {2022-03-18T10:02:57.313Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-18T10:03:04.238Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {1e7b477c-c241-48c3-a542-ad06e3d39dd5}, private_publication = {false}, abstract = {This paper presents a deep CNN approach for point-based semantic scene labeling. This is challenging because 3D point clouds do not have a canonical domain and can have complex geometry and substantial variation of sampling densities. We propose a novel framework where the convolution operator is defined on depth maps around sampled points, which captures characteristics of local surface regions. We introduce Depth Mapping (DM) and Reverse Depth Mapping (RDM) operators to transform between the point domain and the depth map domain. Our depth map based convolution is computationally efficient, robust to scene scales and sampling densities, and can capture rich surface characteristics. We further propose to augment each point with feature encoding of the local geometric patches resulted from multi-method through patch pooling network (PPN). The patch features provide complementary information and are fed into our classification network to achieve semantic segmentation.}, bibtype = {article}, author = {Cai, Jun Xiong and Mu, Tai Jiang and Lai, Yu Kun and Hu, Shi Min}, doi = {10.1016/j.gmod.2019.101033}, journal = {Graphical Models}, number = {February} }
@article{ title = {Diagnosing and enhancing VAE models}, type = {article}, year = {2019}, keywords = {deep generative model,variational autoencoder}, pages = {1-44}, id = {b2139d5d-4090-349a-bc75-88db0efdace7}, created = {2022-03-23T06:17:59.504Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:10.605Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Dai2019a}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, abstract = {Although variational autoencoders (VAEs) represent a widely influential deep generative model, many aspects of the underlying energy function remain poorly understood. In particular, it is commonly believed that Gaussian encoder/decoder assumptions reduce the effectiveness of VAEs in generating realistic samples. In this regard, we rigorously analyze the VAE objective, differentiating situations where this belief is and is not actually true. We then leverage the corresponding insights to develop a simple VAE enhancement that requires no additional hyperparameters or sensitive tuning. Quantitatively, this proposal produces crisp samples and stable FID scores that are actually competitive with a variety of GAN models, all while retaining desirable attributes of the original VAE architecture. The code for our model is available at https://github.com/daib13/TwoStageVAE.}, bibtype = {article}, author = {Dai, Bin and Wipf, David}, journal = {7th International Conference on Learning Representations, ICLR 2019} }
@article{ title = {Non-local Attention Optimized Deep Image Compression}, type = {article}, year = {2019}, keywords = {Computer Science - Computer Vision and Pattern Rec,Electrical Engineering and Systems Science - Imag}, websites = {http://arxiv.org/abs/1904.09757}, month = {4}, id = {e68f44a1-8079-371f-a165-b361934d026c}, created = {2022-03-28T09:45:00.851Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:01:00.567Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {liuNonlocalAttentionOptimized2019}, source_type = {article}, notes = {arXiv: 1904.09757}, private_publication = {false}, abstract = {This paper proposes a novel Non-Local Attention Optimized Deep Image Compression (NLAIC) framework, which is built on top of the popular variational auto-encoder (VAE) structure. Our NLAIC framework embeds non-local operations in the encoders and decoders for both image and latent feature probability information (known as hyperprior) to capture both local and global correlations, and apply attention mechanism to generate masks that are used to weigh the features for the image and hyperprior, which implicitly adapt bit allocation for different features based on their importance. Furthermore, both hyperpriors and spatial-channel neighbors of the latent features are used to improve entropy coding. The proposed model outperforms the existing methods on Kodak dataset, including learned (e.g., Balle2019, Balle2018) and conventional (e.g., BPG, JPEG2000, JPEG) image compression methods, for both PSNR and MS-SSIM distortion metrics.}, bibtype = {article}, author = {Liu, Haojie and Chen, Tong and Guo, Peiyao and Shen, Qiu and Cao, Xun and Wang, Yao and Ma, Zhan}, journal = {arXiv:1904.09757 [cs, eess]} }
@article{ title = {Adversarial Autoencoders for Compact Representations of 3D Point Clouds}, type = {article}, year = {2019}, keywords = {Computer Science - Computer Vision and Pattern Re,Computer Science - Machine Learning,Statistics - Machine Learning}, websites = {http://arxiv.org/abs/1811.07605}, month = {5}, id = {aa6f45d1-b3bf-3cf4-87db-54dcc6b38f26}, created = {2022-03-28T09:45:01.254Z}, accessed = {2021-10-01}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:01:52.367Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {zamorskiAdversarialAutoencodersCompact2019}, source_type = {article}, notes = {arXiv: 1811.07605}, private_publication = {false}, abstract = {Deep generative architectures provide a way to model not only images but also complex, 3-dimensional objects, such as point clouds. In this work, we present a novel method to obtain meaningful representations of 3D shapes that can be used for challenging tasks including 3D points generation, reconstruction, compression, and clustering. Contrary to existing methods for 3D point cloud generation that train separate decoupled models for representation learning and generation, our approach is the first end-to-end solution that allows to simultaneously learn a latent space of representation and generate 3D shape out of it. Moreover, our model is capable of learning meaningful compact binary descriptors with adversarial training conducted on a latent space. To achieve this goal, we extend a deep Adversarial Autoencoder model (AAE) to accept 3D input and create 3D output. Thanks to our end-to-end training regime, the resulting method called 3D Adversarial Autoencoder (3dAAE) obtains either binary or continuous latent space that covers a much wider portion of training data distribution. Finally, our quantitative evaluation shows that 3dAAE provides state-of-the-art results for 3D points clustering and 3D object retrieval.}, bibtype = {article}, author = {Zamorski, Maciej and Zięba, Maciej and Klukowski, Piotr and Nowak, Rafał and Kurach, Karol and Stokowiec, Wojciech and Trzciński, Tomasz}, journal = {arXiv:1811.07605 [cs, stat]} }
@inproceedings{ title = {A Variational Observation Model of 3D Object for Probabilistic Semantic SLAM}, type = {inproceedings}, year = {2019}, keywords = {Object oriented modeling,Semantics,Shape,Simultaneous localization and mapping,Solid modeling,Three-dimensional displays}, pages = {5866-5872}, month = {5}, id = {8c53d17c-86a7-3585-8ede-b0a6ba9b5f96}, created = {2022-03-28T09:45:01.341Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:01:46.005Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {yuVariationalObservationModel2019}, source_type = {inproceedings}, notes = {ISSN: 2577-087X}, private_publication = {false}, abstract = {We present a Bayesian object observation model for complete probabilistic semantic SLAM. Recent studies on object detection and feature extraction have become important for scene understanding and 3D mapping. However, 3D shape of the object is too complex to formulate the probabilistic observation model; therefore, performing the Bayesian inference of the object-oriented features as well as their pose is less considered. Besides, when the robot equipped with an RGB mono camera only observes the projected single view of an object, a significant amount of the 3D shape information is abandoned. Due to these limitations, semantic SLAM and viewpoint-independent loop closure using volumetric 3D object shape is challenging. In order to enable the complete formulation of probabilistic semantic SLAM, we approximate the observation model of a 3D object with a tractable distribution. We also estimate the variational likelihood from the 2D image of the object to exploit its observed single view. In order to evaluate the proposed method, we perform pose and feature estimation, and demonstrate that the automatic loop closure works seamlessly without additional loop detector in various environments.}, bibtype = {inproceedings}, author = {Yu, H W and Moon, J Y and Lee, B H}, doi = {10.1109/ICRA.2019.8794111}, booktitle = {2019 International Conference on Robotics and Automation (ICRA)} }
@inproceedings{ title = {Set Transformer: A Framework for Attention-based Permutation-Invariant Neural Networks}, type = {inproceedings}, year = {2019}, pages = {3744-3753}, websites = {https://proceedings.mlr.press/v97/lee19d.html}, month = {5}, publisher = {PMLR}, id = {fa7244f5-a18e-3164-8d83-903aebd6f5ba}, created = {2022-03-28T09:45:01.573Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:01:59.071Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {leeSetTransformerFramework2019}, source_type = {inproceedings}, short_title = {Set Transformer}, notes = {ISSN: 2640-3498}, private_publication = {false}, abstract = {Many machine learning tasks such as multiple instance learning, 3D shape recognition, and few-shot image classification are defined on sets of instances. Since solutions to such problems do not depend on the order of elements of the set, models used to address them should be permutation invariant. We present an attention-based neural network module, the Set Transformer, specifically designed to model interactions among elements in the input set. The model consists of an encoder and a decoder, both of which rely on attention mechanisms. In an effort to reduce computational complexity, we introduce an attention scheme inspired by inducing point methods from sparse Gaussian process literature. It reduces the computation time of self-attention from quadratic to linear in the number of elements in the set. We show that our model is theoretically attractive and we evaluate it on a range of tasks, demonstrating the state-of-the-art performance compared to recent methods for set-structured data.}, bibtype = {inproceedings}, author = {Lee, Juho and Lee, Yoonho and Kim, Jungtaek and Kosiorek, Adam and Choi, Seungjin and Teh, Yee Whye}, booktitle = {Proceedings of the 36th International Conference on Machine Learning} }
@inbook{ type = {inbook}, year = {2019}, pages = {630-638}, websites = {https://epubs.siam.org/doi/abs/10.1137/1.9781611975673.71}, month = {5}, publisher = {Society for Industrial and Applied Mathematics}, series = {Proceedings}, id = {107a7838-0ddc-3182-8785-c94915eab3ba}, created = {2022-03-28T09:45:01.916Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:02:58.962Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {caiMultiStageVariationalAutoEncoders2019}, source_type = {incollection}, private_publication = {false}, bibtype = {inbook}, author = {Cai, Lei and Gao, Hongyang and Ji, Shuiwang}, doi = {10.1137/1.9781611975673.71}, chapter = {Multi-Stage Variational Auto-Encoders for Coarse-to-Fine Image Generation}, title = {Proceedings of the 2019 SIAM International Conference on Data Mining (SDM)} }
@article{ title = {BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding}, type = {article}, year = {2019}, keywords = {Computer Science - Computation and Language}, websites = {http://arxiv.org/abs/1810.04805}, month = {5}, id = {a0eb3207-90f4-357d-be04-93b4cc4481d4}, created = {2022-03-28T09:45:01.966Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:03:04.316Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {devlinBERTPretrainingDeep2019}, source_type = {article}, short_title = {BERT}, notes = {arXiv: 1810.04805}, private_publication = {false}, abstract = {We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models, BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5\% (7.7\% point absolute improvement), MultiNLI accuracy to 86.7\% (4.6\% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).}, bibtype = {article}, author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina}, journal = {arXiv:1810.04805 [cs]} }
@article{ title = {Under canopy light detection and ranging-based autonomous navigation}, type = {article}, year = {2019}, keywords = {agriculture,perception,terrestrial robotics}, pages = {547-567}, volume = {36}, websites = {https://onlinelibrary.wiley.com/doi/abs/10.1002/rob.21852}, id = {11124d93-d2ac-3297-aa24-f973ac7f2073}, created = {2022-03-28T09:45:02.362Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:03:15.351Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {higutiCanopyLightDetection2019}, source_type = {article}, notes = {\_eprint: https://onlinelibrary.wiley.com/doi/pdf/10.1002/rob.21852}, private_publication = {false}, abstract = {This paper describes a light detection and ranging (LiDAR)-based autonomous navigation system for an ultralightweight ground robot in agricultural fields. The system is designed for reliable navigation under cluttered canopies using only a 2D Hokuyo UTM-30LX LiDAR sensor as the single source for perception. Its purpose is to ensure that the robot can navigate through rows of crops without damaging the plants in narrow row-based and high-leaf-cover semistructured crop plantations, such as corn (Zea mays) and sorghum ( Sorghum bicolor). The key contribution of our work is a LiDAR-based navigation algorithm capable of rejecting outlying measurements in the point cloud due to plants in adjacent rows, low-hanging leaf cover or weeds. The algorithm addresses this challenge using a set of heuristics that are designed to filter out outlying measurements in a computationally efficient manner, and linear least squares are applied to estimate within-row distance using the filtered data. Moreover, a crucial step is the estimate validation, which is achieved through a heuristic that grades and validates the fitted row-lines based on current and previous information. The proposed LiDAR-based perception subsystem has been extensively tested in production/breeding corn and sorghum fields. In such variety of highly cluttered real field environments, the robot logged more than 6 km of autonomous run in straight rows. These results demonstrate highly promising advances to LiDAR-based navigation in realistic field environments for small under-canopy robots.}, bibtype = {article}, author = {Higuti, Vitor A H and Velasquez, Andres E B and Magalhaes, Daniel Varela and Becker, Marcelo and Chowdhary, Girish}, doi = {10.1002/rob.21852}, journal = {Journal of Field Robotics}, number = {3} }
@inproceedings{ title = {Cross-Atlas Convolution for Parameterization Invariant Learning on Textured Mesh Surface}, type = {inproceedings}, year = {2019}, pages = {6143-6152}, websites = {https://openaccess.thecvf.com/content_CVPR_2019/html/Li_Cross-Atlas_Convolution_for_Parameterization_Invariant_Learning_on_Textured_Mesh_Surface_CVPR_2019_paper.html}, id = {490d9dba-a164-3a2a-8990-792a7a001039}, created = {2022-03-28T09:45:02.419Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:03:49.379Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {liCrossAtlasConvolutionParameterization2019}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Li, Shiwei and Luo, Zixin and Zhen, Mingmin and Yao, Yao and Shen, Tianwei and Fang, Tian and Quan, Long} }
@inproceedings{ title = {On the Continuity of Rotation Representations in Neural Networks}, type = {inproceedings}, year = {2019}, pages = {5745-5753}, websites = {https://openaccess.thecvf.com/content_CVPR_2019/html/Zhou_On_the_Continuity_of_Rotation_Representations_in_Neural_Networks_CVPR_2019_paper.html}, id = {1d85e74d-43f0-3087-9654-b8eb4994f4ba}, created = {2022-03-28T09:45:02.935Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:04:06.891Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {zhouContinuityRotationRepresentations2019}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Zhou, Yi and Barnes, Connelly and Lu, Jingwan and Yang, Jimei and Li, Hao} }
@inproceedings{ title = {3D Point Cloud Generative Adversarial Network Based on Tree Structured Graph Convolutions}, type = {inproceedings}, year = {2019}, pages = {3859-3868}, websites = {https://openaccess.thecvf.com/content_ICCV_2019/html/Shu_3D_Point_Cloud_Generative_Adversarial_Network_Based_on_Tree_Structured_ICCV_2019_paper.html}, id = {12eb8258-8abd-3977-8aa3-4a4ee98591f7}, created = {2022-03-28T09:45:02.953Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T11:25:03.940Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {shu3DPointCloud2019}, source_type = {inproceedings}, folder_uuids = {ad97b399-756a-4674-becd-d6455b989a39}, private_publication = {false}, bibtype = {inproceedings}, author = {Shu, Dong Wook and Park, Sung Woo and Kwon, Junseok} }
@inproceedings{ title = {GANomaly: Semi-supervised Anomaly Detection via Adversarial Training}, type = {inproceedings}, year = {2019}, keywords = {Anomaly detection,Generative Adversarial Networks,Semi-supervised learning,X-ray security imagery}, pages = {622-637}, publisher = {Springer International Publishing}, city = {Cham}, series = {Lecture Notes in Computer Science}, id = {56413bdf-f63b-36a0-9485-1d332eaa45bb}, created = {2022-03-28T09:45:03.027Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:04:11.912Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {akcayGANomalySemisupervisedAnomaly2019}, source_type = {inproceedings}, short_title = {GANomaly}, private_publication = {false}, abstract = {Anomaly detection is a classical problem in computer vision, namely the determination of the normal from the abnormal when datasets are highly biased towards one class (normal) due to the insufficient sample size of the other class (abnormal). While this can be addressed as a supervised learning problem, a significantly more challenging problem is that of detecting the unknown/unseen anomaly case that takes us instead into the space of a one-class, semi-supervised learning paradigm. We introduce such a novel anomaly detection model, by using a conditional generative adversarial network that jointly learns the generation of high-dimensional image space and the inference of latent space. Employing encoder-decoder-encoder sub-networks in the generator network enables the model to map the input image to a lower dimension vector, which is then used to reconstruct the generated output image. The use of the additional encoder network maps this generated image to its latent representation. Minimizing the distance between these images and the latent vectors during training aids in learning the data distribution for the normal samples. As a result, a larger distance metric from this learned data distribution at inference time is indicative of an outlier from that distribution—an anomaly. Experimentation over several benchmark datasets, from varying domains, shows the model efficacy and superiority over previous state-of-the-art approaches.}, bibtype = {inproceedings}, author = {Akcay, Samet and Atapour-Abarghouei, Amir and Breckon, Toby P}, editor = {Jawahar, C V and Li, Hongdong and Mori, Greg and Schindler, Konrad}, doi = {10.1007/978-3-030-20893-6_39}, booktitle = {Computer Vision – ACCV 2018} }
@inproceedings{ title = {Pluralistic Image Completion}, type = {inproceedings}, year = {2019}, pages = {1438-1447}, websites = {https://openaccess.thecvf.com/content_CVPR_2019/html/Zheng_Pluralistic_Image_Completion_CVPR_2019_paper.html}, id = {2d8b0685-f4ff-3d97-ba5d-a2991926d8b3}, created = {2022-03-28T09:45:03.072Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:04:50.133Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {zhengPluralisticImageCompletion2019}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Zheng, Chuanxia and Cham, Tat-Jen and Cai, Jianfei} }
@article{ title = {VV-Net: Voxel VAE Net with Group Convolutions for Point Cloud Segmentation}, type = {article}, year = {2019}, keywords = {Computer Science - Graphics}, websites = {http://arxiv.org/abs/1811.04337}, month = {8}, id = {5e3d0003-ab66-3c06-9453-149c82ee4ab6}, created = {2022-03-28T09:45:03.153Z}, accessed = {2021-09-17}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:04:37.837Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {mengVVNetVoxelVAE2019}, source_type = {article}, short_title = {VV-Net}, notes = {arXiv: 1811.04337}, private_publication = {false}, abstract = {We present a novel algorithm for point cloud segmentation. Our approach transforms unstructured point clouds into regular voxel grids, and further uses a kernel-based interpolated variational autoencoder (VAE) architecture to encode the local geometry within each voxel. Traditionally, the voxel representation only comprises Boolean occupancy information which fails to capture the sparsely distributed points within voxels in a compact manner. In order to handle sparse distributions of points, we further employ radial basis functions (RBF) to compute a local, continuous representation within each voxel. Our approach results in a good volumetric representation that effectively tackles noisy point cloud datasets and is more robust for learning. Moreover, we further introduce group equivariant CNN to 3D, by defining the convolution operator on a symmetry group acting on $\mathbb{Z}^3$ and its isomorphic sets. This improves the expressive capacity without increasing parameters, leading to more robust segmentation results. We highlight the performance on standard benchmarks and show that our approach outperforms state-of-the-art segmentation algorithms on the ShapeNet and S3DIS datasets.}, bibtype = {article}, author = {Meng, Hsien-Yu and Gao, Lin and Lai, YuKun and Manocha, Dinesh}, journal = {arXiv:1811.04337 [cs]} }
@inproceedings{ title = {AMASS: Archive of Motion Capture As Surface Shapes}, type = {inproceedings}, year = {2019}, pages = {5442-5451}, websites = {https://openaccess.thecvf.com/content_ICCV_2019/html/Mahmood_AMASS_Archive_of_Motion_Capture_As_Surface_Shapes_ICCV_2019_paper.html}, id = {a0782166-e39e-3e79-b1c0-bcba3ff9c2f3}, created = {2022-03-28T09:45:03.468Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:05:43.161Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {mahmoodAMASSArchiveMotion2019}, source_type = {inproceedings}, short_title = {AMASS}, private_publication = {false}, bibtype = {inproceedings}, author = {Mahmood, Naureen and Ghorbani, Nima and Troje, Nikolaus F and Pons-Moll, Gerard and Black, Michael J} }
@inproceedings{ title = {Point-To-Pose Voting Based Hand Pose Estimation Using Residual Permutation Equivariant Layer}, type = {inproceedings}, year = {2019}, pages = {11927-11936}, websites = {https://openaccess.thecvf.com/content_CVPR_2019/html/Li_Point-To-Pose_Voting_Based_Hand_Pose_Estimation_Using_Residual_Permutation_Equivariant_CVPR_2019_paper.html}, id = {cbd7e092-aa60-3f1f-a447-4d5d28a4be75}, created = {2022-03-28T09:45:03.483Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:05:39.644Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {liPointToPoseVotingBased2019}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Li, Shile and Lee, Dongheui} }
@inproceedings{ title = {Zero-shot Learning via Simultaneous Generating and Learning}, type = {inproceedings}, year = {2019}, volume = {32}, websites = {https://proceedings.neurips.cc/paper/2019/hash/19ca14e7ea6328a42e0eb13d585e4c22-Abstract.html}, publisher = {Curran Associates, Inc.}, id = {23bc3f0d-ee5d-327f-bf20-39120e853346}, created = {2022-03-28T09:45:04.046Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:06:12.068Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {yuZeroshotLearningSimultaneous2019}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Yu, Hyeonwoo and Lee, Beomhee}, booktitle = {Advances in Neural Information Processing Systems} }
@article{ title = {Discriminative regularization of the latent manifold of variational auto-encoders}, type = {article}, year = {2019}, pages = {121-129}, volume = {61}, websites = {https://linkinghub.elsevier.com/retrieve/pii/S1047320319301026}, month = {5}, id = {e806081c-f1be-39df-bfd8-184ce8e32b4c}, created = {2022-03-28T09:45:04.348Z}, accessed = {2021-10-19}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:07:28.833Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {kossykDiscriminativeRegularizationLatent2019}, source_type = {article}, private_publication = {false}, abstract = {We present an approach on training classifiers or regressors using the latent embedding of variational auto-encoders (VAE), an unsupervised deep learning method, as features. Usually VAEs are trained using unlabeled data and independently from the classifier, whereas we investigate and analyze the performance of a classifier or regressor that is trained jointly with the variational deep network. We found that models trained this way can improve the embedding s.t. to increase classification performance, and also can be used for semi-supervised learning, building up the information extracting latent representation in an incremental fashion.}, bibtype = {article}, author = {Kossyk, Ingo and Márton, Zoltán-Csaba}, doi = {10.1016/j.jvcir.2019.03.008}, journal = {Journal of Visual Communication and Image Representation} }
@inproceedings{ title = {Learning Time-Series Data of Industrial Design Optimization using Recurrent Neural Networks}, type = {inproceedings}, year = {2019}, keywords = {Data models,Optimization,Recurrent neural networks,Shape,Solid modeling,Task analysis,Three-dimensional displays,computer aided engineering,deep learning,optimization}, pages = {785-792}, month = {11}, id = {9012e534-7586-388b-a98a-017898d735b0}, created = {2022-03-28T09:45:04.411Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:07:09.472Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {sahaLearningTimeSeriesData2019}, source_type = {inproceedings}, notes = {ISSN: 2375-9259}, private_publication = {false}, abstract = {In automotive digital development, 3D shape morphing techniques are used to create new designs in order to match design targets, such as aerodynamic or stylistic requirements. Control-point based shape morphing alters existing geometries either through human user interactions or through computational optimization algorithms that optimize for product performance targets. Shape morphing is typically continuous and results in potentially large data sets of time-series recordings of control point movements. In the present paper, we utilize recurrent neural networks to model such time-series recordings in order to predict future design steps based on the history of currently performed design modifications. To build a data set sufficiently large for the training of neural networks, we use target shape matching optimization as digital analogy for a human user interactive shape modification and to build data sets of control point movements in an automated fashion. Experiments show the potential of recurrent neural networks to successfully learn time-series data representing design changes and to perform single-and multi-step prediction of potential next design steps. We thus demonstrate the feasibility of recurrent neural networks for learning successful design sequences in order to predict promising next design steps in future design tasks.}, bibtype = {inproceedings}, author = {Saha, Sneha and Rios, Thiago and Menzel, Stefan and Sendhoff, Bernhard and Bäck, Thomas and Yao, Xin and Xu, Zhao and Wollstadt, Patricia}, doi = {10.1109/ICDMW.2019.00116}, booktitle = {2019 International Conference on Data Mining Workshops (ICDMW)} }
@inbook{ type = {inbook}, year = {2019}, pages = {581-592}, month = {1}, id = {42c40b65-281e-3fb3-9fcb-203dcbddb448}, created = {2022-03-28T09:45:04.483Z}, file_attached = {false}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:04.483Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {friedrichPotentialChallengesNeural2019}, source_type = {incollection}, private_publication = {false}, abstract = {In the field of two-dimensional image and video processing, convolutional neural networks have been successfully applied to generate novel images by composing content and style of two different sources, a process called artistic or neural style transfer. However a usage of these methods for three-dimensional objects is not straightforward due to the unstructured mesh representations of typical shape data. Hence efficient geometry representations are required to use neural network based style transfer concepts for three-dimensional shapes and to enable the fast creation of style options for instance in a product ideation process. In this paper an overview of current state-of-the-art shape representations is presented with respect to their applicability of neural style transfer on three-dimensional shape data. Combinations of three-dimensional geometric representations with deep neural network architectures are evaluated towards their capability to store and reproduce content and style information based on previously proposed reconstruction tests.}, bibtype = {inbook}, author = {Friedrich, Timo and Aulig, Nikola and Menzel, Stefan}, doi = {10.1007/978-3-319-97773-7_52}, chapter = {On the Potential and Challenges of Neural Style Transfer for Three-Dimensional Shape Data} }
@inproceedings{ title = {Deep Set Prediction Networks}, type = {inproceedings}, year = {2019}, volume = {32}, websites = {https://proceedings.neurips.cc/paper/2019/hash/6e79ed05baec2754e25b4eac73a332d2-Abstract.html}, publisher = {Curran Associates, Inc.}, id = {0f8ffd11-6931-3abe-9a29-696cb19d3b20}, created = {2022-03-28T09:45:04.561Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:07:35.437Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {zhangDeepSetPrediction2019}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Zhang, Yan and Hare, Jonathon and Prugel-Bennett, Adam}, booktitle = {Advances in Neural Information Processing Systems} }
@inproceedings{ title = {PartNet: A Large-Scale Benchmark for Fine-Grained and Hierarchical Part-Level 3D Object Understanding}, type = {inproceedings}, year = {2019}, pages = {909-918}, websites = {https://openaccess.thecvf.com/content_CVPR_2019/html/Mo_PartNet_A_Large-Scale_Benchmark_for_Fine-Grained_and_Hierarchical_Part-Level_3D_CVPR_2019_paper.html}, id = {1ca2ffb5-692f-3eab-b574-206582b3e3bc}, created = {2022-03-28T09:45:04.561Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:07:50.568Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {moPartNetLargeScaleBenchmark2019}, source_type = {inproceedings}, short_title = {PartNet}, private_publication = {false}, bibtype = {inproceedings}, author = {Mo, Kaichun and Zhu, Shilin and Chang, Angel X and Yi, Li and Tripathi, Subarna and Guibas, Leonidas J and Su, Hao} }
@inproceedings{ title = {Superquadrics Revisited: Learning 3D Shape Parsing Beyond Cuboids}, type = {inproceedings}, year = {2019}, pages = {10344-10353}, websites = {https://openaccess.thecvf.com/content_CVPR_2019/html/Paschalidou_Superquadrics_Revisited_Learning_3D_Shape_Parsing_Beyond_Cuboids_CVPR_2019_paper.html}, id = {5639e314-748c-38d8-8012-286ab154fc31}, created = {2022-03-28T09:45:04.825Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:08:04.848Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {paschalidouSuperquadricsRevisitedLearning2019}, source_type = {inproceedings}, short_title = {Superquadrics Revisited}, private_publication = {false}, bibtype = {inproceedings}, author = {Paschalidou, Despoina and Ulusoy, Ali Osman and Geiger, Andreas} }
@inproceedings{ title = {Learning Trajectory Dependencies for Human Motion Prediction}, type = {inproceedings}, year = {2019}, pages = {9489-9497}, websites = {https://openaccess.thecvf.com/content_ICCV_2019/html/Mao_Learning_Trajectory_Dependencies_for_Human_Motion_Prediction_ICCV_2019_paper.html}, id = {149845fd-854f-3a96-acba-fa71ee68bfed}, created = {2022-03-28T09:45:04.861Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:07:57.093Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {maoLearningTrajectoryDependencies2019}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Mao, Wei and Liu, Miaomiao and Salzmann, Mathieu and Li, Hongdong} }
@article{ title = {Tracking Error Learning Control for Precise Mobile Robot Path Tracking in Outdoor Environment}, type = {article}, year = {2019}, pages = {975-986}, volume = {95}, websites = {https://doi.org/10.1007/s10846-018-0916-3}, month = {9}, id = {6c26cbe2-176e-31e3-8f5e-638820e2d7c2}, created = {2022-03-28T09:45:05.128Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:22:01.229Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {kayacanTrackingErrorLearning2019}, source_type = {article}, private_publication = {false}, abstract = {This paper presents a Tracking-Error Learning Control (TELC) algorithm for precise mobile robot path tracking in off-road terrain. In traditional tracking error-based control approaches, feedback and feedforward controllers are designed based on the nominal model which cannot capture the uncertainties, disturbances and changing working conditions so that they cannot ensure precise path tracking performance in the outdoor environment. In TELC algorithm, the feedforward control actions are updated by using the tracking error dynamics and the plant-model mismatch problem is thus discarded. Therefore, the feedforward controller gradually eliminates the feedback controller from the control of the system once the mobile robot has been on-track. In addition to the proof of the stability, it is proven that the cost functions do not have local minima so that the coefficients in TELC algorithm guarantee that the global minimum is reached. The experimental results show that the TELC algorithm results in better path tracking performance than the traditional tracking error-based control method. The mobile robot controlled by TELC algorithm can track a target path precisely with less than 10 cm error in off-road terrain.}, bibtype = {article}, author = {Kayacan, Erkan and Chowdhary, Girish}, doi = {10.1007/s10846-018-0916-3}, journal = {Journal of Intelligent \& Robotic Systems}, number = {3} }
@inproceedings{ title = {Multi-Angle Point Cloud-VAE: Unsupervised Feature Learning for 3D Point Clouds From Multiple Angles by Joint Self-Reconstruction and Half-to-Half Prediction}, type = {inproceedings}, year = {2019}, keywords = {Computer vision,Conferences}, pages = {10441-10450}, month = {10}, id = {14b4e810-cb27-3624-97be-5b3561110099}, created = {2022-03-28T09:45:05.284Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:21:58.256Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {hanMultiAnglePointCloudVAE2019}, source_type = {inproceedings}, short_title = {Multi-Angle Point Cloud-VAE}, notes = {ISSN: 2380-7504}, private_publication = {false}, abstract = {Unsupervised feature learning for point clouds has been vital for large-scale point cloud understanding. Recent deep learning based methods depend on learning global geometry from self-reconstruction. However, these methods are still suffering from ineffective learning of local geometry, which significantly limits the discriminability of learned features. To resolve this issue, we propose MAP-VAE to enable the learning of global and local geometry by jointly leveraging global and local self-supervision. To enable effective local self-supervision, we introduce multi-angle analysis for point clouds. In a multi-angle scenario, we first split a point cloud into a front half and a back half from each angle, and then, train MAP-VAE to learn to predict a back half sequence from the corresponding front half sequence. MAP-VAE performs this half-to-half prediction using RNN to simultaneously learn each local geometry and the spatial relationship among them. In addition, MAP-VAE also learns global geometry via self-reconstruction, where we employ a variational constraint to facilitate novel shape generation. The outperforming results in four shape analysis tasks show that MAP-VAE can learn more discriminative global or local features than the state-of-the-art methods.}, bibtype = {inproceedings}, author = {Han, Zhizhong and Wang, Xiyang and Liu, Yu-Shen and Zwicker, Matthias}, doi = {10.1109/ICCV.2019.01054}, booktitle = {2019 IEEE/CVF International Conference on Computer Vision (ICCV)} }
@article{ title = {Deep Multimodal Representation Learning: A Survey}, type = {article}, year = {2019}, keywords = {Data mining,Decoding,Deep learning,Feature extraction,Multimodal representation learning,Semantics,Speech recognition,Task analysis,deep multimodal fusion,multimodal adversarial learning,multimodal deep learning,multimodal translation}, pages = {63373-63394}, volume = {7}, id = {1c241040-4745-3274-bd30-69d97839ad51}, created = {2022-03-28T09:45:05.363Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:22:22.222Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {guoDeepMultimodalRepresentation2019}, source_type = {article}, short_title = {Deep Multimodal Representation Learning}, notes = {Conference Name: IEEE Access}, private_publication = {false}, abstract = {Multimodal representation learning, which aims to narrow the heterogeneity gap among different modalities, plays an indispensable role in the utilization of ubiquitous multimodal data. Due to the powerful representation ability with multiple levels of abstraction, deep learning-based multimodal representation learning has attracted much attention in recent years. In this paper, we provided a comprehensive survey on deep multimodal representation learning which has never been concentrated entirely. To facilitate the discussion on how the heterogeneity gap is narrowed, according to the underlying structures in which different modalities are integrated, we category deep multimodal representation learning methods into three frameworks: joint representation, coordinated representation, and encoder-decoder. Additionally, we review some typical models in this area ranging from conventional models to newly developed technologies. This paper highlights on the key issues of newly developed technologies, such as encoder-decoder model, generative adversarial networks, and attention mechanism in a multimodal representation learning perspective, which, to the best of our knowledge, have never been reviewed previously, even though they have become the major focuses of much contemporary research. For each framework or model, we discuss its basic structure, learning objective, application scenes, key issues, advantages, and disadvantages, such that both novel and experienced researchers can benefit from this survey. Finally, we suggest some important directions for future work.}, bibtype = {article}, author = {Guo, Wenzhong and Wang, Jianwen and Wang, Shiping}, doi = {10.1109/ACCESS.2019.2916887}, journal = {IEEE Access} }
@inproceedings{ title = {3D MRI Brain Tumor Segmentation Using Autoencoder Regularization}, type = {inproceedings}, year = {2019}, pages = {311-320}, publisher = {Springer International Publishing}, city = {Cham}, series = {Lecture Notes in Computer Science}, id = {647f590e-f935-3cc8-8721-ad37b1191b22}, created = {2022-03-28T09:45:05.757Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:22:53.356Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {myronenko3DMRIBrain2019}, source_type = {inproceedings}, private_publication = {false}, abstract = {Automated segmentation of brain tumors from 3D magnetic resonance images (MRIs) is necessary for the diagnosis, monitoring, and treatment planning of the disease. Manual delineation practices require anatomical knowledge, are expensive, time consuming and can be inaccurate due to human error. Here, we describe a semantic segmentation network for tumor subregion segmentation from 3D MRIs based on encoder-decoder architecture. Due to a limited training dataset size, a variational auto-encoder branch is added to reconstruct the input image itself in order to regularize the shared decoder and impose additional constraints on its layers. The current approach won 1st place in the BraTS 2018 challenge.}, bibtype = {inproceedings}, author = {Myronenko, Andriy}, editor = {Crimi, Alessandro and Bakas, Spyridon and Kuijf, Hugo and Keyvan, Farahani and Reyes, Mauricio and van Walsum, Theo}, doi = {10.1007/978-3-030-11726-9_28}, booktitle = {Brainlesion: Glioma, Multiple Sclerosis, Stroke and Traumatic Brain Injuries} }
@inproceedings{ title = {PointFlow: 3D Point Cloud Generation With Continuous Normalizing Flows}, type = {inproceedings}, year = {2019}, pages = {4541-4550}, websites = {https://openaccess.thecvf.com/content_ICCV_2019/html/Yang_PointFlow_3D_Point_Cloud_Generation_With_Continuous_Normalizing_Flows_ICCV_2019_paper.html}, id = {e7635ce9-ef68-3a5a-8ea9-671f600f518b}, created = {2022-03-28T09:45:05.982Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-13T08:14:28.328Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {yangPointFlow3DPoint2019}, source_type = {inproceedings}, short_title = {PointFlow}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,ad97b399-756a-4674-becd-d6455b989a39}, private_publication = {false}, bibtype = {inproceedings}, author = {Yang, Guandao and Huang, Xun and Hao, Zekun and Liu, Ming-Yu and Belongie, Serge and Hariharan, Bharath} }
@inproceedings{ title = {PartNet: A Recursive Part Decomposition Network for Fine-Grained and Hierarchical Shape Segmentation}, type = {inproceedings}, year = {2019}, pages = {9491-9500}, websites = {https://openaccess.thecvf.com/content_CVPR_2019/html/Yu_PartNet_A_Recursive_Part_Decomposition_Network_for_Fine-Grained_and_Hierarchical_CVPR_2019_paper.html}, id = {cd21d890-2271-328a-ac10-98fd67f66082}, created = {2022-03-28T09:45:06.532Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T07:59:25.192Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {yuPartNetRecursivePart2019}, source_type = {inproceedings}, short_title = {PartNet}, private_publication = {false}, bibtype = {inproceedings}, author = {Yu, Fenggen and Liu, Kun and Zhang, Yan and Zhu, Chenyang and Xu, Kai} }
@inproceedings{ title = {On the Efficiency of a Point Cloud Autoencoder as a Geometric Representation for Shape Optimization}, type = {inproceedings}, year = {2019}, keywords = {Computational modeling,Geometry,Optimization,Shape,Strain,Task analysis,Three-dimensional displays,evolutionary design optimization,free form deformation,geometric representation,point cloud autoencoder}, pages = {791-798}, month = {12}, id = {b593b3e1-5fa2-3fba-9b74-549503036aa1}, created = {2022-03-28T09:45:06.621Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T07:59:29.650Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {riosEfficiencyPointCloud2019}, source_type = {inproceedings}, private_publication = {false}, abstract = {A crucial step for optimizing a system is to formulate the objective function, and part of it concerns the selection of the design parameters. One of the major goals is to achieve a fair trade-off between exploring feasible solutions in the design space and maintaining admissible computational effort. In order to achieve such balance in optimization problems with Computer Aided Engineering (CAE) models, the conventional constructive geometric representations are substituted by deformation methods, e.g. free form deformation, where the position of a few control points might be capable of handling large scale shape modifications. In light of the recent developments in the field of geometric deep learning, autoencoders have risen as a promising alternative for efficiently condensing high-dimensional models into compact representations. In this paper, we present a novel perspective on geometric deep learning models by exploring the applicability of the latent space of a point cloud autoencoder in shape optimization problems with evolutionary algorithms. Focusing on engineering applications, a target shape matching optimization is used as a surrogate to the computationally expensive CAE simulations required in engineering optimizations. Through the quality assessment of the solutions achieved in the optimization and further aspects, such as shape feasibility, point cloud autoencoders showed to be consistent and suitable geometric representations for such problems, adding a new perspective on the approaches for handling high-dimensional models to optimization tasks.}, bibtype = {inproceedings}, author = {Rios, Thiago and Sendhoff, Bernhard and Menzel, Stefan and Bäck, Thomas and van Stein, Bas}, doi = {10.1109/SSCI44817.2019.9003161}, booktitle = {2019 IEEE Symposium Series on Computational Intelligence (SSCI)} }
@article{ title = {StructureNet: Hierarchical Graph Networks for 3D Shape Generation}, type = {article}, year = {2019}, keywords = {Computer Science - Computational Geometry,Computer Science - Computer Vision and Pattern Re,Computer Science - Graphics}, websites = {http://arxiv.org/abs/1908.00575}, month = {8}, id = {f8f28247-b538-3bd4-9037-d02987770c62}, created = {2022-03-28T09:45:06.665Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:00:50.168Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {moStructureNetHierarchicalGraph2019}, source_type = {article}, short_title = {StructureNet}, notes = {arXiv: 1908.00575}, private_publication = {false}, abstract = {The ability to generate novel, diverse, and realistic 3D shapes along with associated part semantics and structure is central to many applications requiring high-quality 3D assets or large volumes of realistic training data. A key challenge towards this goal is how to accommodate diverse shape variations, including both continuous deformations of parts as well as structural or discrete alterations which add to, remove from, or modify the shape constituents and compositional structure. Such object structure can typically be organized into a hierarchy of constituent object parts and relationships, represented as a hierarchy of n-ary graphs. We introduce StructureNet, a hierarchical graph network which (i) can directly encode shapes represented as such n-ary graphs; (ii) can be robustly trained on large and complex shape families; and (iii) can be used to generate a great diversity of realistic structured shape geometries. Technically, we accomplish this by drawing inspiration from recent advances in graph neural networks to propose an order-invariant encoding of n-ary graphs, considering jointly both part geometry and inter-part relations during network training. We extensively evaluate the quality of the learned latent spaces for various shape families and show significant advantages over baseline and competing methods. The learned latent spaces enable several structure-aware geometry processing applications, including shape generation and interpolation, shape editing, or shape structure discovery directly from un-annotated images, point clouds, or partial scans.}, bibtype = {article}, author = {Mo, Kaichun and Guerrero, Paul and Yi, Li and Su, Hao and Wonka, Peter and Mitra, Niloy and Guibas, Leonidas J}, journal = {arXiv:1908.00575 [cs]} }
@inproceedings{ title = {Scalability of Learning Tasks on 3D CAE Models Using Point Cloud Autoencoders}, type = {inproceedings}, year = {2019}, keywords = {Computational modeling,Computer architecture,Machine learning,Shape,Task analysis,Three-dimensional displays,Training,computer aided engineering,deep learning,dimensionality reduction,geometry,sampling methods}, pages = {1367-1374}, month = {12}, id = {a1bf24ab-cddb-37d9-8172-79745f1b83ea}, created = {2022-03-28T09:45:06.802Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:01:06.735Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {riosScalabilityLearningTasks2019}, source_type = {inproceedings}, private_publication = {false}, abstract = {Geometric Deep Learning (GDL) methods have recently gained interest as powerful, high-dimensional models for approaching various geometry processing tasks. However, training deep neural network models on geometric input requires considerable computational effort. Even more so, if one considers typical problem sizes found in application domains such as engineering tasks, where geometric data are often orders of magnitude larger than the inputs currently considered in GDL literature. Hence, an assessment of the scalability of the training task is necessary, where model and data set parameters can be mapped to the computational demand during training. The present paper therefore studies the effects of data set size and the number of free model parameters on the computational effort of training a Point Cloud Autoencoder (PC-AE). We further review pre-processing techniques to obtain efficient representations of high-dimensional inputs to the PC-AE and investigate the effects of these techniques on the information abstracted by the trained model. We perform these experiments on synthetic geometric data inspired by engineering applications using computing hardware with particularly recent graphics processing units (GPUs) with high memory specifications. The present study thus provides a comprehensive evaluation of how to scale geometric deep learning architectures to high-dimensional inputs to allow for an application of state-of-the-art deep learning methods in real-world tasks.}, bibtype = {inproceedings}, author = {Rios, Thiago and Wollstadt, Patricia and Stein, Bas van and Bäck, Thomas and Xu, Zhao and Sendhoff, Bernhard and Menzel, Stefan}, doi = {10.1109/SSCI44817.2019.9002982}, booktitle = {2019 IEEE Symposium Series on Computational Intelligence (SSCI)} }
@article{ title = {A comprehensive survey on impulse and Gaussian denoising filters for digital images}, type = {article}, year = {2019}, keywords = {Denoising filters,Gaussian noise,Image processing,Impulse noise}, pages = {236-260}, volume = {157}, websites = {https://doi.org/10.1016/j.sigpro.2018.12.006}, publisher = {Elsevier B.V.}, id = {602a9379-9493-3196-a227-234dbf93f919}, created = {2022-04-05T05:35:07.966Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-07T06:10:54.802Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d44c1c58-0149-4360-9eaf-8e2a9b657b50}, private_publication = {false}, abstract = {This review article provides a comprehensive survey on state-of-the-art impulse and Gaussian denoising filters applied to images and summarizes the progress that has been made over the years in all applications involving image processing. The random noise model in this survey is assumed to be comprised of impulse (salt and pepper) and Gaussian noise. Different noise models are addressed, and different types of denoising filters are studied in terms of their performance on digital images and in their various practical implications and domains of application. A comprehensive comparison is performed to cover all the denoising methods in details and the results they yield. With this extensive review, researchers in image processing will be able to ascertain which of these denoising methods will be best applicable to their research needs and the application domain where such methods are contemplated for implementation.}, bibtype = {article}, author = {Mafi, Mehdi and Martin, Harold and Cabrerizo, Mercedes and Andrian, Jean and Barreto, Armando and Adjouadi, Malek}, doi = {10.1016/j.sigpro.2018.12.006}, journal = {Signal Processing} }
@article{ title = {Inductive t-SNE via deep learning to visualize multi-label images}, type = {article}, year = {2019}, keywords = {Dimensionality reduction,Multi-label images,Partial relevance}, pages = {336-345}, volume = {81}, websites = {https://doi.org/10.1016/j.engappai.2019.01.015}, publisher = {Elsevier Ltd}, id = {2f228cc7-33b5-37ef-a658-bb66df3e6438}, created = {2022-04-05T05:35:08.048Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-05T05:35:35.092Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {This work presents a methodology for dimensionality reduction of images with multiple occurrences of multiple objects, such that they can be placed on a 2-dimensional plane under the constraint that nearby images are similar in terms of visual content and semantics. The first part of this methodology adds inductive capabilities to the well-known t-SNE method used for visualization, thus making possible its generalization for unseen data, as opposed to previous extensions with only transductive capabilities. This is achieved by pairing the base t-SNE with a Deep Neural Network. The second part exploits semantic information to perform supervised dimensionality reduction, which results in better separability of the low-dimensional space, that is, it better separates images with no relevance, while retaining the proximity of those images with partial relevance. Since dealing with images having multiple occurrences of multiple objects requires the consideration of partial relevance, we additionally present a definition of partial relevance for the evaluation of classification and retrieval scenarios on images, or other documents, that share contents, at least partially.}, bibtype = {article}, author = {Roman-Rangel, Edgar and Marchand-Maillet, Stephane}, doi = {10.1016/j.engappai.2019.01.015}, journal = {Engineering Applications of Artificial Intelligence}, number = {March} }
@article{ title = {Process monitoring using variational autoencoder for high-dimensional nonlinear processes}, type = {article}, year = {2019}, keywords = {High-dimensional process,Multivariate control chart,Nonlinear process,Statistical process monitoring,Variational autoencoder}, pages = {13-27}, volume = {83}, websites = {https://doi.org/10.1016/j.engappai.2019.04.013}, publisher = {Elsevier Ltd}, id = {4d5dbb66-19e2-385a-b231-14741e2efe35}, created = {2022-04-05T05:35:08.233Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-05T05:35:25.215Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {In many industries, statistical process monitoring techniques play a key role in improving processes through variation reduction and defect prevention. Modern large-scale industrial processes require appropriate monitoring techniques that can efficiently address high-dimensional nonlinear processes. Such processes have been successfully monitored with several latent variable-based methods. However, because these monitoring methods use Hotelling's T2 statistics in the reduced space, a normality assumption underlies the construction of these tools. This assumption has limited the use of latent variable-based monitoring charts in both nonlinear and nonnormal situations. In this study, we propose a variational autoencoder (VAE) as a monitoring method that can address both nonlinear and nonnormal situations in high-dimensional processes. VAE is appropriate for T2 charts because it causes the reduced space to follow a multivariate normal distribution. The effectiveness and applicability of the proposed VAE-based chart were demonstrated through experiments on simulated data and real data from a thin-film-transistor liquid-crystal display process.}, bibtype = {article}, author = {Lee, Seulki and Kwak, Mingu and Tsui, Kwok Leung and Kim, Seoung Bum}, doi = {10.1016/j.engappai.2019.04.013}, journal = {Engineering Applications of Artificial Intelligence}, number = {May} }
@article{ title = {A review of state-of-the-art techniques for abnormal human activity recognition}, type = {article}, year = {2019}, keywords = {Ambient Assistive Living,Crowd anomaly,Skeleton based fall detection,Three-dimensional anomaly detection,Two-dimensional anomaly detection}, pages = {21-45}, volume = {77}, websites = {https://doi.org/10.1016/j.engappai.2018.08.014}, publisher = {Elsevier Ltd}, id = {defe1b86-dd3b-326b-8543-4f0204e49205}, created = {2022-04-07T06:10:54.325Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-07T06:11:07.048Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d44c1c58-0149-4360-9eaf-8e2a9b657b50}, private_publication = {false}, abstract = {The concept of intelligent visual identification of abnormal human activity has raised the standards of surveillance systems, situation cognizance, homeland safety and smart environments. However, abnormal human activity is highly diverse in itself due to the aspects such as (a) the fundamental definition of anomaly (b) feature representation of an anomaly, (c) its application, and henceforth (d) the dataset. This paper aims to summarize various existing abnormal human activity recognition (AbHAR) handcrafted and deep approaches with the variation of the type of information available such as two-dimensional or three-dimensional data. Features play a vital role in an excellent performance of an AbHAR system. The proposed literature provides feature designs of abnormal human activity recognition in a video with respect to the context or application such as fall detection, Ambient Assistive Living (AAL), homeland security, surveillance or crowd analysis using RGB, depth and skeletal evidence. The key contributions and limitations of every feature design technique, under each category: 2D and 3D AbHAR, in respective contexts are tabulated that will provide insight of various abnormal action detection approaches. Finally, the paper outlines newly added datasets for AbHAR by the researchers with added complexities for method validations.}, bibtype = {article}, author = {Dhiman, Chhavi and Vishwakarma, Dinesh Kumar}, doi = {10.1016/j.engappai.2018.08.014}, journal = {Engineering Applications of Artificial Intelligence}, number = {July 2017} }
@article{ title = {Industry 4.0: A bibliometric analysis and detailed overview}, type = {article}, year = {2019}, keywords = {Bibliometric study,Cyber–physical systems (CPS),Industry 4.0,Internet of Things (IoT),SCOPUS,Survey,Web of science}, pages = {218-235}, volume = {78}, websites = {https://doi.org/10.1016/j.engappai.2018.11.007}, publisher = {Elsevier Ltd}, id = {3a9555c6-fa03-33b2-b619-13c4abd96fa5}, created = {2022-04-07T06:10:54.371Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-07T06:11:02.403Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d44c1c58-0149-4360-9eaf-8e2a9b657b50}, private_publication = {false}, abstract = {With the arrival of Industry 4.0, the overall transformation using digital integration and intelligent engineering has taken a giant leap towards futuristic technology. All devices today are equipped with machine learning, automation has become a priority and thus another industrial revolution is in the making. In this state-of-the-art paper, we have performed bibliometric analysis and an extensive survey on recent developments in the field of “Industry 4.0”. In bibliometric analysis, different performance metrics are extracted, such as: total papers, total citations, and citation per paper. Further, top 10 of the most productive and highly cited authors, major subject areas, sources or journals, countries, and institutions are evaluated. A list of highly influential papers is also assessed. Later on, a detailed discussion of the most cited papers is analysed and a sectional classification is provided. This paper summarizes the growth structure of Industry 4.0 during the last 5 years and provides the concise background overview of Industry 4.0 related works and various application areas.}, bibtype = {article}, author = {Muhuri, Pranab K. and Shukla, Amit K. and Abraham, Ajith}, doi = {10.1016/j.engappai.2018.11.007}, journal = {Engineering Applications of Artificial Intelligence}, number = {September 2018} }
@article{ title = {SRINet: Learning strictly rotation-invariant representations for point cloud classification and segmentation}, type = {article}, year = {2019}, keywords = {3D shape analysis,Point cloud,Rotation invariance}, pages = {980-988}, id = {354df1de-f439-3e14-8e53-e1c5aa7c8b7a}, created = {2022-07-28T12:39:24.534Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-07-28T12:39:32.880Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {353ce2e2-5e70-48e5-951f-78dc31fa40d2}, private_publication = {false}, abstract = {Point cloud analysis has drawn broad attention due to its increasing demands in various fields. Although impressive performance has been achieved on several databases, researchers neglect the fact that the orientation of those point cloud data is aligned. Varying the orientation of a point cloud may lead to degradation of performance, restricting the capacity to generalize to real applications where the prior of orientation is often unknown. In this paper, we propose the point projection feature, which is invariant to the rotation of the input point cloud. A novel architecture is designed to mine features at different levels. We adopt a PointNet-based backbone to extract a global feature for the point cloud, and a graph aggregation operation to perceive local shape structure. Besides, we introduce an efficient key point descriptor to assign each point a different response and help recognize the overall geometry. Mathematical analyses and experimental results demonstrate that the proposed method can extract strictly rotation-invariant representations for point cloud recognition and segmentation without data augmentation, and outperforms other state-of-the-art methods.}, bibtype = {article}, author = {Sun, Xiao and Lian, Zhouhui and Xiao, Jianguo}, doi = {10.1145/3343031.3351042}, journal = {MM 2019 - Proceedings of the 27th ACM International Conference on Multimedia} }
@article{ title = {Rotation Invariant Convolutions for 3D Point Clouds Deep Learning}, type = {article}, year = {2019}, keywords = {3D Point Clouds,Convolution,Deep Learning,Rotation Invariant}, pages = {204-213}, publisher = {IEEE}, id = {5be30ca3-3d12-3d1c-bcda-1c7b7b041a62}, created = {2022-07-28T12:39:24.661Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-07-28T12:39:38.615Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {353ce2e2-5e70-48e5-951f-78dc31fa40d2}, private_publication = {false}, abstract = {Recent progress in 3D deep learning has shown that it is possible to design special convolution operators to consume point cloud data. However, a typical drawback is that rotation invariance is often not guaranteed, resulting in networks that generalize poorly to arbitrary rotations. In this paper, we introduce a novel convolution operator for point clouds that achieves rotation invariance. Our core idea is to use low-level rotation invariant geometric features such as distances and angles to design a convolution operator for point cloud learning. The well-known point ordering problem is also addressed by a binning approach seamlessly built into the convolution. This convolution operator then serves as the basic building block of a neural network that is robust to point clouds under 6-DoF transformations such as translation and rotation. Our experiments show that our method performs with high accuracy in common scene understanding tasks such as object classification and segmentation. Compared to previous and concurrent works, most importantly, our method is able to generalize and achieve consistent results across different scenarios in which training and testing can contain arbitrary rotations. Our implementation is publicly available at our project page.}, bibtype = {article}, author = {Zhang, Zhiyuan and Hua, Binh Son and Rosen, David W. and Yeung, Sai Kit}, doi = {10.1109/3DV.2019.00031}, journal = {Proceedings - 2019 International Conference on 3D Vision, 3DV 2019} }
@article{ title = {Generating 3D Adversarial Point Clouds}, type = {article}, year = {2019}, pages = {9136-9144}, id = {31c19296-71d1-3807-8b18-cd7eeb8e2e52}, created = {2022-09-01T14:14:15.401Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T11:25:07.883Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {cc70381c-f813-438b-bf40-50646c28fe23,db36ed60-3b58-424a-b9a4-a9c7322975f3}, private_publication = {false}, bibtype = {article}, author = {Qi, Charles R and Li, Bo} }
@article{ title = {Adversarial Attack and Defense on Point Sets}, type = {article}, year = {2019}, pages = {1-17}, id = {bdd0273a-aff3-348a-9759-a6feaab1a8f3}, created = {2022-09-01T14:14:15.545Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T11:25:08.056Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {cc70381c-f813-438b-bf40-50646c28fe23,db36ed60-3b58-424a-b9a4-a9c7322975f3}, private_publication = {false}, bibtype = {article}, author = {Yang, Jiancheng and Zhang, Qiang and Fang, Rongyao and Ni, Bingbing and Liu, Jinxian and Tian, Qi} }
@article{ title = {3D Point Capsule Networks (Supplementary Material)}, type = {article}, year = {2019}, pages = {2-6}, id = {403ca4b3-533c-397b-a10b-4f6e3f78b316}, created = {2022-09-08T17:25:32.343Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-13T08:14:28.313Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,ad97b399-756a-4674-becd-d6455b989a39}, private_publication = {false}, abstract = {In this paper, we propose 3D point-capsule networks, an auto-encoder designed to process sparse 3D point clouds while preserving spatial arrangements of the input data. 3D capsule networks arise as a direct consequence of our unified formulation of the common 3D auto-encoders. The dynamic routing scheme and the peculiar 2D latent space deployed by our capsule networks bring in improvements for several common point cloud-related tasks, such as object classification, object reconstruction and part segmentation as substantiated by our extensive evaluations. Moreover, it enables new applications such as part interpolation and replacement.}, bibtype = {article}, author = {Zhao, Yongheng and Deng, Haowen and Tombari, Federico}, journal = {CVPR} }
@article{ title = {PU-GAN: A point cloud upsampling adversarial network}, type = {article}, year = {2019}, pages = {7202-7211}, volume = {2019-Octob}, id = {923e6c65-4075-3a35-886a-1a44fc1e155c}, created = {2022-09-08T17:25:32.351Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-13T08:14:28.313Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,ad97b399-756a-4674-becd-d6455b989a39}, private_publication = {false}, abstract = {Point clouds acquired from range scans are often sparse, noisy, and non-uniform. This paper presents a new point cloud upsampling network called PU-GAN, which is formulated based on a generative adversarial network (GAN), to learn a rich variety of point distributions from the latent space and upsample points over patches on object surfaces. To realize a working GAN network, we construct an up-down-up expansion unit in the generator for upsampling point features with error feedback and self-correction, and formulate a self-attention unit to enhance the feature integration. Further, we design a compound loss with adversarial, uniform and reconstruction terms, to encourage the discriminator to learn more latent patterns and enhance the output point distribution uniformity. Qualitative and quantitative evaluations demonstrate the quality of our results over the state-of-the-arts in terms of distribution uniformity, proximity-to-surface, and 3D reconstruction quality.}, bibtype = {article}, author = {Li, Ruihui and Li, Xianzhi and Fu, Chi Wing and Cohen-Or, Daniel and Heng, Pheng Ann}, doi = {10.1109/ICCV.2019.00730}, journal = {Proceedings of the IEEE International Conference on Computer Vision}, number = {c} }
@article{ title = {TopNet: Structural point cloud decoder}, type = {article}, year = {2019}, keywords = {3D from Multiview and Sensors,3D from Single Image,Deep Learning,Physics-based Vision and Shape-from-X,RGBD sen}, pages = {383-392}, volume = {2019-June}, id = {4846720a-9cc1-3136-a7da-880480866871}, created = {2022-09-08T17:25:32.352Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-21T09:29:25.595Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,ad97b399-756a-4674-becd-d6455b989a39}, private_publication = {false}, abstract = {3D point cloud generation is of great use for 3D scene modeling and understanding. Real-world 3D object point clouds can be properly described by a collection of low-level and high-level structures such as surfaces, geometric primitives, semantic parts, etc. In fact, there exist many different representations of a 3D object point cloud as a set of point groups. Existing frameworks for point cloud generation either do not consider structure in their proposed solutions, or assume and enforce a specific structure/topology, e.g. a collection of manifolds or surfaces, for the generated point cloud of a 3D object. In this work, we propose a novel decoder that generates a structured point cloud without assuming any specific structure or topology on the underlying point set. Our decoder is softly constrained to generate a point cloud following a hierarchical rooted tree structure. We show that given enough capacity and allowing for redundancies, the proposed decoder is very flexible and able to learn any arbitrary grouping of points including any topology on the point set. We evaluate our decoder on the task of point cloud generation for 3D point cloud shape completion. Combined with encoders from existing frameworks, we show that our proposed decoder significantly outperforms state-of-the-art 3D point cloud completion methods on the ShapeNet dataset.}, bibtype = {article}, author = {Tchapmi, Lyne P. and Kosaraju, Vineet and Rezatofighi, Hamid and Reid, Ian and Savarese, Silvio}, doi = {10.1109/CVPR.2019.00047}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Learning implicit fields for generative shape modeling}, type = {article}, year = {2019}, keywords = {3D from Single Image,Deep Learning,Vision + Graphics}, pages = {5932-5941}, volume = {2019-June}, id = {beff576e-3cf5-3952-bfb3-9a5a9ff8df90}, created = {2022-10-03T13:31:09.952Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-08-08T11:39:19.261Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e07bebd1-ae76-40ed-b298-edc5ed896e0b}, private_publication = {false}, abstract = {We advocate the use of implicit fields for learning generative models of shapes and introduce an implicit field decoder, called IM-NET, for shape generation, aimed at improving the visual quality of the generated shapes. An implicit field assigns a value to each point in 3D space, so that a shape can be extracted as an iso-surface. IM-NET is trained to perform this assignment by means of a binary classifier. Specifically, it takes a point coordinate, along with a feature vector encoding a shape, and outputs a value which indicates whether the point is outside the shape or not. By replacing conventional decoders by our implicit decoder for representation learning (via IM-AE) and shape generation (via IM-GAN), we demonstrate superior results for tasks such as generative shape modeling, interpolation, and single-view 3D reconstruction, particularly in terms of visual quality. Code and supplementary material are available at https://github.com/czq142857/implicit-decoder.}, bibtype = {article}, author = {Chen, Zhiqin and Zhang, Hao}, doi = {10.1109/CVPR.2019.00609}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {3D scene reconstruction with multi-layer depth and epipolar transformers}, type = {article}, year = {2019}, pages = {2172-2182}, volume = {2019-Octob}, id = {cfcc3cd9-4ab7-3be8-99a6-0cf8df324cbe}, created = {2022-10-03T13:31:09.956Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-10-03T13:31:25.291Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e07bebd1-ae76-40ed-b298-edc5ed896e0b}, private_publication = {false}, abstract = {We tackle the problem of automatically reconstructing a complete 3D model of a scene from a single RGB image. This challenging task requires inferring the shape of both visible and occluded surfaces. Our approach utilizes viewer-centered, multi-layer representation of scene geometry adapted from recent methods for single object shape completion. To improve the accuracy of view-centered representations for complex scenes, we introduce a novel 'Epipolar Feature Transformer' that transfers convolutional network features from an input view to other virtual camera viewpoints, and thus better covers the 3D scene geometry. Unlike existing approaches that first detect and localize objects in 3D, and then infer object shape using category-specific models, our approach is fully convolutional, end-to-end differentiable, and avoids the resolution and memory limitations of voxel representations. We demonstrate the advantages of multi-layer depth representations and epipolar feature transformers on the reconstruction of a large database of indoor scenes.}, bibtype = {article}, author = {Shin, Daeyun and Ren, Zhile and Sudderth, Erik and Fowlkes, Charless}, doi = {10.1109/ICCV.2019.00226}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {What do single-view 3D reconstruction networks learn?}, type = {article}, year = {2019}, keywords = {3D from Single Image,Deep Learning}, pages = {3400-3409}, volume = {2019-June}, id = {65eb83e1-cc0a-3cbd-bfa0-b71076b8a927}, created = {2022-10-03T13:31:10.063Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-10-03T13:31:26.365Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e07bebd1-ae76-40ed-b298-edc5ed896e0b}, private_publication = {false}, abstract = {Convolutional networks for single-view object reconstruction have shown impressive performance and have become a popular subject of research. All existing techniques are united by the idea of having an encoder-decoder network that performs non-trivial reasoning about the 3D structure of the output space. In this work, we set up two alternative approaches that perform image classification and retrieval respectively. These simple baselines yield better results than state-of-the-art methods, both qualitatively and quantitatively. We show that encoder-decoder methods are statistically indistinguishable from these baselines, thus indicating that the current state of the art in single-view object reconstruction does not actually perform reconstruction but image classification. We identify aspects of popular experimental procedures that elicit this behavior and discuss ways to improve the current state of research.}, bibtype = {article}, author = {Tatarchenko, Maxim and Richter, Stephan R. and Ranftl, Rene and Li, Zhuwen and Koltun, Vladlen and Brox, Thomas}, doi = {10.1109/CVPR.2019.00352}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {MVPNet: Multi-view point regression networks for 3D object reconstruction from a single image}, type = {article}, year = {2019}, pages = {8949-8956}, id = {40a59999-5e6c-3747-9915-90cd93ddd7f3}, created = {2022-10-03T13:31:10.081Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-15T08:10:14.585Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e07bebd1-ae76-40ed-b298-edc5ed896e0b}, private_publication = {false}, abstract = {In this paper, we address the problem of reconstructing an object's surface from a single image using generative networks. First, we represent a 3D surface with an aggregation of dense point clouds from multiple views. Each point cloud is embedded in a regular 2D grid aligned on an image plane of a viewpoint, making the point cloud convolution-favored and ordered so as to fit into deep network architectures. The point clouds can be easily triangulated by exploiting connectivities of the 2D grids to form mesh-based surfaces. Second, we propose an encoder-decoder network that generates such kind of multiple view-dependent point clouds from a single image by regressing their 3D coordinates and visibilities. We also introduce a novel geometric loss that is able to interpret discrepancy over 3D surfaces as opposed to 2D projective planes, resorting to the surface discretization on the constructed meshes. We demonstrate that the multi-view point regression network outperforms state-of-the-art methods with a significant improvement on challenging datasets.}, bibtype = {article}, author = {Wang, Jinglu and Sun, Bo and Lu, Yan}, doi = {10.1609/aaai.v33i01.33018949}, journal = {33rd AAAI Conference on Artificial Intelligence, AAAI 2019, 31st Innovative Applications of Artificial Intelligence Conference, IAAI 2019 and the 9th AAAI Symposium on Educational Advances in Artificial Intelligence, EAAI 2019} }
@article{ title = {Effective Rotation-Invariant Point CNN with Spherical Harmonics Kernels}, type = {article}, year = {2019}, keywords = {Segmentation,Shape analysis,Shape recognition}, pages = {47-56}, publisher = {IEEE}, id = {dc88db3d-0c12-3819-a184-ee9699616c90}, created = {2023-04-24T07:38:01.473Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-04-24T15:41:55.782Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Poulenard2019}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143,f4d8f1ef-fdcb-4a5b-a626-6e2fea47fb6d}, private_publication = {false}, abstract = {We present a novel rotation invariant architecture operating directly on point cloud data. We demonstrate how rotation invariance can be injected into a recently proposed point-based PCNN architecture, on all layers of the network. This leads to invariance to both global shape transformations, and to local rotations on the level of patches or parts, useful when dealing with non-rigid objects. We achieve this by employing a spherical harmonics-based kernel at different layers of the network, which is guaranteed to be invariant to rigid motions. We also introduce a more efficient pooling operation for PCNN using space-partitioning data-structures. This results in a flexible, simple and efficient architecture that achieves accurate results on challenging shape analysis tasks, including classification and segmentation, without requiring data-augmentation typically employed by non-invariant approaches. Code and data are provided on the project page https://github.com/adrienPoulenard/SPHnet.}, bibtype = {article}, author = {Poulenard, Adrien and Rakotosaona, Marie Julie and Ponty, Yann and Ovsjanikov, Maks}, doi = {10.1109/3DV.2019.00015}, journal = {Proceedings - 2019 International Conference on 3D Vision, 3DV 2019} }
@article{ title = {Registration of point cloud data for matching crushed sand particles}, type = {article}, year = {2019}, keywords = {Fracture surface,Mesh segmentation,Particle crushing,Particle tracking,Point cloud registration}, pages = {227-242}, volume = {347}, websites = {https://doi.org/10.1016/j.powtec.2019.03.001}, publisher = {Elsevier B.V.}, id = {76ce1385-5198-398c-8cde-41b67b9369ee}, created = {2023-05-03T13:16:40.382Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-15T08:10:14.962Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Wu2019}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Identifying the fracture surface of fragments that result from the mechanical crushing of a single sand particle, and matching individual fragments to the mother particle, remains a challenge in geomechanics research. This paper presents a novel particle tracking method that can track fractured fragments from the original sand particle through the application of the point cloud registration technique based on X-ray tomography data. Firstly, the external morphologies of the mother particle and the child particles were acquired from the X-ray tomography data from the single particle crushing test. A series of image processing techniques were applied to treat the raw images and obtain the particle fracture pattern. Secondly, after the image processing, the contour-based mesh segmentation algorithm was utilised to divide a child particle into multiple faces. Then the iterative closest point algorithm was used to try to match each face of the child particle to the mother particle and evaluate the matching results through the distance error. Finally, upon a successful match, the region with a higher point density on the child particle can be extracted and marked as a part of the original particle surface, and the rest marked as a part of the fracture surface. The effectiveness and efficiency of the tracking method were demonstrated using the tomography data of 9 crushed Leighton Buzzard sand particles.}, bibtype = {article}, author = {Wu, Mengmeng and Wang, Jianfeng}, doi = {10.1016/j.powtec.2019.03.001}, journal = {Powder Technology} }
@article{ title = {A Novel Recognition Algorithm in 3D Point Clouds based on Local Spherical Harmonics}, type = {article}, year = {2019}, keywords = {3D point clouds,Local Spherical harmonics,Peg-in-Hole,Recognition}, pages = {1041-1046}, publisher = {IEEE}, id = {61a18161-d669-34c7-8017-52b36db460ec}, created = {2023-05-03T13:16:41.047Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-15T08:10:14.972Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Hui2019}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {This paper presents a novel recognition algorithm for 3D objects in point clouds based on Local Spherical Harmonics. In the proposed algorithm, the 3D point cloud of an object is decomposed into a set of local fields which constitute an orthogonal basis of expansion coefficients by Spherical Harmonic Expansion. The similarity between any corresponding local fields from two objects is expressed by a Euclidean distance between their expansion coefficients. The proposed algorithm aims to provide a method to solve the problem of incomplete point cloud recognition. Our algorithm outperforms the existing approaches including Iterative Closest Point (ICP) and Discriminant Shape Primitives (DSP) with a recognition rate of 95.1% on the extension of the Princeton Shape Benchmark and it has achieved a recognition rate of 92.9% on the extension of the UWA Data-set.}, bibtype = {article}, author = {Hui, Cao and Wang, Riwei and Wen, Xianbin and Zhao, Jindong and Chen, Wei and Zhang, Xuping}, doi = {10.1109/ICMA.2019.8816499}, journal = {Proceedings of 2019 IEEE International Conference on Mechatronics and Automation, ICMA 2019} }
@inproceedings{ title = {Why Compete When You Can Work Together: FPGA-ASIC Integration for Persistent RNNs}, type = {inproceedings}, year = {2019}, keywords = {Deep learning,FPGA,chiplet,persistent AI,system in package}, pages = {199-207}, month = {4}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, day = {1}, id = {06707b30-6dd0-3275-be45-f99e32cb1194}, created = {2023-11-07T10:04:37.505Z}, file_attached = {true}, profile_id = {78e67dcc-28e6-3300-a4ed-85434b13f01f}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-12-06T13:16:22.107Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {1bffc8fa-4e6e-48c8-b694-323af7fbd0e8}, private_publication = {false}, abstract = {Interactive intelligent services, such as smart web search, are important datacenter workloads. They rely on data-intensive deep learning (DL) algorithms with strict latency constraints and thus require balancing both data movement and compute capabilities. As such, a persistent approach that keeps the entire DL model on-chip is becoming the new norm for real-time services to avoid the expensive off-chip memory accesses. This approach is adopted in Microsoft's Brainwave and is also provided by Nvidia's cuDNN libraries. This paper presents a comparative study of FPGA, GPU, and FPGA+ASIC in-package solutions for persistent DL. Unlike prior work, we offer a fair and direct comparison targeting common numerical precisions (FP32, INT8) and modern high-end FPGA (Intel® Stratix®10), GPU (Nvidia Volta), and ASIC (10 nm process), all using the persistent approach. We show that Stratix 10 FPGAs offer 2.7× (FP32) to 8.6× (INT8) lower latency than Volta GPUs across RNN, GRU, and LSTM workloads from DeepBench. The GPU can only utilize ~6% of its peak TOPS, while the FPGA with a more balanced on-chip memory and compute can achieve much higher utilization (~57%). We also study integrating an ASIC chiplet, TensorRAM, with an FPGA as system-in-package to enhance on-chip memory capacity and bandwidth, and provide compute throughput matching the required bandwidth. We show that a small 32 mm2 TensorRAM 10nm chiplet can offer 64 MB memory, 32 TB/s on-chiplet bandwidth, and 64 TOPS (INT8). A small Stratix 10 FPGA with a TensorRAM (INT8) offers 15.9× better latency than GPU (FP32) and 34× higher energy efficiency. It has 2× aggregate on-chip memory capacity compared to a large FPGA or GPU. Overall, our study shows that the FPGA is better than the GPU for persistent DL, and when integrated with an ASIC chiplet, it can offer a more compelling solution.}, bibtype = {inproceedings}, author = {Nurvitadhi, Eriko and Kwon, Dongup and Jafari, Ali and Boutros, Andrew and Sim, Jaewoong and Tomson, Phillip and Sumbul, Huseyin and Chen, Gregory and Knag, Phil and Kumar, Raghavan and Krishnamurthy, Ram and Gribok, Sergey and Pasca, Bogdan and Langhammer, Martin and Marr, Debbie and Dasu, Aravind}, doi = {10.1109/FCCM.2019.00035}, booktitle = {Proceedings - 27th IEEE International Symposium on Field-Programmable Custom Computing Machines, FCCM 2019} }
@article{ title = {Accurate Measurements with Off-the-Shelf Range Cameras}, type = {article}, year = {2018}, pages = {139}, id = {9939252f-b908-31fb-a3c0-93562f415043}, created = {2020-09-14T08:14:53.676Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-09-14T08:20:18.766Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47}, private_publication = {false}, bibtype = {article}, author = {Fürsattel, Peter} }
@article{ title = {PCPNet learning local shape properties from raw point clouds}, type = {article}, year = {2018}, pages = {75-85}, volume = {37}, id = {1c572b66-454e-3b88-b1a2-26abfb5decee}, created = {2020-09-14T08:14:53.904Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-27T12:41:40.092Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c,2a0475f2-facb-4360-917f-00c5f8541f47,8d18e62e-6e66-4acb-ae6a-b470435041d8}, private_publication = {false}, abstract = {In this paper, we propose PCPNET, a deep-learning based approach for estimating local 3D shape properties in point clouds. In contrast to the majority of prior techniques that concentrate on global or mid-level attributes, e.g., for shape classification or semantic labeling, we suggest a patch-based learning method, in which a series of local patches at multiple scales around each point is encoded in a structured manner. Our approach is especially well-adapted for estimating local shape properties such as normals (both unoriented and oriented) and curvature from raw point clouds in the presence of strong noise and multi-scale features. Our main contributions include both a novel multi-scale variant of the recently proposed PointNet architecture with emphasis on local shape information, and a series of novel applications in which we demonstrate how learning from training data arising from well-structured triangle meshes, and applying the trained model to noisy point clouds can produce superior results compared to specialized state-of-the-art techniques. Finally, we demonstrate the utility of our approach in the context of shape reconstruction, by showing how it can be used to extract normal orientation information from point clouds.}, bibtype = {article}, author = {Guerrero, Paul and Kleiman, Yanir and Ovsjanikov, Maks and Mitra, Niloy J.}, doi = {10.1111/cgf.13343}, journal = {Computer Graphics Forum}, number = {2} }
@article{ title = {Deep End-to-End Time-of-Flight Imaging (Supplemental Material)}, type = {article}, year = {2018}, id = {a84843f2-13ac-36e8-88a0-0e08ed9c6ba5}, created = {2020-09-14T10:49:26.364Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-09-24T06:08:47.508Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {6533efe4-7189-42a2-b4b6-a9f175595b19}, private_publication = {false}, abstract = {In this supplemental document, we provide additional full-size depth reconstruction results and implementation details of the dataset generation and network training, ensuring full reproducibility of the proposed approach.}, bibtype = {article}, author = {Su, Shuochen and Heide, Felix and Wetzstein, Gordon and Heidrich, Wolfgang} }
@article{ title = {Tackling 3D ToF Artifacts Through Learning and the FLAT Dataset}, type = {article}, year = {2018}, keywords = {MPI artifacts,Motion artifacts,Time-of-flight}, pages = {381-396}, volume = {11205 LNCS}, id = {431f6a77-8007-3565-a24f-47eb7f2c177e}, created = {2020-10-01T06:44:41.665Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-10T07:17:51.833Z}, read = {true}, starred = {true}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47,019ca26f-d15a-40ee-aa8d-7a0fbf949316}, private_publication = {false}, abstract = {Scene motion, multiple reflections, and sensor noise introduce artifacts in the depth reconstruction performed by time-of-flight cameras. We propose a two-stage, deep-learning approach to address all of these sources of artifacts simultaneously. We also introduce FLAT, a synthetic dataset of 2000 ToF measurements that capture all of these nonidealities, and allows to simulate different camera hardware. Using the Kinect 2 camera as a baseline, we show improved reconstruction errors over state-of-the-art methods, on both simulated and real data.}, bibtype = {article}, author = {Guo, Qi and Frosio, Iuri and Gallo, Orazio and Zickler, Todd and Kautz, Jan}, doi = {10.1007/978-3-030-01246-5_23}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {Learn-to-score: Efficient 3D scene exploration by predicting view utility}, type = {article}, year = {2018}, keywords = {3D CNN,3D reconstruction,Active vision,Exploration}, pages = {455-472}, volume = {11219 LNCS}, id = {c39675a5-4435-37ca-bebb-0720e80b153a}, created = {2020-10-05T10:26:00.895Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-08-08T11:39:19.286Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Hepp2018}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f,993054b1-3bdd-483c-a844-4576869f66e9,07e07de9-bcac-4934-a82b-d0aff540e56d,4f36a0a5-b08a-4f70-b020-4daf83cb0507}, private_publication = {false}, abstract = {Camera equipped drones are nowadays being used to explore large scenes and reconstruct detailed 3D maps. When free space in the scene is approximately known, an offline planner can generate optimal plans to efficiently explore the scene. However, for exploring unknown scenes, the planner must predict and maximize usefulness of where to go on the fly. Traditionally, this has been achieved using handcrafted utility functions. We propose to learn a better utility function that predicts the usefulness of future viewpoints. Our learned utility function is based on a 3D convolutional neural network. This network takes as input a novel volumetric scene representation that implicitly captures previously visited viewpoints and generalizes to new scenes. We evaluate our method on several large 3D models of urban scenes using simulated depth cameras. We show that our method outperforms existing utility measures in terms of reconstruction performance and is robust to sensor noise.}, bibtype = {article}, author = {Hepp, Benjamin and Dey, Debadeepta and Sinha, Sudipta N. and Kapoor, Ashish and Joshi, Neel and Hilliges, Otmar}, doi = {10.1007/978-3-030-01267-0_27}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {PIXOR: Real-time 3D Object Detection from Point Clouds}, type = {article}, year = {2018}, pages = {7652-7660}, id = {e3c2d78a-c8be-36fe-af18-60d798ec39a8}, created = {2020-10-15T09:39:12.558Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-26T10:02:11.153Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {We address the problem of real-time 3D object detection from point clouds in the context of autonomous driving. Speed is critical as detection is a necessary component for safety. Existing approaches are, however, expensive in computation due to high dimensionality of point clouds. We utilize the 3D data more efficiently by representing the scene from the Bird's Eye View (BEV), and propose PIXOR, a proposal-free, single-stage detector that outputs oriented 3D object estimates decoded from pixel-wise neural network predictions. The input representation, network architecture, and model optimization are specially designed to balance high accuracy and real-time efficiency. We validate PIXOR on two datasets: The KITTI BEV object detection benchmark, and a large-scale 3D vehicle detection benchmark. In both datasets we show that the proposed detector surpasses other state-of-the-art methods notably in terms of Average Precision (AP), while still runs at 10 FPS.}, bibtype = {article}, author = {Yang, Bin and Luo, Wenjie and Urtasun, Raquel}, doi = {10.1109/CVPR.2018.00798}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Efficient convolutions for real-time semantic segmentation of 3D point clouds}, type = {article}, year = {2018}, keywords = {2D,3D,Autonomous,Cloud,Convolutional,Deep,Driving,Efficient,Learning,Lidar,Network,Neural,Point,Segmentation,Semantic}, pages = {399-408}, id = {1cb7353c-9814-3ea9-9f6e-cac83a77032f}, created = {2020-10-15T09:39:12.566Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-03-18T09:18:33.698Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c,bc1835e2-32e3-4f2a-b03c-9540bbbd02e0}, private_publication = {false}, abstract = {In this work, we propose a novel voxel representation which allows for efficient, real-time processing of point clouds with deep neural networks. Our approach takes a 2D representation of a simple occupancy grid and produces fine-grained 3D segmentation. We show that our approach outperforms the state-of-the art while being an order of magnitude faster. We can perform segmentation of large outdoor scenes of size 160m x 80m in as little as 30ms. In indoor scenarios, we can segment full rooms in less than 15ms. This is crucial for robotics applications which require real-time inference for safety critical tasks.}, bibtype = {article}, author = {Zhang, Chris and Luo, Wenjie and Urtasun, Raquel}, doi = {10.1109/3DV.2018.00053}, journal = {Proceedings - 2018 International Conference on 3D Vision, 3DV 2018} }
@misc{ title = {3D-SIS: 3D semantic instance segmentation of RGB-D scans}, type = {misc}, year = {2018}, source = {arXiv}, pages = {4421-4430}, id = {7d612e65-63d2-3663-a248-8b5cbf9a195a}, created = {2020-10-20T09:48:06.194Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T08:00:50.676Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {dc009c1c-5c21-43bd-9c8a-d37db3983b2e,a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {We introduce 3D-SIS, a novel neural network architecture for 3D semantic instance segmentation in commodity RGB-D scans. The core idea of our method is to jointly learn from both geometric and color signal, thus enabling accurate instance predictions. Rather than operate solely on 2D frames, we observe that most computer vision applications have multi-view RGB-D input available, which we leverage to construct an approach for 3D instance segmentation that effectively fuses together these multi-modal inputs. Our network leverages high-resolution RGB input by associating 2D images with the volumetric grid based on the pose alignment of the 3D reconstruction. For each image, we first extract 2D features for each pixel with a series of 2D convolutions; we then backproject the resulting feature vector to the associated voxel in the 3D grid. This combination of 2D and 3D feature learning allows significantly higher accuracy object detection and instance segmentation than state-of-the-art alternatives. We show results on both synthetic and real-world public benchmarks, achieving an improvement in mAP of over 13 on real-world data.}, bibtype = {misc}, author = {Hou, Ji and Dai, Angela and Nießner, Matthias} }
@article{ title = {GeoNet: Geometric Neural Network for Joint Depth and Surface Normal Estimation}, type = {article}, year = {2018}, pages = {283-291}, id = {3705a1dd-fa39-376d-85ad-84c95e333486}, created = {2020-11-03T13:16:20.066Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-03T13:18:08.524Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {In this paper, we propose Geometric Neural Network (GeoNet) to jointly predict depth and surface normal maps from a single image. Building on top of two-stream CNNs, our GeoNet incorporates geometric relation between depth and surface normal via the new depth-to-normal and normal-to-depth networks. Depth-to-normal network exploits the least square solution of surface normal from depth and improves its quality with a residual module. Normal-to-depth network, contrarily, refines the depth map based on the constraints from the surface normal through a kernel regression module, which has no parameter to learn. These two networks enforce the underlying model to efficiently predict depth and surface normal for high consistency and corresponding accuracy. Our experiments on NYU v2 dataset verify that our GeoNet is able to predict geometrically consistent depth and normal maps. It achieves top performance on surface normal estimation and is on par with state-of-the-art depth estimation methods.}, bibtype = {article}, author = {Qi, Xiaojuan and Liao, Renjie and Liu, Zhengzhe and Urtasun, Raquel and Jia, Jiaya}, doi = {10.1109/CVPR.2018.00037}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {CS229 Final Project - Improving LiDAR Point Cloud Classification of Urban Objects}, type = {article}, year = {2018}, id = {d8dccdad-fcb2-3924-950d-8729b13b3135}, created = {2020-11-03T13:16:20.290Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-03T13:17:12.516Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, bibtype = {article}, author = {Wang, Peggy Yuchun and Gosakti, Tristan}, number = {NeurIPS} }
@article{ title = {ODDS: Real-Time Object Detection Using Depth Sensors on Embedded GPUs}, type = {article}, year = {2018}, keywords = {Curriculum learning,Deep learning,Embedded systems,Network pruning,Object detection}, pages = {230-241}, id = {b6d96ec6-a5d8-3562-a673-b69ffc2d0931}, created = {2020-11-16T11:56:20.671Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-06-24T10:35:25.971Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Mithun2018}, folder_uuids = {f3937f49-b3bb-4fad-a335-4fb2125beca8,71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {Detecting objects that are carried when someone enters or exits a room is very useful for a wide range of smart building applications including safety, security, and energy efficiency. While there has been a significant amount of work on object recognition using large-scale RGB image datasets, RGB cameras are too privacy invasive in many smart building applications and they work poorly in the dark. Additionally, deep object detection networks require powerful and expensive GPUs. We propose a novel system that we call ODDS (Object Detector using a Depth Sensor) that can detect objects in real-time using only raw depth data on an embedded GPU, e.g., NVIDIA Jetson TX1. Hence, our solution is significantly less privacy invasive (even if the sensor is compromised) and less expensive, while maintaining a comparable accuracy with state of the art solutions. Specifically, we resort to training a deep convolutional neural network using raw depth images, with curriculum based learning to improve accuracy by considering the complexity and imbalance in object classes and developing a sparse coding based technique that speeds up the system ∼2x with minimal loss of accuracy. Based on a complete implementation and real-world evaluation, we see ODDS achieve 80.14% mean average precision in object detection in real-time (5-6 FPS) on a Jetson TX1.}, bibtype = {article}, author = {Mithun, Niluthpol Chowdhury and Munir, Sirajum and Guo, Karen and Shelton, Charles}, doi = {10.1109/IPSN.2018.00051}, journal = {Proceedings - 17th ACM/IEEE International Conference on Information Processing in Sensor Networks, IPSN 2018} }
@article{ title = {Deep End-to-End Time-of-Flight Imaging}, type = {article}, year = {2018}, publisher = {IEEE}, id = {a3d51e1a-fa33-3d16-8d91-b1723739fbbb}, created = {2020-11-27T06:42:00.449Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-12-07T01:26:36.071Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47}, private_publication = {false}, bibtype = {article}, author = {Su, Shuochen and Heide, Felix and Wetzstein, Gordon and Heidrich, Wolfgang}, doi = {10.1109/CVPR.2018.00668} }
@article{ title = {PlaneRCNN: 3D plane detection and reconstruction from a single image}, type = {article}, year = {2018}, pages = {4450-4459}, id = {6bf3e100-eb62-3208-a456-1079b34d1deb}, created = {2020-12-10T09:42:26.395Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-12-11T06:48:57.761Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47}, private_publication = {false}, abstract = {This paper proposes a deep neural architecture, PlaneRCNN, that detects and reconstructs piecewise planar surfaces from a single RGB image. PlaneRCNN employs a variant of Mask R-CNN to detect planes with their plane parameters and segmentation masks. PlaneRCNN then jointly refines all the segmentation masks with a novel loss enforcing the consistency with a nearby view during training. The paper also presents a new benchmark with more fine-grained plane segmentations in the ground-truth, in which, PlaneRCNN outperforms existing state-of-the-art methods with significant margins in the plane detection, segmentation, and reconstruction metrics. PlaneRCNN makes an important step towards robust plane extraction, which would have an immediate impact on a wide range of applications including Robotics, Augmented Reality, and Virtual Reality.}, bibtype = {article}, author = {Liu, Chen and Kim, Kihwan and Gu, Jinwei and Furukawa, Yasutaka and Kautz, Jan}, journal = {arXiv} }
@article{ title = {3D object classification via spherical projections}, type = {article}, year = {2018}, pages = {566-574}, id = {51271ae6-3d72-34cf-8185-442d2d79bbb2}, created = {2021-01-22T08:56:59.991Z}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-22T08:57:03.623Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c,6533efe4-7189-42a2-b4b6-a9f175595b19}, private_publication = {false}, abstract = {In this paper, we introduce a new method for classifying 3D objects. Our main idea is to project a 3D object onto a spherical domain centered around its barycenter and develop a neural network to classify the spherical projection. We introduce two complementary projections. The first captures depth variations of a 3D object, and the second captures contour-information viewed from different angles. Spherical projections combine key advantages of two main-stream 3D classification methods: image-based and 3D-based. Specifically, spherical projections are locally planar, allowing us to use massive image datasets (e.g., ImageNet) for pre-training. Also spherical projections are similar to voxel-based methods, as they encode complete information of a 3D object in a single neural network capturing dependencies across different views. Our novel network design can fully utilize these advantages. Experimental results on ModelNet40 and ShapeNetCore show that our method is superior to prior methods.}, bibtype = {article}, author = {Cao, Zhangjie and Huang, Qixing and Ramani, Karthik}, doi = {10.1109/3DV.2017.00070}, journal = {Proceedings - 2017 International Conference on 3D Vision, 3DV 2017} }
@article{ title = {3D Object Classification Using Geometric Features and Pairwise Relationships}, type = {article}, year = {2018}, pages = {152-164}, volume = {33}, id = {d79cd182-1111-37e8-b3a9-f88d9c0ec66e}, created = {2021-01-25T08:45:25.227Z}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-25T08:45:36.922Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {Object classification is a key differentiator of building information modeling (BIM) from three-dimensional (3D) computer-aided design (CAD). Incorrect object classification impedes the full exploitation of BIM models. Models prepared using domain-specific software cannot ensure correct object classification when transferred to other domains, and research on reconstruction of BIM models using spatial survey has not proved a full capability to classify objects. This research proposed an integrated approach to object classification that applied domain experts’ knowledge of shape features and pairwise relationships of 3D objects to effectively classify objects using a tailored matching algorithm. Among its contributions: the algorithms implemented for shape and spatial feature identification could process various complex 3D geometry; the method devised for compilation of the knowledge base considered both rigor and confidence of the inference; the algorithm for matching provides mathematical measurement of the object classification results. The integrated approach has been applied to classify 3D bridge objects in two models: a model prepared using incorrect object types and a model manually reconstructed using point cloud data. All these objects were successfully classified.}, bibtype = {article}, author = {Ma, Ling and Sacks, Rafael and Kattel, Uri and Bloch, Tanya}, doi = {10.1111/mice.12336}, journal = {Computer-Aided Civil and Infrastructure Engineering}, number = {2} }
@article{ title = {An egg volume measurement system based on the microsoft kinect}, type = {article}, year = {2018}, keywords = {3D range camera,3D reconstruction,Feature recognition,Geometric modelling,Image processing,Volume estimation}, volume = {18}, id = {88ba5b18-ef2b-347a-9613-7fce155953f4}, created = {2021-01-25T14:53:33.659Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-25T14:53:45.408Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f}, private_publication = {false}, abstract = {Measuring the volume of bird eggs is a very important task for the poultry industry and ornithological research due to the high revenue generated by the industry. In this paper, we describe a prototype of a new metrological system comprising a 3D range camera, Microsoft Kinect (Version 2) and a point cloud post-processing algorithm for the estimation of the egg volume. The system calculates the egg volume directly from the egg shape parameters estimated from the least-squares method in which the point clouds of eggs captured by the Kinect are fitted to novel geometric models of an egg in a 3D space. Using the models, the shape parameters of an egg are estimated along with the egg’s position and orientation simultaneously under the least-squares criterion. Four sets of experiments were performed to verify the functionality and the performance of the system, while volumes estimated from the conventional water displacement method and the point cloud captured by a survey-grade laser scanner serve as references. The results suggest that the method is straightforward, feasible and reliable with an average egg volume estimation accuracy 93.3% when compared to the reference volumes. As a prototype, the software part of the system was implemented in a post-processing mode. However, as the proposed processing techniques is computationally efficient, the prototype can be readily transformed into a real-time egg volume system.}, bibtype = {article}, author = {Chan, Ting On and Lichti, Derek D. and Jahraus, Adam and Esfandiari, Hooman and Lahamy, Herve and Steward, Jeremy and Glanzer, Matthew}, doi = {10.3390/s18082454}, journal = {Sensors (Switzerland)}, number = {8} }
@article{ title = {Minimum elastic bounding box algorithm for dimension detection of 3D objects: A case of airline baggage measurement}, type = {article}, year = {2018}, pages = {1313-1321}, volume = {12}, id = {ee6b49f5-f492-3281-8a05-6aeea31a44ab}, created = {2021-01-28T07:55:29.954Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-28T07:55:41.590Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f,471f331d-8560-4b9e-b910-e5f849b6fcfd}, private_publication = {false}, abstract = {Motivated by the interference of appendages in airline baggage dimension detection using three-dimensional (3D) point cloud, a minimum elastic bounding box (MEBB) algorithm for dimension detection of 3D objects is developed. The baggage dimension measurements using traditional bounding box method or shape fitting method can cause large measurements due to the interference of appendages. Starting from the idea of 'enclosing', an elastic bounding box model with the deformable surface is established. On the basis of using principal component analysis to obtain the main direction of the bounding box, the elastic rules for deformable surfaces are developed so as to produce a large elastic force when it comes into contact with the main body part and to produce a small elastic force when it comes into contact with the appendages part. The airline baggage measurement shows how to use MEBB for dimension detection, especially for the processing of isotropic density distribution, the elasticity computing and the adaptive adjustment of elasticity. Results on typical baggage samples, comparisons to other methods, and error distribution experiments with different algorithm parameters show that the authors' method can reliably obtain the size of the main body part of the object under the interference of appendages.}, bibtype = {article}, author = {Gao, Qingji and Yin, Deyu and Luo, Qijun and Liu, Jingbin}, doi = {10.1049/iet-ipr.2017.0695}, journal = {IET Image Processing}, number = {8} }
@article{ title = {A minimalist approach to type-agnostic detection of quadrics in point clouds}, type = {article}, year = {2018}, id = {046fb66b-0e37-3da9-8528-7ae7cce79f30}, created = {2021-01-28T07:55:29.960Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-10T05:48:45.070Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d1d4f278-1b55-4513-a831-800e4d860e30}, private_publication = {false}, abstract = {This paper proposes a segmentation-free, automatic and efficient procedure to detect general geometric quadric forms in point clouds, where clutter and occlusions are inevitable. Our everyday world is dominated by man-made objects which are designed using 3D primitives (such as planes, cones, spheres, cylinders, etc.). These objects are also omnipresent in industrial environments. This gives rise to the possibility of abstracting 3D scenes through primitives, thereby positions these geometric forms as an integral part of perception and high level 3D scene understanding. As opposed to state-of-the-art, where a tailored algorithm treats each primitive type separately, we propose to encapsulate all types in a single robust detection procedure. At the center of our approach lies a closed form 3D quadric fit, operating in both primal & dual spaces and requiring as low as 4 oriented-points. Around this fit, we design a novel, local null-space voting strategy to reduce the 4-point case to 3. Voting is coupled with the famous RANSAC and makes our algorithm orders of magnitude faster than its conventional counterparts. This is the first method capable of performing a generic cross-type multi-object primitive detection in difficult scenes. Results on synthetic and real datasets support the validity of our method.}, bibtype = {article}, author = {Birdal, Tolga and Busam, Benjamin and Navab, Nassir and Ilic, Slobodan and Sturm, Peter}, journal = {arXiv} }
@inproceedings{ title = {ShuffleNet V2: Practical guidelines for efficient CNN architecture design}, type = {inproceedings}, year = {2018}, keywords = {CNN architecture design,Efficiency,Practical}, pages = {122-138}, volume = {11218 LNCS}, websites = {http://arxiv.org/abs/1807.11164}, month = {7}, publisher = {Springer Verlag}, day = {30}, id = {6fa2f114-652d-3e67-99de-36502ef9aa57}, created = {2021-01-29T12:21:33.865Z}, accessed = {2021-01-29}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T07:38:52.305Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, abstract = {Currently, the neural network architecture design is mostly guided by the indirect metric of computation complexity, i.e., FLOPs. However, the direct metric, e.g., speed, also depends on other factors such as memory access cost and platform characteristics. Thus, this work proposes to evaluate the direct metric on the target platform, beyond only considering FLOPs. Based on a series of controlled experiments, this work derives several practical guidelines for efficient network design. Accordingly, a new architecture is presented, called ShuffleNet V2. Comprehensive ablation experiments verify that our model is the state-of-the-art in terms of speed and accuracy tradeoff.}, bibtype = {inproceedings}, author = {Ma, Ningning and Zhang, Xiangyu and Zheng, Hai-Tao and Sun, Jian}, doi = {10.1007/978-3-030-01264-9_8}, booktitle = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@inproceedings{ title = {AMC: AutoML for model compression and acceleration on mobile devices}, type = {inproceedings}, year = {2018}, keywords = {AutoML,CNN acceleration,Mobile vision,Model compression,Reinforcement learning}, pages = {815-832}, volume = {11211 LNCS}, websites = {http://arxiv.org/abs/1802.03494}, month = {2}, publisher = {Springer Verlag}, day = {9}, id = {697fcea0-09e6-3a32-af1a-5b0ce5636d9e}, created = {2021-02-09T07:26:00.332Z}, accessed = {2021-02-09}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T07:38:55.700Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, abstract = {Model compression is an effective technique to efficiently deploy neural network models on mobile devices which have limited computation resources and tight power budgets. Conventional model compression techniques rely on hand-crafted features and require domain experts to explore the large design space trading off among model size, speed, and accuracy, which is usually sub-optimal and time-consuming. In this paper, we propose AutoML for Model Compression (AMC) which leverages reinforcement learning to efficiently sample the design space and can improve the model compression quality. We achieved state-of-the-art model compression results in a fully automated way without any human efforts. Under 4 × FLOPs reduction, we achieved 2.7% better accuracy than the hand-crafted model compression method for VGG-16 on ImageNet. We applied this automated, push-the-button compression pipeline to MobileNet-V1 and achieved a speedup of 1.53 × on the GPU (Titan Xp) and 1.95 × on an Android phone (Google Pixel 1), with negligible loss of accuracy.}, bibtype = {inproceedings}, author = {He, Yihui and Lin, Ji and Liu, Zhijian and Wang, Hanrui and Li, Li Jia and Han, Song}, doi = {10.1007/978-3-030-01234-2_48}, booktitle = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {HAQ: Hardware-Aware Automated Quantization with Mixed Precision}, type = {article}, year = {2018}, keywords = {Deep Learning,Vision Applications and Systems}, pages = {8604-8612}, volume = {2019-June}, websites = {http://arxiv.org/abs/1811.08886}, month = {11}, publisher = {IEEE Computer Society}, day = {21}, id = {fc9c6948-928a-3845-a774-7716b9280ed8}, created = {2021-02-09T07:46:21.758Z}, accessed = {2021-02-09}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T14:17:31.215Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {bc1835e2-32e3-4f2a-b03c-9540bbbd02e0}, private_publication = {false}, abstract = {Model quantization is a widely used technique to compress and accelerate deep neural network (DNN) inference. Emergent DNN hardware accelerators begin to support mixed precision (1-8 bits) to further improve the computation efficiency, which raises a great challenge to find the optimal bitwidth for each layer: it requires domain experts to explore the vast design space trading off among accuracy, latency, energy, and model size, which is both time-consuming and sub-optimal. Conventional quantization algorithm ignores the different hardware architectures and quantizes all the layers in a uniform way. In this paper, we introduce the Hardware-Aware Automated Quantization (HAQ) framework which leverages the reinforcement learning to automatically determine the quantization policy, and we take the hardware accelerator's feedback in the design loop. Rather than relying on proxy signals such as FLOPs and model size, we employ a hardware simulator to generate direct feedback signals (latency and energy) to the RL agent. Compared with conventional methods, our framework is fully automated and can specialize the quantization policy for different neural network architectures and hardware architectures. Our framework effectively reduced the latency by 1.4-1.95x and the energy consumption by 1.9x with negligible loss of accuracy compared with the fixed bitwidth (8 bits) quantization. Our framework reveals that the optimal policies on different hardware architectures (i.e., edge and cloud architectures) under different resource constraints (i.e., latency, energy and model size) are drastically different. We interpreted the implication of different quantization policies, which offer insights for both neural network architecture design and hardware architecture design.}, bibtype = {article}, author = {Wang, Kuan and Liu, Zhijian and Lin, Yujun and Lin, Ji and Han, Song}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {MobileNetV2: Inverted Residuals and Linear Bottlenecks}, type = {article}, year = {2018}, pages = {4510-4520}, websites = {http://arxiv.org/abs/1801.04381}, month = {1}, publisher = {IEEE Computer Society}, day = {12}, id = {7fac16b0-9c23-39a6-b127-d6987e2d3210}, created = {2021-02-09T07:47:05.201Z}, accessed = {2021-02-09}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-03-31T07:21:16.753Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Sandler2018}, folder_uuids = {bc1835e2-32e3-4f2a-b03c-9540bbbd02e0}, private_publication = {false}, abstract = {In this paper we describe a new mobile architecture, MobileNetV2, that improves the state of the art performance of mobile models on multiple tasks and benchmarks as well as across a spectrum of different model sizes. We also describe efficient ways of applying these mobile models to object detection in a novel framework we call SSDLite. Additionally, we demonstrate how to build mobile semantic segmentation models through a reduced form of DeepLabv3 which we call Mobile DeepLabv3. The MobileNetV2 architecture is based on an inverted residual structure where the input and output of the residual block are thin bottleneck layers, opposite to traditional residual models which use expanded representations in the input. MobileNetV2 uses lightweight depthwise convolutions to filter features in the intermediate expansion layer. Additionally, we find that it is important to remove non-linearities in the narrow layers in order to maintain representational power. We demonstrate that this improves performance and provide an intuition that led to this design. Finally, our approach allows decoupling of the input/output domains from the expressiveness of the transformation, which provides a convenient framework for further analysis. We measure our performance on ImageNet classification, COCO object detection, VOC image segmentation. We evaluate the trade-offs between accuracy, and number of operations measured by multiply-adds (MAdd), as well as the number of parameters.}, bibtype = {article}, author = {Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh}, doi = {10.1109/CVPR.2018.00474}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@inproceedings{ title = {ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices}, type = {inproceedings}, year = {2018}, pages = {6848-6856}, websites = {https://arxiv.org/abs/1707.01083v2}, month = {12}, publisher = {IEEE Computer Society}, day = {14}, id = {10a21ba8-97cf-3d10-a25c-fe02d7e846a0}, created = {2021-02-09T07:47:12.378Z}, accessed = {2021-02-09}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-11T14:12:12.381Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {bc1835e2-32e3-4f2a-b03c-9540bbbd02e0}, private_publication = {false}, abstract = {We introduce an extremely computation-efficient CNN architecture named ShuffleNet, which is designed specially for mobile devices with very limited computing power (e.g., 10-150 MFLOPs). The new architecture utilizes two new operations, pointwise group convolution and channel shuffle, to greatly reduce computation cost while maintaining accuracy. Experiments on ImageNet classification and MS COCO object detection demonstrate the superior performance of ShuffleNet over other structures, e.g. lower top-1 error (absolute 7.8%) than recent MobileNet [12] on ImageNet classification task, under the computation budget of 40 MFLOPs. On an ARM-based mobile device, ShuffleNet achieves ~13× actual speedup over AlexNet while maintaining comparable accuracy.}, bibtype = {inproceedings}, author = {Zhang, Xiangyu and Zhou, Xinyu and Lin, Mengxiao and Sun, Jian}, doi = {10.1109/CVPR.2018.00716}, booktitle = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@inproceedings{ title = {SpiderCNN: Deep learning on point sets with parameterized convolutional filters}, type = {inproceedings}, year = {2018}, keywords = {Convolutional neural network,Parametrized convolutional filters,Point clouds}, pages = {90-105}, volume = {11212 LNCS}, websites = {http://arxiv.org/abs/1803.11527}, month = {3}, publisher = {Springer Verlag}, day = {30}, id = {9f2c0980-65dd-3fea-9f8e-c6090547a3bf}, created = {2021-02-09T08:12:13.313Z}, accessed = {2021-02-09}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T08:12:17.530Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {Deep neural networks have enjoyed remarkable success for various vision tasks, however it remains challenging to apply CNNs to domains lacking a regular underlying structures such as 3D point clouds. Towards this we propose a novel convolutional architecture, termed SpiderCNN, to efficiently extract geometric features from point clouds. SpiderCNN is comprised of units called SpiderConv, which extend convolutional operations from regular grids to irregular point sets that can be embedded in Rn, by parametrizing a family of convolutional filters. We design the filter as a product of a simple step function that captures local geodesic information and a Taylor polynomial that ensures the expressiveness. SpiderCNN inherits the multi-scale hierarchical architecture from classical CNNs, which allows it to extract semantic deep features. Experiments on ModelNet40 demonstrate that SpiderCNN achieves state-of-the-art accuracy 92.4% on standard benchmarks, and shows competitive performance on segmentation task.}, bibtype = {inproceedings}, author = {Xu, Yifan and Fan, Tianqi and Xu, Mingye and Zeng, Long and Qiao, Yu}, doi = {10.1007/978-3-030-01237-3_6}, booktitle = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@inproceedings{ title = {Frustum PointNets for 3D Object Detection from RGB-D Data}, type = {inproceedings}, year = {2018}, pages = {918-927}, websites = {http://arxiv.org/abs/1711.08488}, month = {12}, publisher = {IEEE Computer Society}, day = {14}, id = {776617e3-ad0a-39da-a456-c0ff8fd6358d}, created = {2021-02-09T08:16:21.582Z}, accessed = {2021-02-09}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T08:16:27.245Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {In this work, we study 3D object detection from RGBD data in both indoor and outdoor scenes. While previous methods focus on images or 3D voxels, often obscuring natural 3D patterns and invariances of 3D data, we directly operate on raw point clouds by popping up RGB-D scans. However, a key challenge of this approach is how to efficiently localize objects in point clouds of large-scale scenes (region proposal). Instead of solely relying on 3D proposals, our method leverages both mature 2D object detectors and advanced 3D deep learning for object localization, achieving efficiency as well as high recall for even small objects. Benefited from learning directly in raw point clouds, our method is also able to precisely estimate 3D bounding boxes even under strong occlusion or with very sparse points. Evaluated on KITTI and SUN RGB-D 3D detection benchmarks, our method outperforms the state of the art by remarkable margins while having real-time capability.}, bibtype = {inproceedings}, author = {Qi, Charles R. and Liu, Wei and Wu, Chenxia and Su, Hao and Guibas, Leonidas J.}, doi = {10.1109/CVPR.2018.00102}, booktitle = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@inproceedings{ title = {Recurrent Slice Networks for 3D Segmentation of Point Clouds}, type = {inproceedings}, year = {2018}, pages = {2626-2635}, websites = {http://arxiv.org/abs/1802.04402}, month = {12}, publisher = {IEEE Computer Society}, day = {14}, id = {c201b594-edc5-301e-8ebf-2d915a4388aa}, created = {2021-02-09T08:17:51.065Z}, accessed = {2021-02-09}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T08:18:00.116Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {dc009c1c-5c21-43bd-9c8a-d37db3983b2e}, private_publication = {false}, abstract = {Point clouds are an efficient data format for 3D data. However, existing 3D segmentation methods for point clouds either do not model local dependencies [21] or require added computations [14, 23]. This work presents a novel 3D segmentation framework, RSNet1, to efficiently model local structures in point clouds. The key component of the RSNet is a lightweight local dependency module. It is a combination of a novel slice pooling layer, Recurrent Neural Network (RNN) layers, and a slice unpooling layer. The slice pooling layer is designed to project features of unordered points onto an ordered sequence of feature vectors so that traditional end-to-end learning algorithms (RNNs) can be applied. The performance of RSNet is validated by comprehensive experiments on the S3DIS[1], ScanNet[3], and ShapeNet [34] datasets. In its simplest form, RSNets surpass all previous state-of-the-art methods on these benchmarks. And comparisons against previous state-of-the-art methods [21, 23] demonstrate the efficiency of RSNets.}, bibtype = {inproceedings}, author = {Huang, Qiangui and Wang, Weiyue and Neumann, Ulrich}, doi = {10.1109/CVPR.2018.00278}, booktitle = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {PointCNN: Convolution On $\mathcal{X}$-Transformed Points}, type = {article}, year = {2018}, keywords = {Irregular Domains,Convolutional Neural Network,Neural networks,Shape analysis,Sparse Data}, pages = {1-11}, websites = {http://arxiv.org/abs/1801.07791}, month = {1}, day = {23}, id = {bf8a2f3b-7c2d-3cb2-aab5-e644544dd910}, created = {2021-02-09T08:20:08.184Z}, accessed = {2021-02-09}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T08:20:11.211Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {We present a simple and general framework for feature learning from point clouds. The key to the success of CNNs is the convolution operator that is capable of leveraging spatially-local correlation in data represented densely in grids (e.g. images). However, point clouds are irregular and unordered, thus directly convolving kernels against features associated with the points, will result in desertion of shape information and variance to point ordering. To address these problems, we propose to learn an $\mathcal{X}$-transformation from the input points, to simultaneously promote two causes. The first is the weighting of the input features associated with the points, and the second is the permutation of the points into a latent and potentially canonical order. Element-wise product and sum operations of the typical convolution operator are subsequently applied on the $\mathcal{X}$-transformed features. The proposed method is a generalization of typical CNNs to feature learning from point clouds, thus we call it PointCNN. Experiments show that PointCNN achieves on par or better performance than state-of-the-art methods on multiple challenging benchmark datasets and tasks.}, bibtype = {article}, author = {Li, Yangyan and Bu, Rui and Sun, Mingchao and Wu, Wei and Di, Xinhan and Chen, Baoquan}, journal = {arXiv} }
@article{ title = {GPU-Accelerated Next-Best-View Coverage of Articulated Scenes}, type = {article}, year = {2018}, pages = {8315-8322}, id = {c7daa973-249f-3611-87e8-ebc226efecb1}, created = {2021-02-09T08:36:10.881Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T08:36:42.973Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {993054b1-3bdd-483c-a844-4576869f66e9}, private_publication = {false}, abstract = {Next-best-view algorithms are commonly used for covering known scenes, for example in search, maintenance, and mapping tasks. In this paper, we consider the problem of planning a strategy for covering articulated environments where the robot also has to manipulate objects to inspect obstructed areas. This problem is particularly challenging due to the many degrees of freedom resulting from the articulation. We propose to exploit graphics processing units present in many embedded devices to parallelize the computations of a greedy next-best-view approach. We implemented algorithms for costmap computation, path planning, as well as simulation and evaluation of viewpoint candidates in OpenGL for Embedded Systems and benchmarked the implementations on multiple device classes ranging from smartphones to multi-GPU servers. We introduce a heuristic for estimating a utility map from images rendered with strategically placed spherical cameras and show in simulation experiments that robots can successfully explore complex articulated scenes with our system.}, bibtype = {article}, author = {Oßwald, Stefan and Bennewitz, Maren}, doi = {10.1109/IROS.2018.8594054}, journal = {IEEE International Conference on Intelligent Robots and Systems} }
@article{ title = {New discretization method applied to NBV problem: Semioctree}, type = {article}, year = {2018}, pages = {1-20}, volume = {13}, id = {f7984d82-a4b0-3224-b913-da556ea10ece}, created = {2021-02-09T08:36:10.882Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T08:36:46.374Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {993054b1-3bdd-483c-a844-4576869f66e9}, private_publication = {false}, abstract = {This paper presents a discretization methodology applied to the NBV (Next Best View) problem, which consists of determining the heuristical best position of the next scan. This new methodology is a hybrid process between a homogenous voxelization and an octree structure that preserves the advantages of both methods. An octree structure is not directly applicable to the NBV problem: As the point cloud grows with every successive scanning, the limits and position of the discretization, octree structure must coincide, in order to transfer the information from one scan to the next. This problem is solved by applying a first coarse voxelization, followed by the division of each voxel in an octree structure. In addition, a previous methodology for solving the NBV problem has been adapted to make use of this novel approach. Results show that the new method is three times faster than the homogenous voxelization for a maximum resolution of 0.2m. For this target resolution of 0.2m, the number of voxels/octants in the discretization is reduced approximately by a 400%, from 35.360 to 8.937 for the study case presented.}, bibtype = {article}, author = {González-DeSantos, L. M. and Martínez-Sánchez, J. and González-Jorge, H. and Díaz-Vilariño, L. and Riveiro, B.}, doi = {10.1371/journal.pone.0206259}, journal = {PLoS ONE}, number = {11} }
@article{ title = {A comparison of volumetric information gain metrics for active 3D object reconstruction}, type = {article}, year = {2018}, keywords = {3D reconstruction,Active vision,Information gain}, pages = {197-208}, volume = {42}, publisher = {Springer US}, id = {13dec112-a302-350b-bc39-d4197861312d}, created = {2021-02-09T08:36:10.883Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-03T10:14:32.810Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Delmerico2018}, folder_uuids = {993054b1-3bdd-483c-a844-4576869f66e9,5439d198-93d5-4603-a7ce-201d423f231e,4f36a0a5-b08a-4f70-b020-4daf83cb0507}, private_publication = {false}, abstract = {In this paper, we investigate the following question: when performing next best view selection for volumetric 3D reconstruction of an object by a mobile robot equipped with a dense (camera-based) depth sensor, what formulation of information gain is best? To address this question, we propose several new ways to quantify the volumetric information (VI) contained in the voxels of a probabilistic volumetric map, and compare them to the state of the art with extensive simulated experiments. Our proposed formulations incorporate factors such as visibility likelihood and the likelihood of seeing new parts of the object. The results of our experiments allow us to draw some clear conclusions about the VI formulations that are most effective in different mobile-robot reconstruction scenarios. To the best of our knowledge, this is the first comparative survey of VI formulation performance for active 3D object reconstruction. Additionally, our modular software framework is adaptable to other robotic platforms and general reconstruction problems, and we release it open source for autonomous reconstruction tasks.}, bibtype = {article}, author = {Delmerico, Jeffrey and Isler, Stefan and Sabzevari, Reza and Scaramuzza, Davide}, doi = {10.1007/s10514-017-9634-0}, journal = {Autonomous Robots}, number = {2} }
@article{ title = {IHuman3D: Intelligent human body 3d reconstruction using a single flying camera}, type = {article}, year = {2018}, keywords = {Flying Camera,Human 3D Reconstruction,Next Best View,TSDF}, pages = {1733-1741}, volume = {2}, id = {cd3f2738-32bc-3a19-af38-100dbda56a57}, created = {2021-02-09T08:36:10.893Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T08:36:48.121Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {993054b1-3bdd-483c-a844-4576869f66e9}, private_publication = {false}, abstract = {Aiming at autonomous, adaptive and real-time human body reconstruction technique, this paper presents iHuman3D: an intelligent human body 3D reconstruction system using a single aerial robot integrated with an RGB-D camera. Specifically, we propose a real-time and active view planning strategy based on a highly efficient ray casting algorithm in GPU and a novel information gain formulation directly in TSDF. We also propose the human body reconstruction module by revising the traditional volumetric fusion pipeline with a compactly-designed non-rigid deformation for slight motion of the human target. We unify both the active view planning and human body reconstruction in the same TSDF volume-based representation. Quantitative and qualitative experiments are conducted to validate that the proposed iHuman3D system effectively removes the constraint of extra manual labor, enabling real-time and autonomous reconstruction of human body.}, bibtype = {article}, author = {Cheng, Wei and Xu, Lan and Han, Lei and Guo, Yuanfang and Fang, Lu}, doi = {10.1145/3240508.3240600}, journal = {MM 2018 - Proceedings of the 2018 ACM Multimedia Conference} }
@article{ title = {Surfel-based next best view planning}, type = {article}, year = {2018}, keywords = {Autonomous agents,computer vision for other robotic applications,motion and path planning,range sensing}, pages = {3324-3331}, volume = {3}, publisher = {IEEE}, id = {c2b8b1e2-efcd-3ec3-bbe9-81869dfcbf9b}, created = {2021-02-09T08:36:10.893Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-10T06:55:52.229Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {993054b1-3bdd-483c-a844-4576869f66e9}, private_publication = {false}, abstract = {Next best view (NBV) planning is a central task for automated three-dimensional (3-D) reconstruction in robotics. The most expensive phase of NBV computation is the view simulation step, where the information gain of a large number of candidate sensor poses are estimated. Usually, information gain is related to the visibility of unknown space from the simulated viewpoint. A well-established technique is to adopt a volumetric representation of the environment and to compute the NBV from ray casting by maximizing the number of unknown visible voxels. This letter explores a novel approach for NBV planning based on surfel representation of the environment. Surfels are oriented surface elements, such as circular disks, without explicit connectivity. A new kind of surfel is introduced to represent the frontier between empty and unknown space. Surfels are extracted during 3-D reconstruction, with minimal overhead, from a KinectFusion volumetric representation. Surfel rendering is used to generate images from each simulated sensor pose. Experiments in a real robot setup are reported. The proposed approach achieves better performance than volumetric algorithms based on ray casting implemented on graphics processing unit (GPU), with comparable results in terms of reconstruction quality. Moreover, surfel-based NBV planning can be applied in larger environments as a volumetric representation is limited by GPU memory.}, bibtype = {article}, author = {Monica, Riccardo and Aleotti, Jacopo}, doi = {10.1109/LRA.2018.2852778}, journal = {IEEE Robotics and Automation Letters}, number = {4} }
@article{ title = {Contour-based next-best view planning from point cloud segmentation of unknown objects}, type = {article}, year = {2018}, keywords = {KinectFusion,Next-best view planning,Point cloud segmentation}, pages = {443-458}, volume = {42}, publisher = {Springer US}, id = {50bb6e5e-3fd7-3e74-9c4c-922a3b89500f}, created = {2021-02-09T08:36:10.990Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-08-08T11:39:19.415Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {993054b1-3bdd-483c-a844-4576869f66e9}, private_publication = {false}, abstract = {A novel strategy is presented to determine the next-best view for a robot arm, equipped with a depth camera in eye-in-hand configuration, which is oriented to autonomous exploration of unknown objects. Instead of maximizing the total size of the expected unknown volume that becomes visible, the next-best view is chosen to observe the border of incomplete objects. Salient regions of space that belong to the objects are detected, without any prior knowledge, by applying a point cloud segmentation algorithm. The system uses a Kinect V2 sensor, which has not been considered in previous works on next-best view planning, and it exploits KinectFusion to maintain a volumetric representation of the environment. A low-level procedure to reduce Kinect V2 invalid points is also presented. The viability of the approach has been demonstrated in a real setup where the robot is fully autonomous. Experiments indicate that the proposed method enables the robot to actively explore the objects faster than a standard next-best view algorithm.}, bibtype = {article}, author = {Monica, Riccardo and Aleotti, Jacopo}, doi = {10.1007/s10514-017-9618-0}, journal = {Autonomous Robots}, number = {2} }
@article{ title = {Surface edge explorer (see): Planning next best views directly from 3d observations}, type = {article}, year = {2018}, pages = {6116-6123}, publisher = {IEEE}, id = {f036a300-db75-3e53-b138-c17f2f3eb723}, created = {2021-02-09T08:36:11.015Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-08-08T11:39:19.270Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {993054b1-3bdd-483c-a844-4576869f66e9}, private_publication = {false}, abstract = {Surveying 3D scenes is a common task in robotics. Systems can do so autonomously by iteratively obtaining measurements. This process of planning observations to improve the model of a scene is called Next Best View (NBV) planning. NBV planning approaches often use either volumetric (e.g., voxel grids) or surface (e.g., triangulated meshes) representations. Volumetric approaches generalise well between scenes as they do not depend on surface geometry but do not scale to high-resolution models of large scenes. Surface representations can obtain high-resolution models at any scale but often require tuning of unintuitive parameters or multiple survey stages. This paper presents a scene-model-free NBV planning approach with a density representation. The Surface Edge Explorer (SEE) uses the density of current measurements to detect and explore observed surface boundaries. This approach is shown experimentally to provide better surface coverage in lower computation time than the evaluated state-of-the-art volumetric approaches while moving equivalent distances.}, bibtype = {article}, author = {Border, Rowan and Gammell, Jonathan D. and Newman, Paul}, journal = {arXiv} }
@article{ title = {Quantizing deep convolutional networks for efficient inference: A whitepaper}, type = {article}, year = {2018}, websites = {http://arxiv.org/abs/1806.08342}, month = {6}, publisher = {arXiv}, day = {21}, id = {5ba3fe39-6829-3171-a455-319489cf3a08}, created = {2021-02-09T12:51:54.784Z}, accessed = {2021-02-09}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T12:51:57.737Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {We present an overview of techniques for quantizing convolutional neural networks for inference with integer weights and activations. Per-channel quantization of weights and per-layer quantization of activations to 8-bits of precision post-training produces classification accuracies within 2% of floating point networks for a wide variety of CNN architectures. Model sizes can be reduced by a factor of 4 by quantizing weights to 8-bits, even when 8-bit arithmetic is not supported. This can be achieved with simple, post training quantization of weights.We benchmark latencies of quantized networks on CPUs and DSPs and observe a speedup of 2x-3x for quantized implementations compared to floating point on CPUs. Speedups of up to 10x are observed on specialized processors with fixed point SIMD capabilities, like the Qualcomm QDSPs with HVX. Quantization-aware training can provide further improvements, reducing the gap to floating point to 1% at 8-bit precision. Quantization-aware training also allows for reducing the precision of weights to four bits with accuracy losses ranging from 2% to 10%, with higher accuracy drop for smaller networks.We introduce tools in TensorFlow and TensorFlowLite for quantizing convolutional networks and review best practices for quantization-aware training to obtain high accuracy with quantized weights and activations. We recommend that per-channel quantization of weights and per-layer quantization of activations be the preferred quantization scheme for hardware acceleration and kernel optimization. We also propose that future processors and hardware accelerators for optimized inference support precisions of 4, 8 and 16 bits.}, bibtype = {article}, author = {Krishnamoorthi, Raghuraman}, journal = {arXiv} }
@inproceedings{ title = {Shufflenet V2: Practical guidelines for efficient cnn architecture design}, type = {inproceedings}, year = {2018}, keywords = {CNN architecture design,Efficiency,Practical}, pages = {122-138}, volume = {11218 LNCS}, websites = {http://arxiv.org/abs/1807.11164}, month = {7}, publisher = {Springer Verlag}, day = {30}, id = {0dda81d9-0839-3069-813b-8c3c87b8450e}, created = {2021-02-09T14:16:31.049Z}, accessed = {2021-02-09}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-11T14:12:12.569Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c,bc1835e2-32e3-4f2a-b03c-9540bbbd02e0}, private_publication = {false}, abstract = {Currently, the neural network architecture design is mostly guided by the indirect metric of computation complexity, i.e., FLOPs. However, the direct metric, e.g., speed, also depends on the other factors such as memory access cost and platform characterics. Thus, this work proposes to evaluate the direct metric on the target platform, beyond only considering FLOPs. Based on a series of controlled experiments, this work derives several practical guidelines for efficient network design. Accordingly, a new architecture is presented, called ShuffleNet V2. Comprehensive ablation experiments verify that our model is the state-of-the-art in terms of speed and accuracy tradeoff.}, bibtype = {inproceedings}, author = {Ma, Ningning and Zhang, Xiangyu and Zheng, Hai Tao and Sun, Jian}, doi = {10.1007/978-3-030-01264-9_8}, booktitle = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {6D pose estimation using an improved method based on point pair features}, type = {article}, year = {2018}, keywords = {3D computer vision,3D object recognition,6D pose estimation,Object detection,Range image,Robotics}, pages = {455-460}, id = {7a124ee7-80e2-38e3-b94f-6474e127a5ab}, created = {2021-02-09T17:05:46.615Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T17:06:29.005Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d1d4f278-1b55-4513-a831-800e4d860e30}, private_publication = {false}, abstract = {The Point Pair Feature [4] has been one of the most successful 6D pose estimation method among model-based approaches as an efficient, integrated and compromise alternative to the traditional local and global pipelines. During the last years, several variations of the algorithm have been proposed. Among these extensions, the solution introduced by Hinterstoisser et al. [6] is a major contribution. This work presents a variation of this PPF method applied to the SIXD Challenge datasets presented at the 3rd International Workshop on Recovering 6D Object Pose held at the ICCV 2017. We report an average recall of 0.77 for all datasets and overall recall of 0.82, 0.67, 0.85, 0.37, 0.97 and 0.96 for hinterstoisser, tless, tudlight, rutgers, tejani and doumanoglou datasets, respectively.}, bibtype = {article}, author = {Vidal, Joel and Lin, Chyi Yeu and Martí, Robert}, journal = {arXiv} }
@article{ title = {Point pair features based object recognition with improved training pipeline}, type = {article}, year = {2018}, keywords = {Object detection,Point pair feature (PPF),Pose estimation}, pages = {357-366}, volume = {10985 LNAI}, id = {ee386124-0a96-3871-b426-8075e15307c4}, created = {2021-02-10T05:48:44.411Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-10T05:49:02.894Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d1d4f278-1b55-4513-a831-800e4d860e30}, private_publication = {false}, abstract = {PPF (point pair feature) is a widely used framework in object detection and pose estimation. However, it is computationally expensive and sensitive to clutter and occlusions. In this paper, we propose a new training pipeline for PPF which makes use of the visibility information of point pairs, yet with no extra computation cost. We also design a strategy to employ plane features to make PPF more discriminative and efficient. Our experiment results show that our method achieves competitive results compared with some state-of-the-art methods.}, bibtype = {article}, author = {Zhu, Yuke and Zhang, Xu and Zhu, Limin and Cai, Yongkai}, doi = {10.1007/978-3-319-97589-4_30}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {A performance evaluation of point pair features}, type = {article}, year = {2018}, keywords = {Feature description,Object detection,Object recognition,PPF,Point pair features,Pose estimation}, pages = {66-80}, volume = {166}, websites = {https://doi.org/10.1016/j.cviu.2017.09.004}, publisher = {Elsevier}, id = {1a9bab7f-d212-35f9-bae9-0ad1c74cc6b3}, created = {2021-02-10T05:48:44.414Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-10T05:49:08.456Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d1d4f278-1b55-4513-a831-800e4d860e30}, private_publication = {false}, abstract = {More than a decade ago, the point pair features (PPFs) were introduced, showing a great potential for 3D object detection and pose estimation under very different conditions. Many modifications have been made to the original PPF, in each case showing varying degrees of improvement for specific datasets. However, to the best of our knowledge, no comprehensive evaluation of these features has been made. In this work, we evaluate PPFs on a large set of 3D scenes. We not only compare PPFs to local point cloud descriptors, but also investigate the internal variations of PPFs (different types of relations between two points). Our comparison is made on 7 publicly available datasets, showing variations on a number of parameters, e.g. acquisition technique, the number of objects/scenes and the amount of occlusion and clutter. We evaluate feature performance both at a point-wise object-scene correspondence level and for overall object detection and pose estimation in a RANSAC pipeline. Additionally, we also present object detection and pose estimation results for the original, voting based, PPF algorithm. Our results show that in general PPF is the top performer, however, there are datasets, which have low resolution data, where local histogram features show a higher performance than PPFs. We also found that PPFs compared to most local histogram features degrade faster under disturbances such as occlusion and clutter, however, PPFs still remain more descriptive on an absolute scale. The main contribution of this paper is a detailed analysis of PPFs, which highlights under which conditions PPFs perform particularly well as well as its main weaknesses.}, bibtype = {article}, author = {Kiforenko, Lilita and Drost, Bertram and Tombari, Federico and Krüger, Norbert and Glent Buch, Anders}, doi = {10.1016/j.cviu.2017.09.004}, journal = {Computer Vision and Image Understanding}, number = {November 2016} }
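Since several of the entries above build on the same four-dimensional point pair feature, a minimal NumPy sketch of its computation may help; the names and toy inputs are illustrative, and real pipelines additionally discretise the feature and use it for Hough-style voting.

import numpy as np

def angle(a, b):
    # Unsigned angle between two 3D vectors, in radians.
    a = a / np.linalg.norm(a)
    b = b / np.linalg.norm(b)
    return np.arccos(np.clip(np.dot(a, b), -1.0, 1.0))

def point_pair_feature(p1, n1, p2, n2):
    # F(m1, m2) = (||d||, angle(n1, d), angle(n2, d), angle(n1, n2)) with d = p2 - p1.
    d = p2 - p1
    return np.array([np.linalg.norm(d), angle(n1, d), angle(n2, d), angle(n1, n2)])

f = point_pair_feature(np.array([0.0, 0.0, 0.0]), np.array([0.0, 0.0, 1.0]),
                       np.array([0.1, 0.0, 0.0]), np.array([0.0, 1.0, 0.0]))
print(f)  # distance plus three angles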
@article{ title = {Instance-based object recognition in 3D point clouds using discriminative shape primitives}, type = {article}, year = {2018}, keywords = {3D local shape,3D point cloud,3D pose estimation,Discriminative shape representation,Instance-based object recognition}, pages = {285-297}, volume = {29}, websites = {https://doi.org/10.1007/s00138-017-0885-8}, publisher = {Springer Berlin Heidelberg}, id = {f7b518b9-b79e-30fd-91ce-dd3180ce54f7}, created = {2021-02-10T05:48:44.536Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-10T05:48:53.820Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d1d4f278-1b55-4513-a831-800e4d860e30}, private_publication = {false}, abstract = {3D local shapes are a critical cue for object recognition in 3D point clouds. This paper presents an instance-based 3D object recognition method via informative and discriminative shape primitives. We propose a shape primitive model that measures geometrical informativity and discriminativity of 3D local shapes of an object. Discriminative shape primitives of the object are extracted automatically by model parameter optimization. We achieve object recognition from 2.5/3D scenes via shape primitive classification and recover the 3D poses of the identified objects simultaneously. The effectiveness and the robustness of the proposed method were verified on popular instance-based 3D object recognition datasets. The experimental results show that the proposed method outperforms some existing instance-based 3D object recognition pipelines in the presence of noise, varying resolutions, clutter and occlusion.}, bibtype = {article}, author = {Zhang, Jie and Sun, Junhua}, doi = {10.1007/s00138-017-0885-8}, journal = {Machine Vision and Applications}, number = {2} }
@article{ title = {Joint representation of primitive and non-primitive objects for 3D vision}, type = {article}, year = {2018}, keywords = {Geometric primitives,Primitive abstraction,Signed distance fields,Volumetric scanning}, pages = {160-169}, publisher = {IEEE}, id = {68d35549-99c9-34a8-93f3-66d33908b42d}, created = {2021-02-10T05:48:44.549Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-10T06:55:52.226Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d1d4f278-1b55-4513-a831-800e4d860e30}, private_publication = {false}, abstract = {The use of structural information in 3D scanning is becoming more and more popular. However, most approaches exploit this structural information either in the form of geometric primitives (mostly planes) or known rigid bodies, but not both. We overcome this limitation and propose an object representation that combines primitive and non-primitive objects using one unified formulation that is based on signed distance fields. Object pose manifolds are introduced to represent the rigid movement of primitives and non-primitives in a natural way. We show that different components of volumetric scanning, such as global trajectory optimization or geometry completion and denoising, benefit from our formulation.}, bibtype = {article}, author = {Sommer, Christiane and Cremers, Daniel}, doi = {10.1109/3DV.2018.00028}, journal = {Proceedings - 2018 International Conference on 3D Vision, 3DV 2018} }
@inproceedings{ title = {Learning Transferable Architectures for Scalable Image Recognition}, type = {inproceedings}, year = {2018}, pages = {8697-8710}, websites = {http://arxiv.org/abs/1707.07012}, month = {12}, publisher = {IEEE Computer Society}, day = {14}, id = {52af68cf-d170-3b7f-a9c9-41c5867c298a}, created = {2021-02-10T09:21:56.015Z}, accessed = {2021-02-10}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-10T09:22:01.846Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {Developing neural network image classification models often requires significant architecture engineering. In this paper, we study a method to learn the model architectures directly on the dataset of interest. As this approach is expensive when the dataset is large, we propose to search for an architectural building block on a small dataset and then transfer the block to a larger dataset. The key contribution of this work is the design of a new search space (which we call the 'NASNet search space') which enables transferability. In our experiments, we search for the best convolutional layer (or 'cell') on the CIFAR-10 dataset and then apply this cell to the ImageNet dataset by stacking together more copies of this cell, each with their own parameters to design a convolutional architecture, which we name a 'NASNet architecture'. We also introduce a new regularization technique called ScheduledDropPath that significantly improves generalization in the NASNet models. On CIFAR-10 itself, a NASNet found by our method achieves 2.4% error rate, which is state-of-the-art. Although the cell is not searched for directly on ImageNet, a NASNet constructed from the best cell achieves, among the published works, state-of-the-art accuracy of 82.7% top-1 and 96.2% top-5 on ImageNet. Our model is 1.2% better in top-1 accuracy than the best human-invented architectures while having 9 billion fewer FLOPS - a reduction of 28% in computational demand from the previous state-of-the-art model. When evaluated at different levels of computational cost, accuracies of NASNets exceed those of the state-of-the-art human-designed models. For instance, a small version of NASNet also achieves 74% top-1 accuracy, which is 3.1% better than equivalently-sized, state-of-the-art models for mobile platforms. Finally, the image features learned from image classification are generically useful and can be transferred to other computer vision problems. On the task of object detection, the learned features by NASNet used with the Faster-RCNN framework surpass state-of-the-art by 4.0% achieving 43.1% mAP on the COCO dataset.}, bibtype = {inproceedings}, author = {Zoph, Barret and Vasudevan, Vijay and Shlens, Jonathon and Le, Quoc V.}, doi = {10.1109/CVPR.2018.00907}, booktitle = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {PCN: Point completion network}, type = {article}, year = {2018}, keywords = {3D reconstruction,Learning on point clouds,Point cloud registration,Shape completion}, pages = {728-737}, id = {9e69401b-1391-315b-8f6f-a93dde50847f}, created = {2021-02-24T11:29:14.413Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-26T12:19:40.005Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Yuan2018}, folder_uuids = {4f36a0a5-b08a-4f70-b020-4daf83cb0507,71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {Shape completion, the problem of estimating the complete geometry of objects from partial observations, lies at the core of many vision and robotics applications. In this work, we propose Point Completion Network (PCN), a novel learning-based approach for shape completion. Unlike existing shape completion methods, PCN directly operates on raw point clouds without any structural assumption (e.g. symmetry) or annotation (e.g. semantic class) about the underlying shape. It features a decoder design that enables the generation of fine-grained completions while maintaining a small number of parameters. Our experiments show that PCN produces dense, complete point clouds with realistic structures in the missing regions on inputs with various levels of incompleteness and noise, including cars from LiDAR scans in the KITTI dataset.}, bibtype = {article}, author = {Yuan, Wentao and Khot, Tejas and Held, David and Mertz, Christoph and Hebert, Martial}, doi = {10.1109/3DV.2018.00088}, journal = {Proceedings - 2018 International Conference on 3D Vision, 3DV 2018} }
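Completion networks of this kind are commonly trained with a permutation-invariant set distance between the predicted and ground-truth clouds; the following NumPy sketch of a symmetric Chamfer distance is a generic illustration rather than PCN's exact loss.

import numpy as np

def chamfer_distance(a, b):
    # Mean nearest-neighbour distance from a to b plus from b to a; inputs are (N, 3) and (M, 3).
    d = np.linalg.norm(a[:, None, :] - b[None, :, :], axis=-1)
    return d.min(axis=1).mean() + d.min(axis=0).mean()

pred = np.random.rand(256, 3)
target = np.random.rand(1024, 3)
print(chamfer_distance(pred, target))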
@article{ title = {SPLATNet: Sparse Lattice Networks for Point Cloud Processing}, type = {article}, year = {2018}, pages = {2530-2539}, id = {12efbf33-fa40-3d23-8a65-eb9de295407a}, created = {2021-03-04T15:41:23.579Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-22T10:22:52.739Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {bc1835e2-32e3-4f2a-b03c-9540bbbd02e0}, private_publication = {false}, abstract = {We present a network architecture for processing point clouds that directly operates on a collection of points represented as a sparse set of samples in a high-dimensional lattice. Naïvely applying convolutions on this lattice scales poorly, both in terms of memory and computational cost, as the size of the lattice increases. Instead, our network uses sparse bilateral convolutional layers as building blocks. These layers maintain efficiency by using indexing structures to apply convolutions only on occupied parts of the lattice, and allow flexible specifications of the lattice structure enabling hierarchical and spatially-aware feature learning, as well as joint 2D-3D reasoning. Both point-based and image-based representations can be easily incorporated in a network with such layers and the resulting model can be trained in an end-to-end manner. We present results on 3D segmentation tasks where our approach outperforms existing state-of-the-art techniques.}, bibtype = {article}, author = {Su, Hang and Jampani, Varun and Sun, Deqing and Maji, Subhransu and Kalogerakis, Evangelos and Yang, Ming Hsuan and Kautz, Jan}, doi = {10.1109/CVPR.2018.00268}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {PointPillars: Fast Encoders for Object Detection from Point Clouds}, type = {article}, year = {2018}, pages = {12697-12705}, websites = {http://arxiv.org/abs/1812.05784}, id = {18c67b00-609b-366e-b830-7629218bff8c}, created = {2021-03-04T15:41:23.597Z}, file_attached = {false}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-03-04T15:41:25.090Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {bc1835e2-32e3-4f2a-b03c-9540bbbd02e0}, private_publication = {false}, abstract = {Object detection in point clouds is an important aspect of many robotics applications such as autonomous driving. In this paper we consider the problem of encoding a point cloud into a format appropriate for a downstream detection pipeline. Recent literature suggests two types of encoders; fixed encoders tend to be fast but sacrifice accuracy, while encoders that are learned from data are more accurate, but slower. In this work we propose PointPillars, a novel encoder which utilizes PointNets to learn a representation of point clouds organized in vertical columns (pillars). While the encoded features can be used with any standard 2D convolutional detection architecture, we further propose a lean downstream network. Extensive experimentation shows that PointPillars outperforms previous encoders with respect to both speed and accuracy by a large margin. Despite only using lidar, our full detection pipeline significantly outperforms the state of the art, even among fusion methods, with respect to both the 3D and bird's eye view KITTI benchmarks. This detection performance is achieved while running at 62 Hz: a 2 - 4 fold runtime improvement. A faster version of our method matches the state of the art at 105 Hz. These benchmarks suggest that PointPillars is an appropriate encoding for object detection in point clouds.}, bibtype = {article}, author = {Lang, Alex H. and Vora, Sourabh and Caesar, Holger and Zhou, Lubing and Yang, Jiong and Beijbom, Oscar} }
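The key idea in the abstract above is organising points into vertical columns on an x-y grid before any learning takes place; the grouping step might look roughly like the sketch below (hypothetical resolution and per-pillar cap, and without the per-point decorations and per-pillar PointNet the paper uses).

import numpy as np

def group_into_pillars(points, xy_resolution=0.16, max_per_pillar=32):
    # Assign each point to a pillar by its x-y grid cell and cap the points kept per pillar.
    cells = np.floor(points[:, :2] / xy_resolution).astype(np.int64)
    pillars = {}
    for point, cell in zip(points, map(tuple, cells)):
        bucket = pillars.setdefault(cell, [])
        if len(bucket) < max_per_pillar:
            bucket.append(point)
    return {cell: np.stack(pts) for cell, pts in pillars.items()}

cloud = np.random.rand(2000, 3) * np.array([40.0, 40.0, 3.0])  # toy lidar-like extent
pillars = group_into_pillars(cloud)
print(len(pillars), "non-empty pillars")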
@article{ title = {SECOND: Sparsely Embedded Convolutional Detection}, type = {article}, year = {2018}, keywords = {3d object detection,autonomous driving,convolutional neural networks,lidar}, pages = {1-17}, id = {ae74e204-6614-3330-a85e-3e8ab618cdcd}, created = {2021-03-04T15:41:23.619Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-03-04T15:41:51.347Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Detection2018}, folder_uuids = {bc1835e2-32e3-4f2a-b03c-9540bbbd02e0}, private_publication = {false}, bibtype = {article}, author = {Yan, Yan and Mao, Yuxing and Li, Bo}, journal = {Sensors}, doi = {10.3390/s18103337} }
@article{ title = {PointGrid: A Deep Network for 3D Shape Understanding}, type = {article}, year = {2018}, pages = {9204-9214}, id = {65bf09a6-8052-34f5-a65e-2b04d8debb78}, created = {2021-03-04T15:41:23.779Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-17T08:32:39.141Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Le2018}, folder_uuids = {bc1835e2-32e3-4f2a-b03c-9540bbbd02e0}, private_publication = {false}, abstract = {Volumetric grid is widely used for 3D deep learning due to its regularity. However the use of relatively lower order local approximation functions such as piece-wise constant function (occupancy grid) or piece-wise linear function (distance field) to approximate 3D shape means that it needs a very high-resolution grid to represent finer geometry details, which could be memory and computationally inefficient. In this work, we propose the PointGrid, a 3D convolutional network that incorporates a constant number of points within each grid cell thus allowing the network to learn higher order local approximation functions that could better represent the local geometry shape details. With experiments on popular shape recognition benchmarks, PointGrid demonstrates state-of-the-art performance over existing deep learning methods on both classification and segmentation.}, bibtype = {article}, author = {Le, Truc and Duan, Ye}, doi = {10.1109/CVPR.2018.00959}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {SO-Net: Self-Organizing Network for Point Cloud Analysis}, type = {article}, year = {2018}, pages = {9397-9406}, id = {b68cc86c-4c08-3fb2-8c87-8fdd9bf43396}, created = {2021-03-04T15:41:23.789Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-03-04T15:41:46.367Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {bc1835e2-32e3-4f2a-b03c-9540bbbd02e0}, private_publication = {false}, abstract = {This paper presents SO-Net, a permutation invariant architecture for deep learning with orderless point clouds. The SO-Net models the spatial distribution of point cloud by building a Self-Organizing Map (SOM). Based on the SOM, SO-Net performs hierarchical feature extraction on individual points and SOM nodes, and ultimately represents the input point cloud by a single feature vector. The receptive field of the network can be systematically adjusted by conducting point-to-node k nearest neighbor search. In recognition tasks such as point cloud reconstruction, classification, object part segmentation and shape retrieval, our proposed network demonstrates performance that is similar with or better than state-of-the-art approaches. In addition, the training speed is significantly faster than existing point cloud recognition networks because of the parallelizability and simplicity of the proposed architecture. Our code is available at the project website.1}, bibtype = {article}, author = {Li, Jiaxin and Chen, Ben M. and Lee, Gim Hee}, doi = {10.1109/CVPR.2018.00979}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Guided 3D point cloud filtering}, type = {article}, year = {2018}, keywords = {3D point cloud,Efficiency,Guided filtering,Noise removal}, pages = {17397-17411}, volume = {77}, publisher = {Multimedia Tools and Applications}, id = {758da490-b33c-368e-82cc-70eabd100c33}, created = {2021-03-08T09:43:04.129Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-03-09T06:38:49.505Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {3D point cloud has gained significant attention in recent years. However, raw point clouds captured by 3D sensors are unavoidably contaminated with noise resulting in detrimental efforts on the practical applications. Although many widely used point cloud filters such as normal-based bilateral filter, can produce results as expected, they require a higher running time. Therefore, inspired by guided image filter, this paper takes the position information of the point into account to derive the linear model with respect to guidance point cloud and filtered point cloud. Experimental results show that the proposed algorithm, which can successfully remove the undesirable noise while offering better performance in feature-preserving, is significantly superior to several state-of-the-art methods, particularly in terms of efficiency.}, bibtype = {article}, author = {Han, Xian Feng and Jin, Jesse S. and Wang, Ming Jie and Jiang, Wei}, doi = {10.1007/s11042-017-5310-9}, journal = {Multimedia Tools and Applications}, number = {13} }
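In the spirit of the guided-filter idea described above (a per-neighbourhood linear model between guidance and output), a self-guided NumPy/SciPy sketch could look as follows; the neighbourhood size and regularisation are illustrative and the paper's exact formulation may differ.

import numpy as np
from scipy.spatial import cKDTree

def guided_filter_self(points, k=16, eps=1e-2):
    # For each point, fit q_i = a_i * p_i + b_i over its k-neighbourhood and output the filtered point.
    _, idx = cKDTree(points).query(points, k=k)
    out = np.empty_like(points)
    for i, nbrs in enumerate(idx):
        nbh = points[nbrs]
        mean = nbh.mean(axis=0)
        var = (nbh * nbh).mean(axis=0) - mean * mean
        a = var / (var + eps)  # small local variance -> strong smoothing toward the local mean
        b = mean - a * mean
        out[i] = a * points[i] + b
    return out

noisy = np.random.rand(500, 3) + 0.01 * np.random.randn(500, 3)
smoothed = guided_filter_self(noisy)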
@article{ title = {US010097813B2}, type = {article}, year = {2018}, volume = {2}, id = {9306a990-1c72-3d3a-ba64-e27171bdbf38}, created = {2021-04-08T11:26:53.393Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-15T08:24:37.945Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {ffa7aa64-dc15-4667-8778-6ff9b9800bbb}, private_publication = {false}, bibtype = {article}, author = {Maki, Atsuto} }
@article{ title = {US009875554B2}, type = {article}, year = {2018}, volume = {2}, id = {fa0a7b62-28c0-3bda-92b1-206c79f64e14}, created = {2021-04-14T07:42:10.248Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-16T05:21:11.191Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ffa7aa64-dc15-4667-8778-6ff9b9800bbb}, private_publication = {false}, bibtype = {article}, author = {Giacobbe, Robert A.} }
@article{ title = {US20180025496A1}, type = {article}, year = {2018}, volume = {1}, id = {d18a29b8-e660-38ca-9ae3-c73a41e136ed}, created = {2021-04-15T08:24:37.785Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-16T05:21:11.083Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {ffa7aa64-dc15-4667-8778-6ff9b9800bbb}, private_publication = {false}, bibtype = {article}, author = {Improved, Methods F O R} }
@article{ title = {DDRNet: Depth map denoising and refinement for consumer depth cameras using cascaded CNNs}, type = {article}, year = {2018}, keywords = {Consumer depth camera,Convolutional neural networks,Depth enhancement,DynamicFusion,Unsupervised learning}, pages = {155-171}, volume = {11214 LNCS}, id = {0cb7640e-3136-3aca-96ae-b90bcb50ac1e}, created = {2021-04-26T07:46:35.542Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-06-21T08:44:26.819Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {Consumer depth sensors are more and more popular and come to our daily lives marked by its recent integration in the latest Iphone X. However, they still suffer from heavy noises which limit their applications. Although plenty of progresses have been made to reduce the noises and boost geometric details, due to the inherent illness and the real-time requirement, the problem is still far from been solved. We propose a cascaded Depth Denoising and Refinement Network (DDRNet) to tackle this problem by leveraging the multi-frame fused geometry and the accompanying high quality color image through a joint training strategy. The rendering equation is exploited in our network in an unsupervised manner. In detail, we impose an unsupervised loss based on the light transport to extract the high-frequency geometry. Experimental results indicate that our network achieves real-time single depth enhancement on various categories of scenes. Thanks to the well decoupling of the low and high frequency information in the cascaded network, we achieve superior performance over the state-of-the-art techniques.}, bibtype = {article}, author = {Yan, Shi and Wu, Chenglei and Wang, Lizhen and Xu, Feng and An, Liang and Guo, Kaiwen and Liu, Yebin}, doi = {10.1007/978-3-030-01249-6_10}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {Towards efficient large-scale graph neural network computing}, type = {article}, year = {2018}, id = {3e6586ff-0e35-3e65-81fe-3a0491e0cad6}, created = {2021-05-06T06:36:58.068Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-05-06T06:37:03.082Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {368c6572-df92-4840-8400-80e7c9ee2dd7}, private_publication = {false}, abstract = {Recent deep learning models have moved beyond low-dimensional regular grids such as image, video, and speech, to high-dimensional graph-structured data, such as social networks, brain connections, and knowledge graphs. This evolution has led to large graph-based irregular and sparse models that go beyond what existing deep learning frameworks are designed for. Further, these models are not easily amenable to efficient, at scale, acceleration on parallel hardwares (e.g. GPUs). We introduce NGra, the first parallel processing framework for graph-based deep neural networks (GNNs). NGra presents a new SAGA-NN model for expressing deep neural networks as vertex programs with each layer in well-defined (Scatter, ApplyEdge, Gather, ApplyVertex) graph operation stages. This model not only allows GNNs to be expressed intuitively, but also facilitates the mapping to an efficient dataflow representation. NGra addresses the scalability challenge transparently through automatic graph partitioning and chunk-based stream processing out of GPU core or over multiple GPUs, which carefully considers data locality, data movement, and overlapping of parallel processing and data movement. NGra further achieves efficiency through highly optimized Scatter/Gather operators on GPUs despite its sparsity. Our evaluation shows that NGra scales to large real graphs that none of the existing frameworks can handle directly, while achieving up to about 4 times speedup even at small scales over the multiple-baseline design on TensorFlow.}, bibtype = {article}, author = {Ma, Lingxiao and Yang, Zhi and Miao, Youshan and Xue, Jilong and Wu, Ming and Zhou, Lidong and Dai, Yafei}, journal = {arXiv} }
@article{ title = {A Graph-CNN for 3D Point Cloud Classification}, type = {article}, year = {2018}, keywords = {3D point cloud data,Graph convolutional neural networks,Graph signal processing,Supervised learning}, pages = {6279-6283}, volume = {2018-April}, id = {594162a9-66a2-3204-9d6f-a2daeed55365}, created = {2021-05-06T06:36:58.071Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-05-06T06:37:05.734Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {368c6572-df92-4840-8400-80e7c9ee2dd7}, private_publication = {false}, abstract = {Graph convolutional neural networks (Graph-CNNs) extend traditional CNNs to handle data that is supported on a graph. Major challenges when working with data on graphs are that the support set (the vertices of the graph) do not typically have a natural ordering, and in general, the topology of the graph is not regular (i.e., vertices do not all have the same number of neighbors). Thus, Graph-CNNs have huge potential to deal with 3D point cloud data which has been obtained from sampling a manifold. In this paper we develop a Graph-CNN for classifying 3D point cloud data, called PointGCN1. The architecture combines localized graph convolutions with two types of graph downsampling operations (also known as pooling). By the effective exploration of the point cloud local structure using the Graph-CNN, the proposed architecture achieves competitive performance on the 3D object classification benchmark ModelNet, and our architecture is more stable than competing schemes.}, bibtype = {article}, author = {Zhang, Yingxue and Rabbat, Michael}, doi = {10.1109/ICASSP.2018.8462291}, journal = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings} }
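A localized graph convolution of the kind PointGCN stacks can be written in a few lines; the sketch below is the generic symmetric-normalised propagation rule over a k-NN or thresholded adjacency matrix, not the paper's exact layer.

import numpy as np

def graph_conv(x, adj, w):
    # X' = ReLU(D^{-1/2} (A + I) D^{-1/2} X W)
    a_hat = adj + np.eye(adj.shape[0])
    d_inv_sqrt = 1.0 / np.sqrt(a_hat.sum(axis=1))
    a_norm = a_hat * d_inv_sqrt[:, None] * d_inv_sqrt[None, :]
    return np.maximum(a_norm @ x @ w, 0.0)

n, f_in, f_out = 6, 4, 8
x = np.random.randn(n, f_in)
adj = (np.random.rand(n, n) < 0.4).astype(float)
adj = np.maximum(adj, adj.T)       # symmetric toy adjacency
np.fill_diagonal(adj, 0.0)
h = graph_conv(x, adj, np.random.randn(f_in, f_out))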
@article{ title = {AMC: AutoML for Model Compression and Acceleration on Mobile Devices}, type = {article}, year = {2018}, keywords = {AutoML,CNN acceleration,Mobile vision,Model compression,Reinforcement learning}, pages = {815-832}, volume = {11211 LNCS}, websites = {http://arxiv.org/abs/1802.03494}, month = {2}, publisher = {Springer Verlag}, day = {9}, id = {bd7f4d6a-c65a-3a76-80d5-f57fdb548708}, created = {2021-06-14T08:17:39.000Z}, accessed = {2021-06-14}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-06-14T08:31:21.494Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {c9e2a751-ce83-45dd-9c0e-bdac57df3cf4,cf9189f6-f354-4337-8aaf-a5f12cbf8660}, private_publication = {false}, abstract = {Model compression is a critical technique to efficiently deploy neural network models on mobile devices which have limited computation resources and tight power budgets. Conventional model compression techniques rely on hand-crafted heuristics and rule-based policies that require domain experts to explore the large design space trading off among model size, speed, and accuracy, which is usually sub-optimal and time-consuming. In this paper, we propose AutoML for Model Compression (AMC) which leverage reinforcement learning to provide the model compression policy. This learning-based compression policy outperforms conventional rule-based compression policy by having higher compression ratio, better preserving the accuracy and freeing human labor. Under 4x FLOPs reduction, we achieved 2.7% better accuracy than the handcrafted model compression policy for VGG-16 on ImageNet. We applied this automated, push-the-button compression pipeline to MobileNet and achieved 1.81x speedup of measured inference latency on an Android phone and 1.43x speedup on the Titan XP GPU, with only 0.1% loss of ImageNet Top-1 accuracy.}, bibtype = {article}, author = {He, Yihui and Lin, Ji and Liu, Zhijian and Wang, Hanrui and Li, Li-Jia and Han, Song}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {HAQ: Hardware-Aware Automated Quantization with Mixed Precision}, type = {article}, year = {2018}, keywords = {Deep Learning,Vision Applications and Systems}, pages = {8604-8612}, volume = {2019-June}, websites = {http://arxiv.org/abs/1811.08886}, month = {11}, publisher = {IEEE Computer Society}, day = {21}, id = {9bc31bbb-9410-3992-a5c3-ec0d56683a98}, created = {2021-06-14T08:28:10.433Z}, accessed = {2021-06-14}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-06-14T08:31:46.154Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {c9e2a751-ce83-45dd-9c0e-bdac57df3cf4,cf9189f6-f354-4337-8aaf-a5f12cbf8660}, private_publication = {false}, abstract = {Model quantization is a widely used technique to compress and accelerate deep neural network (DNN) inference. Emergent DNN hardware accelerators begin to support mixed precision (1-8 bits) to further improve the computation efficiency, which raises a great challenge to find the optimal bitwidth for each layer: it requires domain experts to explore the vast design space trading off among accuracy, latency, energy, and model size, which is both time-consuming and sub-optimal. Conventional quantization algorithm ignores the different hardware architectures and quantizes all the layers in a uniform way. In this paper, we introduce the Hardware-Aware Automated Quantization (HAQ) framework which leverages the reinforcement learning to automatically determine the quantization policy, and we take the hardware accelerator's feedback in the design loop. Rather than relying on proxy signals such as FLOPs and model size, we employ a hardware simulator to generate direct feedback signals (latency and energy) to the RL agent. Compared with conventional methods, our framework is fully automated and can specialize the quantization policy for different neural network architectures and hardware architectures. Our framework effectively reduced the latency by 1.4-1.95x and the energy consumption by 1.9x with negligible loss of accuracy compared with the fixed bitwidth (8 bits) quantization. Our framework reveals that the optimal policies on different hardware architectures (i.e., edge and cloud architectures) under different resource constraints (i.e., latency, energy and model size) are drastically different. We interpreted the implication of different quantization policies, which offer insights for both neural network architecture design and hardware architecture design.}, bibtype = {article}, author = {Wang, Kuan and Liu, Zhijian and Lin, Yujun and Lin, Ji and Han, Song}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@inproceedings{ title = {MobileNetV2: Inverted Residuals and Linear Bottlenecks}, type = {inproceedings}, year = {2018}, pages = {4510-4520}, websites = {http://arxiv.org/abs/1801.04381}, month = {12}, publisher = {IEEE Computer Society}, day = {14}, id = {ab0533f8-0557-3578-a126-3de8b05818d1}, created = {2021-06-14T08:56:17.690Z}, accessed = {2021-06-14}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-06-14T08:56:20.994Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {908f2b91-eba2-44e9-9028-4350c78aceb0}, private_publication = {false}, abstract = {In this paper we describe a new mobile architecture, MobileNetV2, that improves the state of the art performance of mobile models on multiple tasks and benchmarks as well as across a spectrum of different model sizes. We also describe efficient ways of applying these mobile models to object detection in a novel framework we call SSDLite. Additionally, we demonstrate how to build mobile semantic segmentation models through a reduced form of DeepLabv3 which we call Mobile DeepLabv3. is based on an inverted residual structure where the shortcut connections are between the thin bottleneck layers. The intermediate expansion layer uses lightweight depthwise convolutions to filter features as a source of non-linearity. Additionally, we find that it is important to remove non-linearities in the narrow layers in order to maintain representational power. We demonstrate that this improves performance and provide an intuition that led to this design. Finally, our approach allows decoupling of the input/output domains from the expressiveness of the transformation, which provides a convenient framework for further analysis. We measure our performance on ImageNet [1] classification, COCO object detection [2], VOC image segmentation [3]. We evaluate the trade-offs between accuracy, and number of operations measured by multiply-adds (MAdd), as well as actual latency, and the number of parameters.}, bibtype = {inproceedings}, author = {Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang Chieh}, doi = {10.1109/CVPR.2018.00474}, booktitle = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
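The inverted residual with a linear bottleneck summarised above translates fairly directly into a PyTorch module; the block below is a simplified sketch with illustrative hyperparameters, not the reference implementation.

import torch
import torch.nn as nn

class InvertedResidual(nn.Module):
    # 1x1 expansion -> 3x3 depthwise -> linear 1x1 projection, with a shortcut when shapes match.
    def __init__(self, c_in, c_out, expand=6, stride=1):
        super().__init__()
        c_mid = c_in * expand
        self.use_res = stride == 1 and c_in == c_out
        self.block = nn.Sequential(
            nn.Conv2d(c_in, c_mid, 1, bias=False), nn.BatchNorm2d(c_mid), nn.ReLU6(inplace=True),
            nn.Conv2d(c_mid, c_mid, 3, stride=stride, padding=1, groups=c_mid, bias=False),
            nn.BatchNorm2d(c_mid), nn.ReLU6(inplace=True),
            nn.Conv2d(c_mid, c_out, 1, bias=False), nn.BatchNorm2d(c_out),  # no non-linearity after projection
        )

    def forward(self, x):
        y = self.block(x)
        return x + y if self.use_res else y

y = InvertedResidual(16, 16)(torch.randn(1, 16, 32, 32))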
@inproceedings{ title = {ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices}, type = {inproceedings}, year = {2018}, pages = {6848-6856}, websites = {https://arxiv.org/abs/1707.01083v2}, month = {12}, publisher = {IEEE Computer Society}, day = {14}, id = {ed1b5432-093e-3172-a281-f093b6c23ea7}, created = {2021-06-14T08:57:06.880Z}, accessed = {2021-06-14}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-06-14T08:57:19.306Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {908f2b91-eba2-44e9-9028-4350c78aceb0}, private_publication = {false}, abstract = {We introduce an extremely computation-efficient CNN architecture named ShuffleNet, which is designed specially for mobile devices with very limited computing power (e.g., 10-150 MFLOPs). The new architecture utilizes two new operations, pointwise group convolution and channel shuffle, to greatly reduce computation cost while maintaining accuracy. Experiments on ImageNet classification and MS COCO object detection demonstrate the superior performance of ShuffleNet over other structures, e.g. lower top-1 error (absolute 7.8%) than recent MobileNet [12] on ImageNet classification task, under the computation budget of 40 MFLOPs. On an ARM-based mobile device, ShuffleNet achieves ~13× actual speedup over AlexNet while maintaining comparable accuracy.}, bibtype = {inproceedings}, author = {Zhang, Xiangyu and Zhou, Xinyu and Lin, Mengxiao and Sun, Jian}, doi = {10.1109/CVPR.2018.00716}, booktitle = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
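Of the two operations named in the abstract, channel shuffle is the simpler one and reduces to a reshape-transpose-reshape; the PyTorch sketch below shows only that operation, not the full grouped-convolution unit.

import torch

def channel_shuffle(x, groups):
    # Split channels into groups, transpose, and flatten so later group convolutions see mixed channels.
    n, c, h, w = x.shape
    return x.view(n, groups, c // groups, h, w).transpose(1, 2).reshape(n, c, h, w)

x = torch.arange(8, dtype=torch.float32).view(1, 8, 1, 1)
print(channel_shuffle(x, groups=2).flatten())  # channels interleaved across the two groups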
@article{ title = {Graph attention networks}, type = {article}, year = {2018}, pages = {1-12}, id = {81a8cabd-ed14-3b3c-8a95-154ac9166ad7}, created = {2021-06-21T08:44:26.404Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:23.651Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48e9a80d-67a5-450d-9b08-c7bc934154e8,20ccb950-fef9-4ee1-800c-a60ba9f1df16,70eb910f-9399-46d8-a4d0-ade5435237b7,597192a3-7679-4832-a554-980990d8ac9b,d54ba66b-a8cf-41de-8e2d-c3256f322e07}, private_publication = {false}, abstract = {We present graph attention networks (GATs), novel neural network architectures that operate on graph-structured data, leveraging masked self-attentional layers to address the shortcomings of prior methods based on graph convolutions or their approximations. By stacking layers in which nodes are able to attend over their neighborhoods’ features, we enable (implicitly) specifying different weights to different nodes in a neighborhood, without requiring any kind of computationally intensive matrix operation (such as inversion) or depending on knowing the graph structure upfront. In this way, we address several key challenges of spectral-based graph neural networks simultaneously, and make our model readily applicable to inductive as well as transductive problems. Our GAT models have achieved or matched state-of-the-art results across four established transductive and inductive graph benchmarks: the Cora, Citeseer and Pubmed citation network datasets, as well as a protein-protein interaction dataset (wherein test graphs remain unseen during training).}, bibtype = {article}, author = {Veličković, Petar and Casanova, Arantxa and Liò, Pietro and Cucurull, Guillem and Romero, Adriana and Bengio, Yoshua}, journal = {6th International Conference on Learning Representations, ICLR 2018 - Conference Track Proceedings} }
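A single masked self-attention head of the kind described above can be sketched in NumPy as follows; the dimensions, LeakyReLU slope, and dense loops are illustrative only.

import numpy as np

def gat_attention(h, adj, W, a):
    # Score each edge with LeakyReLU(a^T [Wh_i || Wh_j]), softmax over neighbourhoods, then aggregate.
    z = h @ W
    n = z.shape[0]
    scores = np.full((n, n), -np.inf)
    for i in range(n):
        for j in range(n):
            if adj[i, j] or i == j:  # attend over the neighbourhood including the node itself
                e = a @ np.concatenate([z[i], z[j]])
                scores[i, j] = e if e > 0 else 0.2 * e
    alpha = np.exp(scores - scores.max(axis=1, keepdims=True))
    alpha = alpha / alpha.sum(axis=1, keepdims=True)
    return alpha @ z

N, F, Fp = 5, 4, 8
h = np.random.randn(N, F)
adj = (np.random.rand(N, N) < 0.5).astype(float)
out = gat_attention(h, adj, np.random.randn(F, Fp), np.random.randn(2 * Fp))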
@article{ title = {Iterative Transformer Network for 3D Point Cloud}, type = {article}, year = {2018}, websites = {http://arxiv.org/abs/1811.11209}, id = {148ccf7b-d29c-3f63-b81e-1e988d3bed59}, created = {2021-07-01T07:40:22.877Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-01T07:40:27.399Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {3D point cloud is an efficient and flexible representation of 3D structures. Recently, neural networks operating on point clouds have shown superior performance on 3D understanding tasks such as shape classification and part segmentation. However, performance on such tasks is evaluated on complete shapes aligned in a canonical frame, while real world 3D data are partial and unaligned. A key challenge in learning from partial, unaligned point cloud data is to learn features that are invariant or equivariant with respect to geometric transformations. To address this challenge, we propose the Iterative Transformer Network (IT-Net), a network module that canonicalizes the pose of a partial object with a series of 3D rigid transformations predicted in an iterative fashion. We demonstrate the efficacy of IT-Net as an anytime pose estimator from partial point clouds without using complete object models. Further, we show that IT-Net achieves superior performance over alternative 3D transformer networks on various tasks, such as partial shape classification and object part segmentation.}, bibtype = {article}, author = {Yuan, Wentao and Held, David and Mertz, Christoph and Hebert, Martial} }
@article{ title = {DeepLab: Semantic Image Segmentation with Deep Convolutional Nets, Atrous Convolution, and Fully Connected CRFs}, type = {article}, year = {2018}, keywords = {Convolutional neural networks,atrous convolution,conditional random fields,semantic segmentation}, pages = {834-848}, volume = {40}, id = {da5e1349-f3ed-3267-8bcd-85dfee5c2c6e}, created = {2021-07-01T07:40:23.000Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-01T07:40:29.854Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {In this work we address the task of semantic image segmentation with Deep Learning and make three main contributions that are experimentally shown to have substantial practical merit. First, we highlight convolution with upsampled filters, or 'atrous convolution', as a powerful tool in dense prediction tasks. Atrous convolution allows us to explicitly control the resolution at which feature responses are computed within Deep Convolutional Neural Networks. It also allows us to effectively enlarge the field of view of filters to incorporate larger context without increasing the number of parameters or the amount of computation. Second, we propose atrous spatial pyramid pooling (ASPP) to robustly segment objects at multiple scales. ASPP probes an incoming convolutional feature layer with filters at multiple sampling rates and effective fields-of-views, thus capturing objects as well as image context at multiple scales. Third, we improve the localization of object boundaries by combining methods from DCNNs and probabilistic graphical models. The commonly deployed combination of max-pooling and downsampling in DCNNs achieves invariance but has a toll on localization accuracy. We overcome this by combining the responses at the final DCNN layer with a fully connected Conditional Random Field (CRF), which is shown both qualitatively and quantitatively to improve localization performance. Our proposed 'DeepLab' system sets the new state-of-art at the PASCAL VOC-2012 semantic image segmentation task, reaching 79.7 percent mIOU in the test set, and advances the results on three other datasets: PASCAL-Context, PASCAL-Person-Part, and Cityscapes. All of our code is made publicly available online.}, bibtype = {article}, author = {Chen, Liang Chieh and Papandreou, George and Kokkinos, Iasonas and Murphy, Kevin and Yuille, Alan L.}, doi = {10.1109/TPAMI.2017.2699184}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, number = {4} }
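Atrous (dilated) convolution and the ASPP module described above have a compact PyTorch expression; the rates and channel counts below are illustrative, not DeepLab's configuration.

import torch
import torch.nn as nn

class ASPP(nn.Module):
    # Parallel 3x3 convolutions with different dilation rates probe several effective fields of view.
    def __init__(self, c_in, c_out, rates=(6, 12, 18)):
        super().__init__()
        self.branches = nn.ModuleList(
            [nn.Conv2d(c_in, c_out, 3, padding=r, dilation=r) for r in rates]
        )

    def forward(self, x):
        return torch.cat([torch.relu(b(x)) for b in self.branches], dim=1)

y = ASPP(64, 32)(torch.randn(1, 64, 33, 33))
print(y.shape)  # spatial size is preserved because padding equals the dilation rate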
@article{ title = {GaAN: Gated Attention Networks for Learning on Large and Spatiotemporal Graphs}, type = {article}, year = {2018}, id = {4944a453-67a6-3e0a-824a-2cb9a64bcc90}, created = {2021-07-12T09:25:31.791Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-17T05:17:00.606Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {48e9a80d-67a5-450d-9b08-c7bc934154e8,20ccb950-fef9-4ee1-800c-a60ba9f1df16,70eb910f-9399-46d8-a4d0-ade5435237b7}, private_publication = {false}, bibtype = {article}, author = {Zhang, Jiani and Shi, Xingjian and Xie, Junyuan and Ma, Hao and King, Irwin and Yeung, Dit-Yan} }
@article{ title = {Neural tangent kernel: Convergence and generalization in neural networks}, type = {article}, year = {2018}, pages = {8571-8580}, volume = {2018-Decem}, id = {e9e41cfb-ace5-3c18-8322-7cf1f445ca7e}, created = {2021-07-12T14:15:35.522Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:16:45.676Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {At initialization, artificial neural networks (ANNs) are equivalent to Gaussian processes in the infinite-width limit (12; 9), thus connecting them to kernel methods. We prove that the evolution of an ANN during training can also be described by a kernel: during gradient descent on the parameters of an ANN, the network function fθ (which maps input vectors to output vectors) follows the kernel gradient of the functional cost (which is convex, in contrast to the parameter cost) w.r.t. a new kernel: the Neural Tangent Kernel (NTK). This kernel is central to describe the generalization features of ANNs. While the NTK is random at initialization and varies during training, in the infinite-width limit it converges to an explicit limiting kernel and it stays constant during training. This makes it possible to study the training of ANNs in function space instead of parameter space. Convergence of the training can then be related to the positive-definiteness of the limiting NTK. We then focus on the setting of least-squares regression and show that in the infinite-width limit, the network function fθ follows a linear differential equation during training. The convergence is fastest along the largest kernel principal components of the input data with respect to the NTK, hence suggesting a theoretical motivation for early stopping. Finally we study the NTK numerically, observe its behavior for wide networks, and compare it to the infinite-width limit.}, bibtype = {article}, author = {Jacot, Arthur and Gabriel, Franck and Hongler, Clément}, journal = {Advances in Neural Information Processing Systems}, number = {5} }
@article{ title = {Loss surfaces, mode connectivity, and fast ensembling of DNNs}, type = {article}, year = {2018}, pages = {8789-8798}, volume = {2018-Decem}, id = {091c705b-ce73-3839-9698-d1ced5f5c6c8}, created = {2021-07-12T14:15:35.851Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:16:46.848Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {The loss functions of deep neural networks are complex and their geometric properties are not well understood. We show that the optima of these complex loss functions are in fact connected by simple curves over which training and test accuracy are nearly constant. We introduce a training procedure to discover these high-accuracy pathways between modes. Inspired by this new geometric insight, we also propose a new ensembling method entitled Fast Geometric Ensembling (FGE). Using FGE we can train high-performing ensembles in the time required to train a single model. We achieve improved performance compared to the recent state-of-the-art Snapshot Ensembles, on CIFAR-10, CIFAR-100, and ImageNet.}, bibtype = {article}, author = {Garipov, Timur and Izmailov, Pavel and Podoprikhin, Dmitrii and Vetrov, Dmitry and Wilson, Andrew Gordon}, journal = {Advances in Neural Information Processing Systems}, number = {Nips} }
@article{ title = {PU-Net: Point Cloud Upsampling Network}, type = {article}, year = {2018}, pages = {2790-2799}, id = {379b4a41-da4d-3eac-8776-bd71c56c532a}, created = {2021-07-21T12:34:44.839Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-26T12:19:40.296Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Yu2018}, folder_uuids = {4f36a0a5-b08a-4f70-b020-4daf83cb0507}, private_publication = {false}, abstract = {Learning and analyzing 3D point clouds with deep networks is challenging due to the sparseness and irregularity of the data. In this paper, we present a data-driven point cloud upsampling technique. The key idea is to learn multi-level features per point and expand the point set via a multi-branch convolution unit implicitly in feature space. The expanded feature is then split to a multitude of features, which are then reconstructed to an upsampled point set. Our network is applied at a patch-level, with a joint loss function that encourages the upsampled points to remain on the underlying surface with a uniform distribution. We conduct various experiments using synthesis and scan data to evaluate our method and demonstrate its superiority over some baseline methods and an optimization-based method. Results show that our upsampled points have better uniformity and are located closer to the underlying surfaces.}, bibtype = {article}, author = {Yu, Lequan and Li, Xianzhi and Fu, Chi Wing and Cohen-Or, Daniel and Heng, Pheng Ann}, doi = {10.1109/CVPR.2018.00295}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Local Spectral Graph Convolution for Point Set Feature Learning}, type = {article}, year = {2018}, keywords = {Clustering,Deep learning,Graph convolution,Point set features,Spectral coordinates,Spectral filtering}, pages = {56-71}, volume = {11208 LNCS}, id = {90c3925a-0f49-3e4b-894d-8dceab653f25}, created = {2021-08-04T09:51:19.993Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-21T13:25:20.644Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Wang2018}, private_publication = {false}, abstract = {Feature learning on point clouds has shown great promise, with the introduction of effective and generalizable deep learning frameworks such as pointnet++. Thus far, however, point features have been abstracted in an independent and isolated manner, ignoring the relative layout of neighboring points as well as their features. In the present article, we propose to overcome this limitation by using spectral graph convolution on a local graph, combined with a novel graph pooling strategy. In our approach, graph convolution is carried out on a nearest neighbor graph constructed from a point’s neighborhood, such that features are jointly learned. We replace the standard max pooling step with a recursive clustering and pooling strategy, devised to aggregate information from within clusters of nodes that are close to one another in their spectral coordinates, leading to richer overall feature descriptors. Through extensive experiments on diverse datasets, we show a consistent demonstrable advantage for the tasks of both point set classification and segmentation. Our implementations are available at https://github.com/fate3439/LocalSpecGCN.}, bibtype = {article}, author = {Wang, Chu and Samari, Babak and Siddiqi, Kaleem}, doi = {10.1007/978-3-030-01225-0_4}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {RGCNN: Regularized graph Cnn for point cloud segmentation}, type = {article}, year = {2018}, keywords = {Graph CNN,Graph-signal smoothness prior,Point cloud segmentation,Updated graph Laplacian}, pages = {746-754}, id = {c85e9855-b18a-3bbd-82d3-2de6b5c92c56}, created = {2021-08-04T13:05:08.030Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-23T08:37:26.076Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Te2018}, private_publication = {false}, abstract = {Point cloud, an efficient 3D object representation, has become popular with the development of depth sensing and 3D laser scanning techniques. It has attracted attention in various applications such as 3D tele-presence, navigation for unmanned vehicles and heritage reconstruction. The understanding of point clouds, such as point cloud segmentation, is crucial in exploiting the informative value of point clouds for such applications. Due to the irregularity of the data format, previous deep learning works often convert point clouds to regular 3D voxel grids or collections of images before feeding them into neural networks, which leads to voluminous data and quantization artifacts. In this paper, we instead propose a regularized graph convolutional neural network (RGCNN) that directly consumes point clouds. Leveraging on spectral graph theory, we treat features of points in a point cloud as signals on graph, and define the convolution over graph by Chebyshev polynomial approximation. In particular, we update the graph Laplacian matrix that describes the connectivity of features in each layer according to the corresponding learned features, which adaptively captures the structure of dynamic graphs. Further, we deploy a graph-signal smoothness prior in the loss function, thus regularizing the learning process. Experimental results on the ShapeNet part dataset show that the proposed approach significantly reduces the computational complexity while achieving competitive performance with the state of the art. Also, experiments show RGCNN is much more robust to both noise and point cloud density in comparison with other methods. We further apply RGCNN to point cloud classification and achieve competitive results on ModelNet40 dataset.}, bibtype = {article}, author = {Te, Gusi and Zheng, Amin and Hu, Wei and Guo, Zongming}, doi = {10.1145/3240508.3240621}, journal = {MM 2018 - Proceedings of the 2018 ACM Multimedia Conference} }
@article{ title = {MotifNet: A Motif-Based Graph Convolutional Network for Directed Graphs}, type = {article}, year = {2018}, keywords = {Directed Graphs,Geometric Deep Learning,Graph Convolutional Neural Networks,Graph Motifs}, pages = {225-228}, publisher = {IEEE}, id = {1b6f5ff7-f222-3b72-b647-01d0de82afbd}, created = {2021-08-04T13:05:08.164Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:23.275Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Deep learning on graphs and in particular, graph convolutional neural networks, have recently attracted significant attention in the machine learning community. Many of such techniques explore the analogy between the graph Laplacian eigenvectors and the classical Fourier basis, allowing to formulate the convolution as a multiplication in the spectral domain. One of the key drawbacks of spectral CNNs is their explicit assumption of an undirected graph, leading to a symmetric Laplacian matrix with orthogonal eigendecomposition. In this work we propose MotifNet, a graph CNN capable of dealing with directed graphs by exploiting local graph motifs. We present experimental evidence showing the advantage of our approach on real data.}, bibtype = {article}, author = {Monti, Federico and Otness, Karl and Bronstein, Michael M.}, doi = {10.1109/DSW.2018.8439897}, journal = {2018 IEEE Data Science Workshop, DSW 2018 - Proceedings}, number = {1} }
@article{ title = {Spectral Multigraph Networks for Discovering and Fusing Relationships in Molecules}, type = {article}, year = {2018}, pages = {1-11}, websites = {http://arxiv.org/abs/1811.09595}, id = {5fd66d0c-7397-34ae-9cff-9f469a6ed809}, created = {2021-08-04T13:05:08.165Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-04T13:05:35.698Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Spectral Graph Convolutional Networks (GCNs) are a generalization of convolutional networks to learning on graph-structured data. Applications of spectral GCNs have been successful, but limited to a few problems where the graph is fixed, such as shape correspondence and node classification. In this work, we address this limitation by revisiting a particular family of spectral graph networks, Chebyshev GCNs, showing its efficacy in solving graph classification tasks with a variable graph structure and size. Chebyshev GCNs restrict graphs to have at most one edge between any pair of nodes. To this end, we propose a novel multigraph network that learns from multi-relational graphs. We model learned edges with abstract meaning and experiment with different ways to fuse the representations extracted from annotated and learned edges, achieving competitive results on a variety of chemical classification benchmarks.}, bibtype = {article}, author = {Knyazev, Boris and Lin, Xiao and Amer, Mohamed R. and Taylor, Graham W.} }
@article{ title = {A Graph-CNN for 3D Point Cloud Classification}, type = {article}, year = {2018}, keywords = {3D point cloud data,Graph convolutional neural networks,Graph signal processing,Supervised learning}, pages = {6279-6283}, volume = {2018-April}, publisher = {IEEE}, id = {c44f676b-2b33-377a-af9f-7c871a0a3ddc}, created = {2021-08-18T13:14:09.644Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-21T13:25:20.667Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Zhang2018}, private_publication = {false}, abstract = {Graph convolutional neural networks (Graph-CNNs) extend traditional CNNs to handle data that is supported on a graph. Major challenges when working with data on graphs are that the support set (the vertices of the graph) do not typically have a natural ordering, and in general, the topology of the graph is not regular (i.e., vertices do not all have the same number of neighbors). Thus, Graph-CNNs have huge potential to deal with 3D point cloud data which has been obtained from sampling a manifold. In this paper we develop a Graph-CNN for classifying 3D point cloud data, called PointGCN1. The architecture combines localized graph convolutions with two types of graph downsampling operations (also known as pooling). By the effective exploration of the point cloud local structure using the Graph-CNN, the proposed architecture achieves competitive performance on the 3D object classification benchmark ModelNet, and our architecture is more stable than competing schemes.}, bibtype = {article}, author = {Zhang, Yingxue and Rabbat, Michael}, doi = {10.1109/ICASSP.2018.8462291}, journal = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings} }
@article{ title = {Few-shot learning with graph neural networks}, type = {article}, year = {2018}, pages = {1-13}, id = {53c0ffe4-795a-3ee8-b96e-25eadae847ce}, created = {2021-08-20T10:21:49.210Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:24.527Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {We propose to study the problem of few-shot learning with the prism of inference on a partially observed graphical model, constructed from a collection of input images whose label can be either observed or not. By assimilating generic message-passing inference algorithms with their neural-network counterparts, we define a graph neural network architecture that generalizes several of the recently proposed few-shot learning models. Besides providing improved numerical performance, our framework is easily extended to variants of few-shot learning, such as semi-supervised or active learning, demonstrating the ability of graph-based models to operate well on ‘relational’ tasks.}, bibtype = {article}, author = {Garcia, Victor and Bruna, Joan}, journal = {6th International Conference on Learning Representations, ICLR 2018 - Conference Track Proceedings} }
@article{ title = {Fast resampling of three-dimensional point clouds via graphs}, type = {article}, year = {2018}, keywords = {3D point clouds,Contour detection,Graph filtering,Graph signal processing,Registration,Sampling,Shape modeling,Visualization}, pages = {666-681}, volume = {66}, id = {78056046-ca15-3a46-af30-f76456ed2497}, created = {2021-08-29T22:27:22.077Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-06-15T10:39:20.705Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {To reduce the cost of storing, processing, and visualizing a large-scale point cloud, we propose a randomized resampling strategy that selects a representative subset of points while preserving application-dependent features. The strategy is based on graphs, which can represent underlying surfaces and lend themselves well to efficient computation. We use a general feature-extraction operator to represent application-dependent features and propose a general reconstruction error to evaluate the quality of resampling; by minimizing the error, we obtain a general form of optimal resampling distribution. The proposed resampling distribution is guaranteed to be shift-, rotation- and scale-invariant in the three-dimensional space. We then specify the feature-extraction operator to be a graph filter and study specific resampling strategies based on all-pass, low-pass, high-pass graph filtering and graph filter banks. We validate the proposed methods on three applications: Large-scale visualization, accurate registration, and robust shape modeling demonstrating the effectiveness and efficiency of the proposed resampling methods.}, bibtype = {article}, author = {Chen, Siheng and Tian, Dong and Feng, Chen and Vetro, Anthony and Kovačević, Jelena}, doi = {10.1109/TSP.2017.2771730}, journal = {IEEE Transactions on Signal Processing}, number = {3} }
@article{ title = {SpiderCNN: Deep learning on point sets with parameterized convolutional filters}, type = {article}, year = {2018}, keywords = {Convolutional neural network,Parametrized convolutional filters,Point clouds}, pages = {90-105}, volume = {11212 LNCS}, id = {e9cd7f85-f553-338e-be17-367c08b9833b}, created = {2021-08-29T22:46:33.384Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-12T13:34:19.200Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Deep neural networks have enjoyed remarkable success for various vision tasks, however it remains challenging to apply CNNs to domains lacking a regular underlying structures such as 3D point clouds. Towards this we propose a novel convolutional architecture, termed SpiderCNN, to efficiently extract geometric features from point clouds. SpiderCNN is comprised of units called SpiderConv, which extend convolutional operations from regular grids to irregular point sets that can be embedded in Rn, by parametrizing a family of convolutional filters. We design the filter as a product of a simple step function that captures local geodesic information and a Taylor polynomial that ensures the expressiveness. SpiderCNN inherits the multi-scale hierarchical architecture from classical CNNs, which allows it to extract semantic deep features. Experiments on ModelNet40 demonstrate that SpiderCNN achieves state-of-the-art accuracy 92.4% on standard benchmarks, and shows competitive performance on segmentation task.}, bibtype = {article}, author = {Xu, Yifan and Fan, Tianqi and Xu, Mingye and Zeng, Long and Qiao, Yu}, doi = {10.1007/978-3-030-01237-3_6}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {Variational Autoencoders: A Brief Survey}, type = {article}, year = {2018}, pages = {1-9}, id = {c8184d19-010a-346e-98cb-015aa133bda8}, created = {2021-09-02T05:25:53.247Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-05T05:35:08.448Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Mittal2018}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,8ab28090-4ad8-4e96-920f-bd4344219976}, private_publication = {false}, bibtype = {article}, author = {Mittal, Mayank and Behl, Harkirat} }
@article{ title = {3DTI-Net: Learn Inner Transform Invariant 3D Geometry Features using Dynamic GCN}, type = {article}, year = {2018}, websites = {http://arxiv.org/abs/1812.06254}, id = {7c14e0c1-d4e8-37f2-bdf5-ebb6c96bad86}, created = {2021-09-02T06:33:40.752Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-06-15T10:39:20.722Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Pan2018}, private_publication = {false}, abstract = {Deep learning on point clouds has made a lot of progress recently. Many point cloud dedicated deep learning frameworks, such as PointNet and PointNet++, have shown advantages in accuracy and speed comparing to those using traditional 3D convolution algorithms. However, nearly all of these methods face a challenge, since the coordinates of the point cloud are decided by the coordinate system, they cannot handle the problem of 3D transform invariance properly. In this paper, we propose a general framework for point cloud learning. We achieve transform invariance by learning inner 3D geometry feature based on local graph representation, and propose a feature extraction network based on graph convolution network. Through experiments on classification and segmentation tasks, our method achieves state-of-the-art performance in rotated 3D object classification, and achieve competitive performance with the state-of-the-art in classification and segmentation tasks with fixed coordinate value.}, bibtype = {article}, author = {Pan, Guanghua and Wang, Jun and Ying, Rendong and Liu, Peilin}, number = {1} }
@article{ title = {Adaptive graph convolutional neural networks}, type = {article}, year = {2018}, keywords = {Machine Learning Methods Track}, pages = {3546-3553}, id = {16ebce46-a660-3149-8e12-60b65ad7333e}, created = {2021-09-02T06:33:40.862Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-02T06:34:00.816Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Graph Convolutional Neural Networks (Graph CNNs) are generalizations of classical CNNs to handle graph data such as molecular data, point could and social networks. Current filters in graph CNNs are built for fixed and shared graph structure. However, for most real data, the graph structures varies in both size and connectivity. The paper proposes a generalized and flexible graph CNN taking data of arbitrary graph structure as input. In that way a task-driven adaptive graph is learned for each graph data while training. To efficiently learn the graph, a distance metric learning is proposed. Extensive experiments on nine graph-structured datasets have demonstrated the superior performance improvement on both convergence speed and predictive accuracy.}, bibtype = {article}, author = {Li, Ruoyu and Wang, Sheng and Zhu, Feiyun and Huang, Junzhou}, journal = {32nd AAAI Conference on Artificial Intelligence, AAAI 2018} }
@article{ title = {FoldingNet: Point Cloud Auto-encoder via Deep Grid Deformation}, type = {article}, year = {2018}, pages = {206-215}, volume = {3}, id = {fb769862-27b6-3935-8cd5-00ad470c190e}, created = {2021-09-02T06:33:40.867Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.472Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Yang2018a}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a,1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, bibtype = {article}, author = {Yang, Yaoqing and Feng, Chen and Shen, Yiru and Tian, Dong}, journal = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)} }
@article{ title = {General-Purpose Deep Point Cloud Feature Extractor}, type = {article}, year = {2018}, pages = {1972-1981}, volume = {2018-Janua}, publisher = {IEEE}, id = {61335ff6-44f1-3e1b-8456-c415513eaf19}, created = {2021-09-02T06:33:40.878Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:21.919Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Depth sensors used in autonomous driving and gaming systems often report back 3D point clouds. The lack of structure from these sensors does not allow these systems to take advantage of recent advances in convolutional neural networks which are dependent upon traditional filtering and pooling operations. Analogous to image based convolutional architectures, recently introduced graph based architectures afford similar filtering and pooling operations on arbitrary graphs. We adopt these graph based methods to 3D point clouds to introduce a generic vector representation of 3D graphs, we call graph 3D (G3D). We believe we are the first to use large scale transfer learning on 3D point cloud data and demonstrate the discriminant power of our salient latent representation of 3D point clouds on unforeseen test sets. By using our G3D network (G3DNet) as a feature extractor, and then pairing G3D feature vectors with a standard classifier, we achieve the best accuracy on ModelNet10 (93.1%) and ModelNet 40 (91.7%) for a graph network, and comparable performance on the Sydney Urban Objects dataset to other methods. This general-purpose feature extractor can be used as an off-the-shelf component in other 3D scene understanding or object tracking works.}, bibtype = {article}, author = {Dominguez, Miguel and Dhamdhere, Rohan and Petkar, Atir and Jain, Saloni and Sah, Shagan and Ptucha, Raymond}, doi = {10.1109/WACV.2018.00218}, journal = {Proceedings - 2018 IEEE Winter Conference on Applications of Computer Vision, WACV 2018} }
@article{ title = {Mining Point Cloud Local Structures by Kernel Correlation and Graph Pooling}, type = {article}, year = {2018}, pages = {4548-4557}, id = {0b904ed1-aa8f-35f2-8710-f8b3265e3761}, created = {2021-09-02T06:33:40.985Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.174Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Shen2018}, private_publication = {false}, abstract = {Unlike on images, semantic learning on 3D point clouds using a deep network is challenging due to the naturally unordered data structure. Among existing works, PointNet has achieved promising results by directly learning on point sets. However, it does not take full advantage of a point's local neighborhood that contains fine-grained structural information which turns out to be helpful towards better semantic learning. In this regard, we present two new operations to improve PointNet with a more efficient exploitation of local structures. The first one focuses on local 3D geometric structures. In analogy to a convolution kernel for images, we define a point-set kernel as a set of learnable 3D points that jointly respond to a set of neighboring data points according to their geometric affinities measured by kernel correlation, adapted from a similar technique for point cloud registration. The second one exploits local high-dimensional feature structures by recursive feature aggregation on a nearest-neighbor-graph computed from 3D positions. Experiments show that our network can efficiently capture local information and robustly achieve better performances on major datasets. Our code is available at http://www.merl.com/research/license#KCNet.}, bibtype = {article}, author = {Shen, Yiru and Feng, Chen and Yang, Yaoqing and Tian, Dong}, doi = {10.1109/CVPR.2018.00478}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Multiresolution tree networks for 3D point cloud processing}, type = {article}, year = {2018}, pages = {105-122}, volume = {11211 LNCS}, id = {59442f29-2b48-36eb-abc6-084e830a5429}, created = {2021-09-09T14:35:21.119Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-21T09:29:24.816Z}, read = {true}, starred = {true}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Gadelha2018}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,1853f94b-7af1-40fa-b068-4758e9a02bc4,103fae48-b63f-495b-9265-9049d2927097,8efc2fe0-ed07-4348-a865-9f1a22b45934,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {We present multiresolution tree-structured networks to process point clouds for 3D shape understanding and generation tasks. Our network represents a 3D shape as a set of locality-preserving 1D ordered list of points at multiple resolutions. This allows efficient feed-forward processing through 1D convolutions, coarse-to-fine analysis through a multi-grid architecture, and it leads to faster convergence and small memory footprint during training. The proposed tree-structured encoders can be used to classify shapes and outperform existing point-based architectures on shape classification benchmarks, while tree-structured decoders can be used for generating point clouds directly and they outperform existing approaches for image-to-shape inference tasks learned using the ShapeNet dataset. Our model also allows unsupervised learning of point-cloud based shapes by using a variational autoencoder, leading to higher-quality generated shapes.}, bibtype = {article}, author = {Gadelha, Matheus and Wang, Rui and Maji, Subhransu}, doi = {10.1007/978-3-030-01234-2_7}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {Variational Autoencoders for Deforming 3D Mesh Models}, type = {article}, year = {2018}, pages = {5841-5850}, id = {a2b5c17e-72a3-3176-bf6c-b2de31c595e3}, created = {2021-09-09T14:35:21.230Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:10.048Z}, read = {true}, starred = {true}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Tan2018}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,8efc2fe0-ed07-4348-a865-9f1a22b45934,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {3D geometric contents are becoming increasingly popular. In this paper, we study the problem of analyzing deforming 3D meshes using deep neural networks. Deforming 3D meshes are flexible to represent 3D animation sequences as well as collections of objects of the same category, allowing diverse shapes with large-scale non-linear deformations. We propose a novel framework which we call mesh variational autoencoders (mesh VAE), to explore the probabilistic latent space of 3D surfaces. The framework is easy to train, and requires very few training examples. We also propose an extended model which allows flexibly adjusting the significance of different latent variables by altering the prior distribution. Extensive experiments demonstrate that our general framework is able to learn a reasonable representation for a collection of deformable shapes, and produce competitive results for a variety of applications, including shape generation, shape interpolation, shape space embedding and shape exploration, outperforming state-of-the-art methods.}, bibtype = {article}, author = {Tan, Qingyang and Gao, Lin and Lai, Yu Kun and Xia, Shihong}, doi = {10.1109/CVPR.2018.00612}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {A Probe Towards Understanding GAN and VAE Models}, type = {article}, year = {2018}, websites = {http://arxiv.org/abs/1812.05676}, id = {412cbe9c-a847-3eb1-bd74-4a7204e7babd}, created = {2021-09-29T06:15:29.616Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.430Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Mi2018}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,8ab28090-4ad8-4e96-920f-bd4344219976}, private_publication = {false}, abstract = {This project report compares some known GAN and VAE models proposed prior to 2017. There has been significant progress after we finished this report. We upload this report as an introduction to generative models and provide some personal interpretations supported by empirical evidence. Both generative adversarial network models and variational autoencoders have been widely used to approximate probability distributions of data sets. Although they both use parametrized distributions to approximate the underlying data distribution, whose exact inference is intractable, their behaviors are very different. We summarize our experiment results that compare these two categories of models in terms of fidelity and mode collapse. We provide a hypothesis to explain their different behaviors and propose a new model based on this hypothesis. We further tested our proposed model on MNIST dataset and CelebA dataset.}, bibtype = {article}, author = {Mi, Lu and Shen, Macheng and Zhang, Jingzhao} }
@article{ title = {3D Object Dense Reconstruction from a Single Depth View}, type = {article}, year = {2018}, keywords = {3D Reconstruction,Adversarial Learning,Conditional GAN,Gallium nitride,Image reconstruction,Periodic structures,Shape,Shape Completion,Shape inpainting,Single Depth View,Solid modeling,Task analysis,Three-dimensional displays}, pages = {679-688}, id = {ee4362da-84d4-345f-9a68-fb846c26cf68}, created = {2021-09-29T10:16:08.709Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T17:25:41.185Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Yang2018}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,103fae48-b63f-495b-9265-9049d2927097,a6db5ca6-7f95-48a4-bc40-9e41eea78434,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {In this paper, we propose a novel approach, 3D-RecGAN++, which reconstructs the complete 3D structure of a given object from a single arbitrary depth view using generative adversarial networks. Unlike existing work which typically requires multiple views of the same object or class labels to recover the full 3D geometry, the proposed 3D-RecGAN++ only takes the voxel grid representation of a depth view of the object as input, and is able to generate the complete 3D occupancy grid with a high resolution of 256^3 by recovering the occluded/missing regions. The key idea is to combine the generative capabilities of autoencoders and the conditional Generative Adversarial Networks (GAN) framework, to infer accurate and fine-grained 3D structures of objects in high-dimensional voxel space. Extensive experiments on large synthetic datasets and real-world Kinect datasets show that the proposed 3D-RecGAN++ significantly outperforms the state of the art in single view 3D object reconstruction, and is able to reconstruct unseen types of objects.}, bibtype = {article}, author = {Yang, Bo and Rosa, Stefano and Markham, Andrew and Trigoni, Niki and Wen, Hongkai}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence} }
@article{ title = {Cross-Modal Deep Variational Hand Pose Estimation}, type = {article}, year = {2018}, pages = {89-98}, id = {32dd5cce-07c4-3080-8b52-5c260ab6cda3}, created = {2021-09-29T10:16:08.858Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.474Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Spurr2018}, folder_uuids = {a6db5ca6-7f95-48a4-bc40-9e41eea78434,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {The human hand moves in complex and high-dimensional ways, making estimation of 3D hand pose configurations from images alone a challenging task. In this work we propose a method to learn a statistical hand model represented by a cross-modal trained latent space via a generative deep neural network. We derive an objective function from the variational lower bound of the VAE framework and jointly optimize the resulting cross-modal KL-divergence and the posterior reconstruction objective, naturally admitting a training regime that leads to a coherent latent space across multiple modalities such as RGB images, 2D keypoint detections or 3D hand configurations. Additionally, it grants a straightforward way of using semi-supervision. This latent space can be directly used to estimate 3D hand poses from RGB images, outperforming the state-of-the art in different settings. Furthermore, we show that our proposed method can be used without changes on depth images and performs comparably to specialized methods. Finally, the model is fully generative and can synthesize consistent pairs of hand configurations across modalities. We evaluate our method on both RGB and depth datasets and analyze the latent space qualitatively.}, bibtype = {article}, author = {Spurr, Adrian and Song, Jie and Park, Seonwook and Hilliges, Otmar}, doi = {10.1109/CVPR.2018.00017}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {MT-VAE: Learning Motion Transformations to Generate Multimodal Human Dynamics}, type = {article}, year = {2018}, pages = {276-293}, volume = {11209 LNCS}, id = {4877da99-688f-387b-bece-c58c518744e0}, created = {2021-09-29T10:16:08.955Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.866Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Yan2018}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,a6db5ca6-7f95-48a4-bc40-9e41eea78434,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {Long-term human motion can be represented as a series of motion modes—motion sequences that capture short-term temporal dynamics—with transitions between them. We leverage this structure and present a novel Motion Transformation Variational Auto-Encoders (MT-VAE) for learning motion sequence generation. Our model jointly learns a feature embedding for motion modes (that the motion sequence can be reconstructed from) and a feature transformation that represents the transition of one motion mode to the next motion mode. Our model is able to generate multiple diverse and plausible motion sequences in the future from the same input. We apply our approach to both facial and full body motion, and demonstrate applications like analogy-based motion transfer and video synthesis.}, bibtype = {article}, author = {Yan, Xinchen and Rastogi, Akash and Villegas, Ruben and Sunkavalli, Kalyan and Shechtman, Eli and Hadap, Sunil and Yumer, Ersin and Lee, Honglak}, doi = {10.1007/978-3-030-01228-1_17}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, number = {1} }
@article{ title = {Supplementary Material for SPLATNet: Sparse Lattice Networks for Point Cloud Processing}, type = {article}, year = {2018}, pages = {2-4}, websites = {http://openaccess.thecvf.com/content_cvpr_2018/papers/Su_SPLATNet_Sparse_Lattice_CVPR_2018_paper.pdf}, id = {3094a4aa-c988-3a6e-8f67-9d9fc96ef056}, created = {2021-10-12T07:10:34.215Z}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-12T07:10:39.126Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d54ba66b-a8cf-41de-8e2d-c3256f322e07}, private_publication = {false}, abstract = {We present a network architecture for processing point clouds that directly operates on a collection of points represented as a sparse set of samples in a high-dimensional lattice. Naïvely applying convolutions on this lattice scales poorly, both in terms of memory and computational cost, as the size of the lattice increases. Instead, our network uses sparse bilateral convolutional layers as building blocks. These layers maintain efficiency by using indexing structures to apply convolutions only on occupied parts of the lattice, and allow flexible specifications of the lattice structure enabling hierarchical and spatially-aware feature learning, as well as joint 2D-3D reasoning. Both point-based and image-based representations can be easily incorporated in a network with such layers and the resulting model can be trained in an end-to-end manner. We present results on 3D segmentation tasks where our approach outperforms existing state-of-the-art techniques.}, bibtype = {article}, author = {Su, Hang and Jampani, Varun and Sun, Deqing and Maji, Subhransu and Kalogerakis, Evangelos and Yang, Ming-Hsuan and Kautz, Jan}, journal = {CVPR} }
@article{ title = {Deeper insights into graph convolutional networks for semi-supervised learning}, type = {article}, year = {2018}, pages = {3538-3545}, id = {689c9e36-5599-39e8-b40e-add08f784f09}, created = {2021-11-15T11:20:01.783Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-15T11:20:05.263Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Many interesting problems in machine learning are being revisited with new deep learning tools. For graph-based semi-supervised learning, a recent important development is graph convolutional networks (GCNs), which nicely integrate local vertex features and graph topology in the convolutional layers. Although the GCN model compares favorably with other state-of-the-art methods, its mechanisms are not clear and it still requires considerable amount of labeled data for validation and model selection. In this paper, we develop deeper insights into the GCN model and address its fundamental limits. First, we show that the graph convolution of the GCN model is actually a special form of Laplacian smoothing, which is the key reason why GCNs work, but it also brings potential concerns of over-smoothing with many convolutional layers. Second, to overcome the limits of the GCN model with shallow architectures, we propose both co-training and self-training approaches to train GCNs. Our approaches significantly improve GCNs in learning with very few labels, and exempt them from requiring additional labels for validation. Extensive experiments on benchmarks have verified our theory and proposals.}, bibtype = {article}, author = {Li, Qimai and Han, Zhichao and Wu, Xiao Ming}, journal = {32nd AAAI Conference on Artificial Intelligence, AAAI 2018} }
@article{ title = {Learning representations and generative models for 3D point clouds}, type = {article}, year = {2018}, pages = {67-85}, volume = {1}, id = {0f3f1c5d-d7c9-31dd-a7c6-c05b3c11438a}, created = {2021-11-26T10:09:16.292Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.404Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Achlioptas2018}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a,cd02f564-0123-4236-a320-b339927f085a}, private_publication = {false}, abstract = {Three-dimensional geometric data offer an excellent domain for studying representation learning and generative modeling. In this paper, we look at geometric data represented as point clouds. We introduce a deep AutoEncoder (AE) network with state-of-the-art reconstruction quality and generalization ability. The learned representations outperform existing methods on 3D recognition tasks and enable shape editing via simple algebraic manipulations, such as semantic part editing, shape analogies and shape interpolation, as well as shape completion. We perform a thorough study of different generative models including GANs operating on the raw point clouds, significantly improved GANs trained in the fixed latent space of our AEs, and Gaussian Mixture Models (GMMs). To quantitatively evaluate generative models we introduce measures of sample fidelity and diversity based on matchings between sets of point clouds. Interestingly, our evaluation of generalization, fidelity and diversity reveals that GMMs trained in the latent space of our AEs yield the best results overall.}, bibtype = {article}, author = {Achlioptas, Panos and Diamanti, Olga and Mitliagkas, Ioannis and Guibas, Leonidas}, journal = {35th International Conference on Machine Learning, ICML 2018} }
@article{ title = {Adversarial Attack on Graph Structured Data}, type = {article}, year = {2018}, websites = {http://ml.cs.tsinghua.edu.cn/~jun/pub/graph-attack.pdf}, id = {04689214-67fb-3a4a-8ee8-331fc97147d8}, created = {2022-01-05T09:23:16.008Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:23:45.038Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Deep learning on graph structures has shown exciting results in various applications. However, few attentions have been paid to the robustness of such models, in contrast to numerous research work for image or text adversarial attack and defense. In this paper, we focus on the adver-sarial attacks that fool deep learning models by modifying the combinatorial structure of data. We first propose a reinforcement learning based attack method that learns the generalizable attack policy, while only requiring prediction labels from the target classifier. We further propose attack methods based on genetic algorithms and gradient descent in the scenario where additional prediction confidence or gradients are available. We use both synthetic and real-world data to show that, a family of Graph Neural Network models are vulnerable to these attacks, in both graph-level and node-level classification tasks. We also show such attacks can be used to diagnose the learned classifiers.}, bibtype = {article}, author = {Dai, Hanjun and Li, Hui and Tian, Tian and Huang, Xin and Wang, Lin and Zhu, Jun and Song, Le} }
@article{ title = {Adversarial Attack and Defense on Graph Data: A Survey}, type = {article}, year = {2018}, pages = {1-18}, websites = {http://arxiv.org/abs/1812.10528}, id = {078a756b-d426-33ac-ad86-f7bdf3d39969}, created = {2022-01-05T09:23:16.498Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:04.950Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Deep neural networks (DNNs) have been widely applied to various applications including image classification, text generation, audio recognition, and graph data analysis. However, recent studies have shown that DNNs are vulnerable to adversarial attacks. Though there are several works studying adversarial attack and defense strategies on domains such as images and natural language processing, it is still difficult to directly transfer the learned knowledge to graph structure data due to its representation challenges. Given the importance of graph analysis, an increasing number of works start to analyze the robustness of machine learning models on graph data. Nevertheless, current studies considering adversarial behaviors on graph data usually focus on specific types of attacks with certain assumptions. In addition, each work proposes its own mathematical formulation which makes the comparison among different methods difficult. Therefore, in this paper, we aim to survey existing adversarial learning strategies on graph data and first provide a unified formulation for adversarial learning on graph data which covers most adversarial learning studies on graph. Moreover, we also compare different attacks and defenses on graph data and discuss their corresponding contributions and limitations. In this work, we systemically organize the considered works based on the features of each topic. This survey not only serves as a reference for the research community, but also brings a clear image researchers outside this research domain. Besides, we also create an online resource and keep updating the relevant papers during the last two years. More details of the comparisons of various studies based on this survey are open-sourced at https://github.com/YingtongDou/graph-adversarial-learning-literature.}, bibtype = {article}, author = {Sun, Lichao and Dou, Yingtong and Yang, Carl and Wang, Ji and Yu, Philip S. and He, Lifang and Li, Bo} }
@article{ title = {Obfuscated gradients give a false sense of security: Circumventing defenses to adversarial examples}, type = {article}, year = {2018}, keywords = {Adversarial Examples,Deep Learning,Neural Networks,Robustness}, pages = {436-448}, volume = {1}, id = {41bcba3e-f4f2-38d4-a1a0-42299acba571}, created = {2022-01-05T09:23:16.616Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:00.604Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {We identify obfuscated gradients, a kind of gradient masking, as a phenomenon that leads to a false sense of security in defenses against adversarial examples. While defenses that cause obfuscated gradients appear to defeat iterative optimization-based attacks, we find defenses relying on this effect can be circumvented. We describe characteristic behaviors of defenses exhibiting the effect, and for each of the three types of obfuscated gradients we discover, we develop attack techniques to overcome it. In a case study, examining non-certified white-box-secure defenses at ICLR 2018, we find obfuscated gradients are a common occurrence, with 7 of 9 defenses relying on obfuscated gradients. Our new attacks successfully circumvent 6 completely, and 1 partially, in the original threat model each paper considers.}, bibtype = {article}, author = {Athalye, Anish and Carlini, Nicholas and Wagner, David}, journal = {35th International Conference on Machine Learning, ICML 2018} }
@article{ title = {Towards deep learning models resistant to adversarial attacks}, type = {article}, year = {2018}, pages = {1-28}, id = {7fd11aed-73c8-3a7c-a47c-829388e3be9c}, created = {2022-01-05T09:23:16.658Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-05T09:24:13.626Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {Recent work has demonstrated that neural networks are vulnerable to adversarial examples, i.e., inputs that are almost indistinguishable from natural data and yet classified incorrectly by the network. To address this problem, we study the adversarial robustness of neural networks through the lens of robust optimization. This approach provides us with a broad and unifying view on much prior work on this topic. Its principled nature also enables us to identify methods for both training and attacking neural networks that are reliable and, in a certain sense, universal. In particular, they specify a concrete security guarantee that would protect against a well-defined class of adversaries. These methods let us train networks with significantly improved resistance to a wide range of adversarial attacks. They also suggest robustness against a first-order adversary as a natural security guarantee. We believe that robustness against such well-defined classes of adversaries is an important stepping stone towards fully resistant deep learning models.}, bibtype = {article}, author = {Madry, Aleksander and Makelov, Aleksandar and Schmidt, Ludwig and Tsipras, Dimitris and Vladu, Adrian}, journal = {6th International Conference on Learning Representations, ICLR 2018 - Conference Track Proceedings} }
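Reading note for the entry above: the robust-optimization view described in its abstract is commonly summarized by the saddle-point objective below, with the inner maximization approximated by projected gradient descent (PGD); the notation here is the generic textbook form, reproduced as a reference rather than quoted from the paper.

\[ \min_{\theta} \; \mathbb{E}_{(x,y)\sim\mathcal{D}} \Big[\, \max_{\delta \in \mathcal{S}} L(\theta, x+\delta, y) \,\Big], \qquad x^{t+1} = \Pi_{x+\mathcal{S}}\!\Big( x^{t} + \alpha\, \mathrm{sign}\big(\nabla_x L(\theta, x^{t}, y)\big) \Big) \]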
@inproceedings{ title = {VoxelNet: End-to-End Learning for Point Cloud Based 3D Object Detection}, type = {inproceedings}, year = {2018}, pages = {4490-4499}, id = {c472a0bc-79a2-3fa2-bd4c-63b3e8463b5d}, created = {2022-01-18T11:51:00.983Z}, accessed = {2022-01-18}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-19T16:11:26.108Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {Accurate detection of objects in 3D point clouds is a central problem in many applications, such as autonomous navigation, housekeeping robots, and augmented/virtual reality. To interface a highly sparse LiDAR point cloud with a region proposal network (RPN), most existing efforts have focused on hand-crafted feature representations, for example, a bird's eye view projection. In this work, we remove the need of manual feature engineering for 3D point clouds and propose VoxelNet, a generic 3D detection network that unifies feature extraction and bounding box prediction into a single stage, end-to-end trainable deep network. Specifically, VoxelNet divides a point cloud into equally spaced 3D voxels and transforms a group of points within each voxel into a unified feature representation through the newly introduced voxel feature encoding (VFE) layer. In this way, the point cloud is encoded as a descriptive volumetric representation, which is then connected to a RPN to generate detections. Experiments on the KITTI car detection benchmark show that VoxelNet outperforms the state-of-the-art LiDAR based 3D detection methods by a large margin. Furthermore, our network learns an effective discriminative representation of objects with various geometries, leading to encouraging results in 3D detection of pedestrians and cyclists, based on only LiDAR.}, bibtype = {inproceedings}, author = {Zhou, Yin and Tuzel, Oncel}, doi = {10.1109/CVPR.2018.00472}, booktitle = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
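Reading note for the entry above: the voxel feature encoding (VFE) layer mentioned in the abstract combines a shared per-point transform with an element-wise max over the points of a voxel, then concatenates the aggregated feature back to every point. A minimal NumPy sketch of that pattern, for orientation only (the plain linear-plus-ReLU stand-in, names and shapes are assumptions, not the authors' code):

import numpy as np

def vfe_layer(points, W, b):
    # points: (N, C_in) features of the points inside one voxel
    # W, b:   shared per-point linear layer mapping C_in -> C_out / 2
    pointwise = np.maximum(points @ W + b, 0.0)               # shared FC + ReLU applied to every point
    voxelwise = pointwise.max(axis=0, keepdims=True)          # element-wise max over the voxel
    tiled = np.repeat(voxelwise, pointwise.shape[0], axis=0)  # broadcast the voxel context back to each point
    return np.concatenate([pointwise, tiled], axis=1)         # (N, C_out) point-wise + locally aggregated features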
@article{ title = {CNN for IMU assisted odometry estimation using velodyne LiDAR}, type = {article}, year = {2018}, pages = {71-77}, id = {68cc52fd-1a2f-3674-8c45-90f67e6253ce}, created = {2022-01-19T09:08:51.298Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.240Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Velas2018}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a}, private_publication = {false}, abstract = {We introduce a novel method for odometry estimation using convolutional neural networks from 3D LiDAR scans. The original sparse data are encoded into 2D matrices for the training of the proposed networks and for the prediction. Our networks show significantly better precision in the estimation of translational motion parameters compared with the state-of-the-art method LOAM, while achieving real-time performance. Together with IMU support, high-quality odometry estimation and LiDAR data registration are realized. Moreover, we propose alternative CNNs trained for the prediction of rotational motion parameters, achieving results also comparable with the state of the art. The proposed method can replace wheel encoders in odometry estimation or supplement missing GPS data when the GNSS signal is absent (e.g., during indoor mapping). Our solution brings real-time performance and precision which are useful for providing an online preview of the mapping results and verification of the map completeness in real time.}, bibtype = {article}, author = {Velas, Martin and Spanel, Michal and Hradis, Michal and Herout, Adam}, doi = {10.1109/ICARSC.2018.8374163}, journal = {18th IEEE International Conference on Autonomous Robot Systems and Competitions, ICARSC 2018}, number = {621439} }
@article{ title = {Introvae: Introspective variational autoencoders for photographic image synthesis}, type = {article}, year = {2018}, pages = {52-63}, volume = {2018-Decem}, websites = {https://github.com/hhb072/IntroVAE}, id = {17f1e66f-dbc8-3639-b78b-404701f4bcc3}, created = {2022-01-25T08:11:53.505Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.430Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Huang2018}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a}, private_publication = {false}, abstract = {We present a novel introspective variational autoencoder (IntroVAE) model for synthesizing high-resolution photographic images. IntroVAE is capable of self-evaluating the quality of its generated samples and improving itself accordingly. Its inference and generator models are jointly trained in an introspective way. On one hand, the generator is required to reconstruct the input images from the noisy outputs of the inference model as normal VAEs. On the other hand, the inference model is encouraged to classify between the generated and real samples while the generator tries to fool it as GANs. These two famous generative frameworks are integrated in a simple yet efficient single-stream architecture that can be trained in a single stage. IntroVAE preserves the advantages of VAEs, such as stable training and nice latent manifold. Unlike most other hybrid models of VAEs and GANs, IntroVAE requires no extra discriminators, because the inference model itself serves as a discriminator to distinguish between the generated and real samples. Experiments demonstrate that our method produces high-resolution photo-realistic images (e.g., CELEBA images at 1024²), which are comparable to or better than the state-of-the-art GANs.}, bibtype = {article}, author = {Huang, Huaibo and Li, Zhihang and He, Ran and Sun, Zhenan and Tan, Tieniu}, journal = {Advances in Neural Information Processing Systems}, number = {Nips} }
@article{ title = {PPF-FoldNet: Unsupervised Learning of Rotation Invariant 3D Local Descriptors}, type = {article}, year = {2018}, keywords = {3D deep learning,Descriptors,Local features,Rotation invariance}, pages = {620-638}, volume = {11209 LNCS}, websites = {https://arxiv.org/abs/1808.10322v1}, month = {8}, publisher = {Springer Verlag}, day = {30}, id = {a5392ea2-faaf-306b-8076-4fc7f8adf823}, created = {2022-02-15T12:19:28.244Z}, accessed = {2022-02-15}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-15T12:19:32.245Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {We present PPF-FoldNet for unsupervised learning of 3D local descriptors on pure point cloud geometry. Based on the folding-based auto-encoding of well known point pair features, PPF-FoldNet offers many desirable properties: it necessitates neither supervision, nor a sensitive local reference frame, benefits from point-set sparsity, is end-to-end, fast, and can extract powerful rotation invariant descriptors. Thanks to a novel feature visualization, its evolution can be monitored to provide interpretable insights. Our extensive experiments demonstrate that despite having six degree-of-freedom invariance and lack of training labels, our network achieves state of the art results in standard benchmark datasets and outperforms its competitors when rotations and varying point densities are present. PPF-FoldNet achieves $9\%$ higher recall on standard benchmarks, $23\%$ higher recall when rotations are introduced into the same datasets and finally, a margin of $>35\%$ is attained when point density is significantly decreased.}, bibtype = {article}, author = {Deng, Haowen and Birdal, Tolga and Ilic, Slobodan}, doi = {10.1007/978-3-030-01228-1_37}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
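Reading note for the entry above: the point pair features (PPFs) that this auto-encoder folds over are the classic four-dimensional descriptors built from two points and their normals. A small NumPy sketch of that construction, for orientation only (not the authors' implementation):

import numpy as np

def angle(u, v):
    # unsigned angle between two vectors, clipped for numerical safety
    c = np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
    return np.arccos(np.clip(c, -1.0, 1.0))

def point_pair_feature(p1, n1, p2, n2):
    # (||d||, angle(n1, d), angle(n2, d), angle(n1, n2)) for the oriented pair (p1, p2)
    d = p2 - p1
    return np.array([np.linalg.norm(d), angle(n1, d), angle(n2, d), angle(n1, n2)])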
@article{ title = {PPFNet: Global Context Aware Local Features for Robust 3D Point Matching}, type = {article}, year = {2018}, pages = {195-205}, websites = {https://arxiv.org/abs/1802.02669v2}, month = {2}, publisher = {IEEE Computer Society}, day = {7}, id = {a874fdf5-a627-3270-ac7a-62f0fcbef2da}, created = {2022-02-15T12:20:19.095Z}, accessed = {2022-02-15}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-31T06:33:43.595Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {We present PPFNet - Point Pair Feature NETwork for deeply learning a globally informed 3D local feature descriptor to find correspondences in unorganized point clouds. PPFNet learns local descriptors on pure geometry and is highly aware of the global context, an important cue in deep learning. Our 3D representation is computed as a collection of point-pair-features combined with the points and normals within a local vicinity. Our permutation invariant network design is inspired by PointNet and sets PPFNet to be ordering-free. As opposed to voxelization, our method is able to consume raw point clouds to exploit the full sparsity. PPFNet uses a novel N-tuple loss and architecture injecting the global information naturally into the local descriptor. It shows that context awareness also boosts the local feature representation. Qualitative and quantitative evaluations of our network suggest increased recall, improved robustness and invariance as well as a vital step in the 3D descriptor extraction performance.}, bibtype = {article}, author = {Deng, Haowen and Birdal, Tolga and Ilic, Slobodan}, doi = {10.1109/CVPR.2018.00028}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {The Perfect Match: 3D Point Cloud Matching with Smoothed Densities}, type = {article}, year = {2018}, keywords = {3D from Multiview and Sensors,Categorization,Recognition: Detection,Representation Learning,Retrieval,Scene Analysis and Understanding}, pages = {5540-5549}, volume = {2019-June}, websites = {https://arxiv.org/abs/1811.06879v3}, month = {11}, publisher = {IEEE Computer Society}, day = {16}, id = {d4a377ac-ed79-3a7f-8b3d-8d7a06685cf1}, created = {2022-02-15T12:23:53.324Z}, accessed = {2022-02-15}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-31T06:33:43.745Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {We propose 3DSmoothNet, a full workflow to match 3D point clouds with a siamese deep learning architecture and fully convolutional layers using a voxelized smoothed density value (SDV) representation. The latter is computed per interest point and aligned to the local reference frame (LRF) to achieve rotation invariance. Our compact, learned, rotation invariant 3D point cloud descriptor achieves 94.9% average recall on the 3DMatch benchmark data set, outperforming the state-of-the-art by more than 20 percentage points with only 32 output dimensions. This very low output dimension allows for near real-time correspondence search with 0.1 ms per feature point on a standard PC. Our approach is sensor- and scene-agnostic because of SDV, LRF and learning highly descriptive features with fully convolutional layers. We show that 3DSmoothNet trained only on RGB-D indoor scenes of buildings achieves 79.0% average recall on laser scans of outdoor vegetation, more than double the performance of our closest, learning-based competitors. Code, data and pre-trained models are available online at https://github.com/zgojcic/3DSmoothNet.}, bibtype = {article}, author = {Gojcic, Zan and Zhou, Caifa and Wegner, Jan D. and Wieser, Andreas}, doi = {10.1109/CVPR.2019.00569}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Isolating Sources of Disentanglement in VAEs}, type = {article}, year = {2018}, id = {f1256e0a-2953-3340-967f-6ab5765668ce}, created = {2022-02-23T06:27:58.910Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.393Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Chen2018}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, bibtype = {article}, author = {Chen, Ricky T Q and Li, Xuechen and Grosse, Roger and Duvenaud, David}, number = {NeurIPS} }
@article{ title = {Disentangling by Factorising}, type = {article}, year = {2018}, id = {bc9a83aa-00a8-334d-a34c-196af4260307}, created = {2022-02-23T06:27:58.955Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.720Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Kim2018}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, bibtype = {article}, author = {Kim, Hyunjik and Mnih, Andriy} }
@article{ title = {Automatic segmentation of tree structure from point cloud data}, type = {article}, year = {2018}, keywords = {RGB-D perception,Robotics in agriculture and forestry,object detection,segmentation and categorization}, pages = {3043-3050}, volume = {3}, publisher = {IEEE}, id = {2a07ce41-4b36-3f0a-a473-e5b16a7e9247}, created = {2022-02-23T07:17:31.793Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-25T10:37:33.321Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {5041aa39-a3cf-45bd-ada3-df1401e124f1}, private_publication = {false}, abstract = {Methods for capturing and modeling vegetation, such as trees or plants, typically distinguish between two components - branch skeleton and foliage. Current methods do not provide quantitatively accurate tree structure and foliage density needed for applications such as visualization, inspection, or to estimate vegetation parameters. This letter describes an automatic method for segmenting three-dimensional point cloud data of vegetation, acquired from commodity scanners, into its two main components: branches and leaves, by using geometric features computed directly on the point cloud. In this letter, the specific type of vegetation considered is broadleaf trees. We present a data-driven approach, where a Random forest classifier is used for segmentation. In contrast to state-of-the-art methods, the point cloud is not reduced to a set of primitives such as cylinders. Instead, the algorithm works at the level of the input point cloud itself, preserving quantitative accuracy in the resulting model. Computation of typical vegetation metrics follows naturally from this model. We achieve an average classification accuracy of 91% on simulated data across three different species of broadleaf trees. Qualitative results on real data are also presented.}, bibtype = {article}, author = {Digumarti, Sundara Tejaswi and Nieto, Juan and Cadena, Cesar and Siegwart, Roland and Beardsley, Paul}, doi = {10.1109/LRA.2018.2849499}, journal = {IEEE Robotics and Automation Letters}, number = {4} }
@article{ title = {Large-Scale Point Cloud Semantic Segmentation with Superpoint Graphs}, type = {article}, year = {2018}, pages = {4558-4567}, id = {8aafda9c-e4ea-352f-be4b-a1c5e40ba7d8}, created = {2022-02-24T07:10:18.730Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-24T13:55:40.920Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {1e7b477c-c241-48c3-a542-ad06e3d39dd5}, private_publication = {false}, abstract = {We propose a novel deep learning-based framework to tackle the challenge of semantic segmentation of large-scale point clouds of millions of points. We argue that the organization of 3D point clouds can be efficiently captured by a structure called superpoint graph (SPG), derived from a partition of the scanned scene into geometrically homogeneous elements. SPGs offer a compact yet rich representation of contextual relationships between object parts, which is then exploited by a graph convolutional network. Our framework sets a new state of the art for segmenting outdoor LiDAR scans (+11.9 and +8.8 mIoU points for both Semantic3D test sets), as well as indoor scans (+12.4 mIoU points for the S3DIS dataset).}, bibtype = {article}, author = {Landrieu, Loic and Simonovsky, Martin}, doi = {10.1109/CVPR.2018.00479}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Semantic Segmentation of Geometric Primitives in Dense 3D Point Clouds}, type = {article}, year = {2018}, keywords = {Computer Vision,Computer vision tasks,Computing methodologies,Kernel methods,Machine learning approaches,Mixed / Augmented Reality,Scene Understanding,Support vector machines}, pages = {206-211}, id = {fae6821b-5684-3b1f-bc45-d3850d33fe41}, created = {2022-03-09T09:35:04.518Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-09T09:35:10.956Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {1e7b477c-c241-48c3-a542-ad06e3d39dd5}, private_publication = {false}, abstract = {This paper presents an approach to semantic segmentation and structural modeling from dense 3D point clouds. The core contribution is an efficient method for fitting of geometric primitives based on machine learning. First, the dense 3D point cloud is acquired together with RGB images on a mobile handheld device. Then, RANSAC is used to estimate the presence of geometric primitives, followed by an evaluation of their fit based on classification of the fitting parameters. Finally, the approach iterates over successive frames to optimize the fitting parameters or replace a detected primitive by a better fitting one. As a result, we obtain a semantic model of the scene consisting of a set of geometric primitives. We evaluate the approach on an extensive set of scenarios and show its plausibility in augmented reality applications.}, bibtype = {article}, author = {Stanescu, Ana and Fleck, Philipp and Schmalstieg, Dieter and Arth, Clemens}, doi = {10.1109/ISMAR-Adjunct.2018.00068}, journal = {Adjunct Proceedings - 2018 IEEE International Symposium on Mixed and Augmented Reality, ISMAR-Adjunct 2018} }
@article{ title = {3D Point Capsule Networks}, type = {article}, year = {2018}, keywords = {Categorization,Deep Learning,RGBD sensors and analytics,Recognition: Detection,Representation Learning,Retrieval,Robotics}, pages = {1009-1018}, volume = {2019-June}, websites = {https://arxiv.org/abs/1812.10775v2}, month = {12}, publisher = {IEEE Computer Society}, day = {27}, id = {f4e073bf-50e5-310c-aafc-d79c85df9585}, created = {2022-03-28T07:16:35.167Z}, accessed = {2022-03-28}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T07:16:41.721Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {In this paper, we propose 3D point-capsule networks, an auto-encoder designed to process sparse 3D point clouds while preserving spatial arrangements of the input data. 3D capsule networks arise as a direct consequence of our novel unified 3D auto-encoder formulation. Their dynamic routing scheme and the peculiar 2D latent space deployed by our approach bring in improvements for several common point cloud-related tasks, such as object classification, object reconstruction and part segmentation as substantiated by our extensive evaluations. Moreover, it enables new applications such as part interpolation and replacement.}, bibtype = {article}, author = {Zhao, Yongheng and Birdal, Tolga and Deng, Haowen and Tombari, Federico}, doi = {10.48550/arxiv.1812.10775}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {FFJORD: Free-form Continuous Dynamics for Scalable Reversible Generative Models}, type = {article}, year = {2018}, keywords = {Computer Science - Computer Vision and Pattern Rec,Computer Science - Machine Learning,Statistics - Machine Learning}, websites = {http://arxiv.org/abs/1810.01367}, month = {10}, id = {06cd7f32-a634-3513-a727-b7053a332f3d}, created = {2022-03-28T09:45:01.739Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:02:41.658Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {grathwohlFFJORDFreeformContinuous2018}, source_type = {article}, short_title = {FFJORD}, notes = {arXiv: 1810.01367}, private_publication = {false}, abstract = {A promising class of generative models maps points from a simple distribution to a complex distribution through an invertible neural network. Likelihood-based training of these models requires restricting their architectures to allow cheap computation of Jacobian determinants. Alternatively, the Jacobian trace can be used if the transformation is specified by an ordinary differential equation. In this paper, we use Hutchinson's trace estimator to give a scalable unbiased estimate of the log-density. The result is a continuous-time invertible generative model with unbiased density estimation and one-pass sampling, while allowing unrestricted neural network architectures. We demonstrate our approach on high-dimensional density estimation, image generation, and variational inference, achieving the state-of-the-art among exact likelihood methods with efficient sampling.}, bibtype = {article}, author = {Grathwohl, Will and Chen, Ricky T Q and Bettencourt, Jesse and Sutskever, Ilya and Duvenaud, David}, journal = {arXiv:1810.01367 [cs, stat]} }
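Reading note for the entry above: the scalable log-density computation this abstract refers to rests on the instantaneous change of variables combined with Hutchinson's trace estimator; in standard notation (reproduced here as a reference, not verbatim from the paper):

\[ \frac{\partial \log p(z(t))}{\partial t} = -\,\mathrm{Tr}\!\left(\frac{\partial f}{\partial z(t)}\right), \qquad \mathrm{Tr}(A) = \mathbb{E}_{\epsilon}\big[\epsilon^{\top} A\, \epsilon\big] \ \ \text{with}\ \ \mathbb{E}[\epsilon]=0,\ \mathrm{Cov}(\epsilon)=I. \]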
@inproceedings{ title = {3D Fetal Skull Reconstruction from 2DUS via Deep Conditional Generative Networks}, type = {inproceedings}, year = {2018}, keywords = {Fetal ultrasound,Generative model,Variational autoencoder}, pages = {383-391}, publisher = {Springer International Publishing}, city = {Cham}, series = {Lecture Notes in Computer Science}, id = {e59341b6-2d4c-3b03-a312-6bbd290c24ff}, created = {2022-03-28T09:45:01.746Z}, file_attached = {false}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:01.746Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {cerrolaza3DFetalSkull2018}, source_type = {inproceedings}, private_publication = {false}, abstract = {2D ultrasound (US) is the primary imaging modality in antenatal healthcare. Despite the limitations of traditional 2D biometrics to characterize the true 3D anatomy of the fetus, the adoption of 3DUS is still very limited. This is particularly significant in developing countries and remote areas, due to the lack of experienced sonographers and the limited access to 3D technology. In this paper, we present a new deep conditional generative network for the 3D reconstruction of the fetal skull from 2DUS standard planes of the head routinely acquired during the fetal screening process. Based on the generative properties of conditional variational autoencoders (CVAE), our reconstruction architecture (REC-CVAE) directly integrates the three US standard planes as conditional variables to generate a unified latent space of the skull. Additionally, we propose HiREC-CVAE, a hierarchical generative network based on the different clinical relevance of each predictive view. The hierarchical structure of HiREC-CVAE allows the network to learn a sequence of nested latent spaces, providing superior predictive capabilities even in the absence of some of the 2DUS scans. The performance of the proposed architectures was evaluated on a dataset of 72 cases, showing accurate reconstruction capabilities from standard non-registered 2DUS images.}, bibtype = {inproceedings}, author = {Cerrolaza, Juan J and Li, Yuanwei and Biffi, Carlo and Gomez, Alberto and Sinclair, Matthew and Matthew, Jacqueline and Knight, Caronline and Kainz, Bernhard and Rueckert, Daniel}, editor = {Frangi, Alejandro F and Schnabel, Julia A and Davatzikos, Christos and Alberola-López, Carlos and Fichtinger, Gabor}, doi = {10.1007/978-3-030-00928-1_44}, booktitle = {Medical Image Computing and Computer Assisted Intervention – MICCAI 2018} }
@article{ title = {FiLM: Visual Reasoning with a General Conditioning Layer}, type = {article}, year = {2018}, keywords = {Language and Vision}, volume = {32}, websites = {https://ojs.aaai.org/index.php/AAAI/article/view/11671}, month = {4}, id = {5dca248e-5b80-3a41-9320-eb25047c6949}, created = {2022-03-28T09:45:01.754Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:02:39.156Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {perezFiLMVisualReasoning2018}, source_type = {article}, short_title = {FiLM}, notes = {Number: 1}, private_publication = {false}, abstract = {We introduce a general-purpose conditioning method for neural networks called FiLM: Feature-wise Linear Modulation. FiLM layers influence neural network computation via a simple, feature-wise affine transformation based on conditioning information. We show that FiLM layers are highly effective for visual reasoning - answering image-related questions which require a multi-step, high-level process - a task which has proven difficult for standard deep learning methods that do not explicitly model reasoning. Specifically, we show on visual reasoning tasks that FiLM layers 1) halve state-of-the-art error for the CLEVR benchmark, 2) modulate features in a coherent manner, 3) are robust to ablations and architectural modifications, and 4) generalize well to challenging, new data from few examples or even zero-shot.}, bibtype = {article}, author = {Perez, Ethan and Strub, Florian and Vries, Harm de and Dumoulin, Vincent and Courville, Aaron}, journal = {Proceedings of the AAAI Conference on Artificial Intelligence}, number = {1} }
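Reading note for the entry above: the feature-wise affine transformation described in the abstract is a one-liner once a conditioning network has produced per-channel scales and shifts; a minimal NumPy sketch (shapes and the function name are illustrative assumptions):

import numpy as np

def film(features, gamma, beta):
    # features: (C, H, W) activations of one layer
    # gamma, beta: (C,) modulation parameters predicted from the conditioning input
    return gamma[:, None, None] * features + beta[:, None, None]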
@article{ title = {Isolating Sources of Disentanglement in Variational Autoencoders}, type = {article}, year = {2018}, websites = {https://arxiv.org/abs/1802.04942v5}, month = {2}, id = {cadbac75-f2d0-3482-a13f-e5a01928323c}, created = {2022-03-28T09:45:01.766Z}, accessed = {2022-02-22}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:02:29.234Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {chenIsolatingSourcesDisentanglement2018}, source_type = {article}, private_publication = {false}, abstract = {We decompose the evidence lower bound to show the existence of a term measuring the total correlation between latent variables. We use this to motivate our $\beta$-TCVAE (Total Correlation Variational Autoencoder), a refinement of the state-of-the-art $\beta$-VAE objective for learning disentangled representations, requiring no additional hyperparameters during training. We further propose a principled classifier-free measure of disentanglement called the mutual information gap (MIG). We perform extensive quantitative and qualitative experiments, in both restricted and non-restricted settings, and show a strong relation between total correlation and disentanglement, when the latent variables model is trained using our framework.}, bibtype = {article}, author = {Chen, Ricky T Q and Li, Xuechen and Grosse, Roger and Duvenaud, David} }
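Reading note for the entry above: the decomposition that motivates the total correlation penalty is the standard split of the averaged KL term of the ELBO, written here in generic notation for reference:

\[ \mathbb{E}_{p(n)}\big[\mathrm{KL}\big(q(z\mid n)\,\|\,p(z)\big)\big] = I_q(z;n) \;+\; \mathrm{KL}\Big(q(z)\,\Big\|\,\prod\nolimits_j q(z_j)\Big) \;+\; \sum\nolimits_j \mathrm{KL}\big(q(z_j)\,\|\,p(z_j)\big), \]

where the middle term is the total correlation that the β-TCVAE objective up-weights.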
@inproceedings{ title = {A Large-scale RGB-D Database for Arbitrary-view Human Action Recognition}, type = {inproceedings}, year = {2018}, keywords = {arbitrary-view recognition,cross-view recognition,hri,human action recognition,rgb-d action database}, pages = {1510-1518}, websites = {https://doi.org/10.1145/3240508.3240675}, month = {10}, publisher = {Association for Computing Machinery}, city = {New York, NY, USA}, series = {MM '18}, id = {2b39676e-de0b-3147-965d-fbd15edbf11b}, created = {2022-03-28T09:45:02.078Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:02:53.520Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {jiLargescaleRGBDDatabase2018}, source_type = {inproceedings}, private_publication = {false}, abstract = {Current research mainly focuses on single-view and multi-view human action recognition, which can hardly satisfy the requirements of human-robot interaction (HRI) applications that must recognize actions from arbitrary views. The lack of databases also sets up barriers. In this paper, we newly collect a large-scale RGB-D action database for arbitrary-view action analysis, including RGB videos, depth and skeleton sequences. The database includes action samples captured in 8 fixed viewpoints and varying-view sequences which cover the entire 360° of view angles. In total, 118 persons are invited to act 40 action categories, and 25,600 video samples are collected. Our database involves more participants, more viewpoints and a large number of samples. More importantly, it is the first database containing the entire 360° varying-view sequences. The database provides sufficient data for cross-view and arbitrary-view action analysis. Besides, we propose a View-guided Skeleton CNN (VS-CNN) to tackle the problem of arbitrary-view action recognition. Experiment results show that the VS-CNN achieves superior performance.}, bibtype = {inproceedings}, author = {Ji, Yanli and Xu, Feixiang and Yang, Yang and Shen, Fumin and Shen, Heng Tao and Zheng, Wei-Shi}, doi = {10.1145/3240508.3240675}, booktitle = {Proceedings of the 26th ACM international conference on Multimedia} }
@article{ title = {GANs Trained by a Two Time-Scale Update Rule Converge to a Local Nash Equilibrium}, type = {article}, year = {2018}, keywords = {Computer Science - Machine Learning,Statistics - Machine Learning}, websites = {http://arxiv.org/abs/1706.08500}, month = {1}, id = {fbc14d2a-e54b-303f-ae58-9a6f7b5d90de}, created = {2022-03-28T09:45:02.107Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:03:20.872Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {heuselGANsTrainedTwo2018}, source_type = {article}, notes = {arXiv: 1706.08500}, private_publication = {false}, abstract = {Generative Adversarial Networks (GANs) excel at creating realistic images with complex models for which maximum likelihood is infeasible. However, the convergence of GAN training has still not been proved. We propose a two time-scale update rule (TTUR) for training GANs with stochastic gradient descent on arbitrary GAN loss functions. TTUR has an individual learning rate for both the discriminator and the generator. Using the theory of stochastic approximation, we prove that the TTUR converges under mild assumptions to a stationary local Nash equilibrium. The convergence carries over to the popular Adam optimization, for which we prove that it follows the dynamics of a heavy ball with friction and thus prefers flat minima in the objective landscape. For the evaluation of the performance of GANs at image generation, we introduce the "Fréchet Inception Distance" (FID) which captures the similarity of generated images to real ones better than the Inception Score. In experiments, TTUR improves learning for DCGANs and Improved Wasserstein GANs (WGAN-GP) outperforming conventional GAN training on CelebA, CIFAR-10, SVHN, LSUN Bedrooms, and the One Billion Word Benchmark.}, bibtype = {article}, author = {Heusel, Martin and Ramsauer, Hubert and Unterthiner, Thomas and Nessler, Bernhard and Hochreiter, Sepp}, journal = {arXiv:1706.08500 [cs, stat]} }
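Reading note for the entry above: the Fréchet Inception Distance introduced in this paper compares two Gaussians fitted to Inception activations of real and generated images; a compact NumPy/SciPy sketch of that formula (the function name and the assumption that means and covariances are precomputed are illustrative):

import numpy as np
from scipy.linalg import sqrtm

def frechet_inception_distance(mu_r, cov_r, mu_g, cov_g):
    # ||mu_r - mu_g||^2 + Tr(cov_r + cov_g - 2 (cov_r cov_g)^(1/2))
    covmean = sqrtm(cov_r @ cov_g)
    if np.iscomplexobj(covmean):      # discard tiny imaginary parts from numerical noise
        covmean = covmean.real
    diff = mu_r - mu_g
    return float(diff @ diff + np.trace(cov_r + cov_g - 2.0 * covmean))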
@inproceedings{ title = {Learning Interpretable Anatomical Features Through Deep Generative Models: Application to Cardiac Remodeling}, type = {inproceedings}, year = {2018}, pages = {464-471}, publisher = {Springer International Publishing}, city = {Cham}, series = {Lecture Notes in Computer Science}, id = {5894c067-0c04-3253-8f12-00a9af0ffff7}, created = {2022-03-28T09:45:02.901Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:04:26.781Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {biffiLearningInterpretableAnatomical2018}, source_type = {inproceedings}, short_title = {Learning Interpretable Anatomical Features }, private_publication = {false}, abstract = {Alterations in the geometry and function of the heart define well-established causes of cardiovascular disease. However, current approaches to the diagnosis of cardiovascular diseases often rely on subjective human assessment as well as manual analysis of medical images. Both factors limit the sensitivity in quantifying complex structural and functional phenotypes. Deep learning approaches have recently achieved success for tasks such as classification or segmentation of medical images, but lack interpretability in the feature extraction and decision processes, limiting their value in clinical diagnosis. In this work, we propose a 3D convolutional generative model for automatic classification of images from patients with cardiac diseases associated with structural remodeling. The model leverages interpretable task-specific anatomic patterns learned from 3D segmentations. It further allows to visualise and quantify the learned pathology-specific remodeling patterns in the original input space of the images. This approach yields high accuracy in the categorization of healthy and hypertrophic cardiomyopathy subjects when tested on unseen MR images from our own multi-centre dataset (100\%) as well on the ACDC MICCAI 2017 dataset (90\%). We believe that the proposed deep learning approach is a promising step towards the development of interpretable classifiers for the medical imaging domain, which may help clinicians to improve diagnostic accuracy and enhance patient risk-stratification.}, bibtype = {inproceedings}, author = {Biffi, Carlo and Oktay, Ozan and Tarroni, Giacomo and Bai, Wenjia and De Marvao, Antonio and Doumou, Georgia and Rajchl, Martin and Bedair, Reem and Prasad, Sanjay and Cook, Stuart and O’Regan, Declan and Rueckert, Daniel}, editor = {Frangi, Alejandro F and Schnabel, Julia A and Davatzikos, Christos and Alberola-López, Carlos and Fichtinger, Gabor}, doi = {10.1007/978-3-030-00934-2_52}, booktitle = {Medical Image Computing and Computer Assisted Intervention – MICCAI 2018} }
@article{ title = {Point2Sequence: Learning the Shape Representation of 3D Point Clouds with an Attention-based Sequence to Sequence Network}, type = {article}, year = {2018}, keywords = {Computer Science - Computer Vision and Pattern Rec}, websites = {http://arxiv.org/abs/1811.02565}, month = {11}, id = {2e06262a-58f1-3547-a8c4-58840165dece}, created = {2022-03-28T09:45:03.066Z}, accessed = {2022-03-17}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:04:46.466Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {liuPoint2SequenceLearningShape2018}, source_type = {article}, short_title = {Point2Sequence}, notes = {arXiv: 1811.02565}, private_publication = {false}, abstract = {Exploring contextual information in the local region is important for shape understanding and analysis. Existing studies often employ hand-crafted or explicit ways to encode contextual information of local regions. However, it is hard to capture fine-grained contextual information in hand-crafted or explicit manners, such as the correlation between different areas in a local region, which limits the discriminative ability of learned features. To resolve this issue, we propose a novel deep learning model for 3D point clouds, named Point2Sequence, to learn 3D shape features by capturing fine-grained contextual information in a novel implicit way. Point2Sequence employs a novel sequence learning model for point clouds to capture the correlations by aggregating multi-scale areas of each local region with attention. Specifically, Point2Sequence first learns the feature of each area scale in a local region. Then, it captures the correlation between area scales in the process of aggregating all area scales using a recurrent neural network (RNN) based encoder-decoder structure, where an attention mechanism is proposed to highlight the importance of different area scales. Experimental results show that Point2Sequence achieves state-of-the-art performance in shape classification and segmentation tasks.}, bibtype = {article}, author = {Liu, Xinhai and Han, Zhizhong and Liu, Yu-Shen and Zwicker, Matthias}, journal = {arXiv:1811.02565 [cs]} }
@inproceedings{ title = {Generating 3D Faces using Convolutional Mesh Autoencoders}, type = {inproceedings}, year = {2018}, pages = {704-720}, websites = {https://openaccess.thecvf.com/content_ECCV_2018/html/Anurag_Ranjan_Generating_3D_Faces_ECCV_2018_paper.html}, id = {d85a921d-0613-3293-9c28-dd44f9ecd9ec}, created = {2022-03-28T09:45:03.226Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:04:41.605Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {ranjanGenerating3DFaces2018}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Ranjan, Anurag and Bolkart, Timo and Sanyal, Soubhik and Black, Michael J} }
@inproceedings{ title = {A Variational Feature Encoding Method of 3D Object for Probabilistic Semantic SLAM}, type = {inproceedings}, year = {2018}, keywords = {Bayes methods,Probabilistic logic,Semantics,Shape,Simultaneous localization and mapping,Solid modeling,Three-dimensional displays}, pages = {3605-3612}, month = {10}, id = {2986f5d2-22a5-39cd-bcc2-dba1c98130d1}, created = {2022-03-28T09:45:03.265Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:05:12.818Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {yuVariationalFeatureEncoding2018}, source_type = {inproceedings}, notes = {ISSN: 2153-0866}, private_publication = {false}, abstract = {This paper presents a feature encoding method of complex 3D objects for high-level semantic features. Recent approaches to object recognition methods become important for semantic simultaneous localization and mapping (SLAM). However, there is a lack of consideration of the probabilistic observation model for 3D objects, as the shape of a 3D object basically follows a complex probability distribution. Furthermore, since the mobile robot equipped with a range sensor observes only a single view, much information of the object shape is discarded. These limitations are the major obstacles to semantic SLAM and view-independent loop closure using 3D object shapes as features. In order to enable the numerical analysis for the Bayesian inference, we approximate the true observation model of 3D objects to tractable distributions. Since the observation likelihood can be obtained from the generative model, we formulate the true generative model for 3D object with the Bayesian networks. To capture these complex distributions, we apply a variational auto-encoder. To analyze the approximated distributions and encoded features, we perform classification with maximum likelihood estimation and shape retrieval.}, bibtype = {inproceedings}, author = {Yu, H W and Lee, B H}, doi = {10.1109/IROS.2018.8593831}, booktitle = {2018 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)} }
@article{ title = {Understanding and Improving Interpolation in Autoencoders via an Adversarial Regularizer}, type = {article}, year = {2018}, keywords = {Computer Science - Machine Learning,Statistics - Machine Learning}, websites = {http://arxiv.org/abs/1807.07543}, month = {7}, id = {3193c473-6ce0-325c-8866-e0824c10b1ef}, created = {2022-03-28T09:45:03.279Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:05:26.328Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {berthelotUnderstandingImprovingInterpolation2018}, source_type = {article}, notes = {arXiv: 1807.07543}, private_publication = {false}, abstract = {Autoencoders provide a powerful framework for learning compressed representations by encoding all of the information needed to reconstruct a data point in a latent code. In some cases, autoencoders can "interpolate": By decoding the convex combination of the latent codes for two datapoints, the autoencoder can produce an output which semantically mixes characteristics from the datapoints. In this paper, we propose a regularization procedure which encourages interpolated outputs to appear more realistic by fooling a critic network which has been trained to recover the mixing coefficient from interpolated data. We then develop a simple benchmark task where we can quantitatively measure the extent to which various autoencoders can interpolate and show that our regularizer dramatically improves interpolation in this setting. We also demonstrate empirically that our regularizer produces latent codes which are more effective on downstream tasks, suggesting a possible link between interpolation abilities and learning useful representations.}, bibtype = {article}, author = {Berthelot, David and Raffel, Colin and Roy, Aurko and Goodfellow, Ian}, journal = {arXiv:1807.07543 [cs, stat]} }
@article{ title = {Neural scene representation and rendering}, type = {article}, year = {2018}, pages = {1204-1210}, volume = {360}, websites = {https://www.science.org/doi/full/10.1126/science.aar6170}, month = {6}, id = {be301fb6-71f3-38bb-a645-4952fb746c3e}, created = {2022-03-28T09:45:03.597Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:05:28.851Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {eslamiNeuralSceneRepresentation2018}, source_type = {article}, notes = {Publisher: American Association for the Advancement of Science}, private_publication = {false}, bibtype = {article}, author = {Eslami, S M Ali and Jimenez Rezende, Danilo and Besse, Frederic and Viola, Fabio and Morcos, Ari S and Garnelo, Marta and Ruderman, Avraham and Rusu, Andrei A and Danihelka, Ivo and Gregor, Karol and Reichert, David P and Buesing, Lars and Weber, Theophane and Vinyals, Oriol and Rosenbaum, Dan and Rabinowitz, Neil and King, Helen and Hillier, Chloe and Botvinick, Matt and Wierstra, Daan and Kavukcuoglu, Koray and Hassabis, Demis}, doi = {10.1126/science.aar6170}, journal = {Science}, number = {6394} }
@article{ title = {A guide to convolution arithmetic for deep learning}, type = {article}, year = {2018}, keywords = {Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Comput,Statistics - Machine Learning}, websites = {http://arxiv.org/abs/1603.07285}, month = {1}, id = {022df87d-5f6d-3839-8e10-73cbe008648a}, created = {2022-03-28T09:45:03.618Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:05:36.063Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {dumoulinGuideConvolutionArithmetic2018}, source_type = {article}, notes = {arXiv: 1603.07285}, private_publication = {false}, abstract = {We introduce a guide to help deep learning practitioners understand and manipulate convolutional neural network architectures. The guide clarifies the relationship between various properties (input shape, kernel shape, zero padding, strides and output shape) of convolutional, pooling and transposed convolutional layers, as well as the relationship between convolutional and transposed convolutional layers. Relationships are derived for various cases, and are illustrated in order to make them intuitive.}, bibtype = {article}, author = {Dumoulin, Vincent and Visin, Francesco}, journal = {arXiv:1603.07285 [cs, stat]} }
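Reading note for the entry above: the core relationships worked out in this guide reduce to two small formulas relating input size, kernel size, padding and stride; a sketch of them as Python helpers (the simple case without dilation or output padding, which the guide also covers):

def conv_output_size(i, k, p, s):
    # spatial output size of a convolution: floor((i + 2p - k) / s) + 1
    return (i + 2 * p - k) // s + 1

def transposed_conv_output_size(i, k, p, s):
    # matching relationship for the transposed convolution (zero output padding)
    return s * (i - 1) + k - 2 * p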
@article{ title = {Hyperspherical Variational Auto-Encoders}, type = {article}, year = {2018}, keywords = {Computer Science - Machine Learning,Statistics - Machine Learning}, websites = {http://arxiv.org/abs/1804.00891}, month = {9}, id = {80a3859e-084c-318b-8598-fbaedd606332}, created = {2022-03-28T09:45:03.638Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:05:33.433Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {davidsonHypersphericalVariationalAutoEncoders2018}, source_type = {article}, notes = {arXiv: 1804.00891}, private_publication = {false}, abstract = {The Variational Auto-Encoder (VAE) is one of the most used unsupervised machine learning models. But although the default choice of a Gaussian distribution for both the prior and posterior represents a mathematically convenient distribution often leading to competitive results, we show that this parameterization fails to model data with a latent hyperspherical structure. To address this issue we propose using a von Mises-Fisher (vMF) distribution instead, leading to a hyperspherical latent space. Through a series of experiments we show how such a hyperspherical VAE, or $\mathcal{S}$-VAE, is more suitable for capturing data with a hyperspherical latent structure, while outperforming a normal, $\mathcal{N}$-VAE, in low dimensions on other data types.}, bibtype = {article}, author = {Davidson, Tim R and Falorsi, Luca and De Cao, Nicola and Kipf, Thomas and Tomczak, Jakub M}, journal = {arXiv:1804.00891 [cs, stat]} }
@article{ title = {Variational image compression with a scale hyperprior}, type = {article}, year = {2018}, keywords = {Computer Science - Information Theory,Electrical Engineering and Systems Science - Imag}, websites = {http://arxiv.org/abs/1802.01436}, month = {5}, id = {c2ad8060-65ff-3140-aa08-50cca292c872}, created = {2022-03-28T09:45:04.129Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:06:51.950Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {balleVariationalImageCompression2018}, source_type = {article}, notes = {arXiv: 1802.01436}, private_publication = {false}, abstract = {We describe an end-to-end trainable model for image compression based on variational autoencoders. The model incorporates a hyperprior to effectively capture spatial dependencies in the latent representation. This hyperprior relates to side information, a concept universal to virtually all modern image codecs, but largely unexplored in image compression using artificial neural networks (ANNs). Unlike existing autoencoder compression methods, our model trains a complex prior jointly with the underlying autoencoder. We demonstrate that this model leads to state-of-the-art image compression when measuring visual quality using the popular MS-SSIM index, and yields rate-distortion performance surpassing published ANN-based methods when evaluated using a more traditional metric based on squared error (PSNR). Furthermore, we provide a qualitative comparison of models trained for different distortion metrics.}, bibtype = {article}, author = {Ballé, Johannes and Minnen, David and Singh, Saurabh and Hwang, Sung Jin and Johnston, Nick}, journal = {arXiv:1802.01436 [cs, eess, math]} }
@inproceedings{ title = {On the challenges of learning with inference networks on sparse, high-dimensional data}, type = {inproceedings}, year = {2018}, pages = {143-151}, websites = {https://proceedings.mlr.press/v84/krishnan18a.html}, month = {3}, publisher = {PMLR}, id = {53e3b24d-c236-3100-bb7f-9b78d0861b7b}, created = {2022-03-28T09:45:05.006Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:08:33.265Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {krishnanChallengesLearningInference2018}, source_type = {inproceedings}, notes = {ISSN: 2640-3498}, private_publication = {false}, abstract = {We study parameter estimation in Nonlinear Factor Analysis (NFA) where the generative model is parameterized by a deep neural network. Recent work has focused on learning such models using inference (or recognition) networks; we identify a crucial problem when modeling large, sparse, high-dimensional datasets – underfitting. We study the extent of underfitting, highlighting that its severity increases with the sparsity of the data. We propose methods to tackle it via iterative optimization inspired by stochastic variational inference (Hoffman et al., 2013) and improvements in the data representation used for inference. The proposed techniques drastically improve the ability of these powerful models to fit sparse data, achieving state-of-the-art results on a benchmark text-count dataset and excellent results on the task of top-N recommendation.}, bibtype = {inproceedings}, author = {Krishnan, Rahul and Liang, Dawen and Hoffman, Matthew}, booktitle = {Proceedings of the Twenty-First International Conference on Artificial Intelligence and Statistics} }
@inproceedings{ title = {Supervised autoencoders: Improving generalization performance with unsupervised regularizers}, type = {inproceedings}, year = {2018}, volume = {31}, websites = {https://proceedings.neurips.cc/paper/2018/hash/2a38a4a9316c49e5a833517c45d31070-Abstract.html}, publisher = {Curran Associates, Inc.}, id = {91fc85f3-b304-3a47-872e-eaae3fbd70a9}, created = {2022-03-28T09:45:05.102Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:08:24.735Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {leSupervisedAutoencodersImproving2018}, source_type = {inproceedings}, short_title = {Supervised autoencoders}, private_publication = {false}, bibtype = {inproceedings}, author = {Le, Lei and Patterson, Andrew and White, Martha}, booktitle = {Advances in Neural Information Processing Systems} }
@inproceedings{ title = {Learning a Hierarchical Latent-Variable Model of 3D Shapes}, type = {inproceedings}, year = {2018}, keywords = {3D shape learning,Data models,Image reconstruction,Probabilistic logic,Shape,Solid modeling,Task analysis,Three-dimensional displays,generative models,image reconstruction,variational inference}, pages = {542-551}, month = {9}, id = {c8366914-b796-3e42-8559-5b6311e4b9f9}, created = {2022-03-28T09:45:05.246Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:22:07.938Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {liuLearningHierarchicalLatentVariable2018}, source_type = {inproceedings}, notes = {ISSN: 2475-7888}, private_publication = {false}, abstract = {We propose the Variational Shape Learner (VSL), a generative model that learns the underlying structure of voxelized 3D shapes in an unsupervised fashion. Through the use of skip-connections, our model can successfully learn and infer a latent, hierarchical representation of objects. Furthermore, realistic 3D objects can be easily generated by sampling the VSL's latent probabilistic manifold. We show that our generative model can be trained end-to-end from 2D images to perform single image 3D model retrieval. Experiments show, both quantitatively and qualitatively, the improved generalization of our proposed model over a range of tasks, performing better or comparable to various state-of-the-art alternatives.}, bibtype = {inproceedings}, author = {Liu, Shikun and Giles, Lee and Ororbia, Alexander}, doi = {10.1109/3DV.2018.00068}, booktitle = {2018 International Conference on 3D Vision (3DV)} }
@inproceedings{ title = {High-Resolution Image Synthesis and Semantic Manipulation With Conditional GANs}, type = {inproceedings}, year = {2018}, pages = {8798-8807}, websites = {https://openaccess.thecvf.com/content_cvpr_2018/html/Wang_High-Resolution_Image_Synthesis_CVPR_2018_paper.html}, id = {dbfa019a-19c8-3a48-be5a-5561215dfa96}, created = {2022-03-28T09:45:05.339Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:21:51.955Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {wangHighResolutionImageSynthesis2018}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Wang, Ting-Chun and Liu, Ming-Yu and Zhu, Jun-Yan and Tao, Andrew and Kautz, Jan and Catanzaro, Bryan} }
@inproceedings{ title = {FoldingNet: Point Cloud Auto-Encoder via Deep Grid Deformation}, type = {inproceedings}, year = {2018}, pages = {206-215}, websites = {https://openaccess.thecvf.com/content_cvpr_2018/html/Yang_FoldingNet_Point_Cloud_CVPR_2018_paper.html}, id = {1f8c312e-2747-3446-ba1e-2b93eab2e562}, created = {2022-03-28T09:45:05.597Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:22:50.373Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {yangFoldingNetPointCloud2018}, source_type = {inproceedings}, short_title = {FoldingNet}, private_publication = {false}, bibtype = {inproceedings}, author = {Yang, Yaoqing and Feng, Chen and Shen, Yiru and Tian, Dong} }
@article{ title = {Understanding disentangling in $\beta$-VAE}, type = {article}, year = {2018}, keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Statistics - Machine Learning}, websites = {http://arxiv.org/abs/1804.03599}, month = {4}, id = {3bd8328d-ea8e-3f1c-a287-b23a81d1eefe}, created = {2022-03-28T09:45:05.665Z}, accessed = {2021-12-15}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:22:32.489Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {burgessUnderstandingDisentanglingBeta2018}, source_type = {article}, notes = {arXiv: 1804.03599}, private_publication = {false}, abstract = {We present new intuitions and theoretical assessments of the emergence of disentangled representation in variational autoencoders. Taking a rate-distortion theory perspective, we show the circumstances under which representations aligned with the underlying generative factors of variation of data emerge when optimising the modified ELBO bound in $\beta$-VAE, as training progresses. From these insights, we propose a modification to the training regime of $\beta$-VAE, that progressively increases the information capacity of the latent code during training. This modification facilitates the robust learning of disentangled representations in $\beta$-VAE, without the previous trade-off in reconstruction accuracy.}, bibtype = {article}, author = {Burgess, Christopher P and Higgins, Irina and Pal, Arka and Matthey, Loic and Watters, Nick and Desjardins, Guillaume and Lerchner, Alexander}, journal = {arXiv:1804.03599 [cs, stat]} }
@inproceedings{ title = {Neural Ordinary Differential Equations}, type = {inproceedings}, year = {2018}, volume = {31}, websites = {https://proceedings.neurips.cc/paper/2018/hash/69386f6bb1dfed68692a24c8686939b9-Abstract.html}, publisher = {Curran Associates, Inc.}, id = {55ba36da-f361-3bd7-b3ad-a2ed39803369}, created = {2022-03-28T09:45:05.681Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:22:46.205Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {chenNeuralOrdinaryDifferential2018}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Chen, Ricky T Q and Rubanova, Yulia and Bettencourt, Jesse and Duvenaud, David K}, booktitle = {Advances in Neural Information Processing Systems} }
@article{ title = {Attention U-Net: Learning Where to Look for the Pancreas}, type = {article}, year = {2018}, keywords = {Computer Science - Computer Vision and Pattern Rec}, websites = {http://arxiv.org/abs/1804.03999}, month = {5}, id = {7227242e-bfa6-343e-a301-794c63175863}, created = {2022-03-28T09:45:05.973Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:23:30.129Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {oktayAttentionUNetLearning2018}, source_type = {article}, short_title = {Attention U-Net}, notes = {arXiv: 1804.03999}, private_publication = {false}, abstract = {We propose a novel attention gate (AG) model for medical imaging that automatically learns to focus on target structures of varying shapes and sizes. Models trained with AGs implicitly learn to suppress irrelevant regions in an input image while highlighting salient features useful for a specific task. This enables us to eliminate the necessity of using explicit external tissue/organ localisation modules of cascaded convolutional neural networks (CNNs). AGs can be easily integrated into standard CNN architectures such as the U-Net model with minimal computational overhead while increasing the model sensitivity and prediction accuracy. The proposed Attention U-Net architecture is evaluated on two large CT abdominal datasets for multi-class image segmentation. Experimental results show that AGs consistently improve the prediction performance of U-Net across different datasets and training sizes while preserving computational efficiency. The code for the proposed architecture is publicly available.}, bibtype = {article}, author = {Oktay, Ozan and Schlemper, Jo and Folgoc, Loic Le and Lee, Matthew and Heinrich, Mattias and Misawa, Kazunari and Mori, Kensaku and McDonagh, Steven and Hammerla, Nils Y and Kainz, Bernhard and Glocker, Ben and Rueckert, Daniel}, journal = {arXiv:1804.03999 [cs]} }
@article{ title = {DeepLab: Semantic Image Segmentation with Deep Convolutional Nets, Atrous Convolution, and Fully Connected CRFs}, type = {article}, year = {2018}, keywords = {Computational modeling,Context,Convolution,Convolutional neural networks,Image resolution,Image segmentation,Neural networks,Semantics,atrous convolution,conditional random fields,semantic segmentation}, pages = {834-848}, volume = {40}, month = {4}, id = {b8d89de8-1798-3802-8ee7-262c654d7ce1}, created = {2022-03-28T09:45:06.185Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-01T09:16:13.873Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {chenDeepLabSemanticImage2018}, source_type = {article}, short_title = {DeepLab}, notes = {Conference Name: IEEE Transactions on Pattern Analysis and Machine Intelligence}, private_publication = {false}, abstract = {In this work we address the task of semantic image segmentation with Deep Learning and make three main contributions that are experimentally shown to have substantial practical merit. First, we highlight convolution with upsampled filters, or `atrous convolution', as a powerful tool in dense prediction tasks. Atrous convolution allows us to explicitly control the resolution at which feature responses are computed within Deep Convolutional Neural Networks. It also allows us to effectively enlarge the field of view of filters to incorporate larger context without increasing the number of parameters or the amount of computation. Second, we propose atrous spatial pyramid pooling (ASPP) to robustly segment objects at multiple scales. ASPP probes an incoming convolutional feature layer with filters at multiple sampling rates and effective fields-of-views, thus capturing objects as well as image context at multiple scales. Third, we improve the localization of object boundaries by combining methods from DCNNs and probabilistic graphical models. The commonly deployed combination of max-pooling and downsampling in DCNNs achieves invariance but has a toll on localization accuracy. We overcome this by combining the responses at the final DCNN layer with a fully connected Conditional Random Field (CRF), which is shown both qualitatively and quantitatively to improve localization performance. Our proposed “DeepLab” system sets the new state-of-art at the PASCAL VOC-2012 semantic image segmentation task, reaching 79.7 percent mIOU in the test set, and advances the results on three other datasets: PASCAL-Context, PASCAL-Person-Part, and Cityscapes. All of our code is made publicly available online.}, bibtype = {article}, author = {Chen, Liang-Chieh and Papandreou, George and Kokkinos, Iasonas and Murphy, Kevin and Yuille, Alan L}, doi = {10.1109/TPAMI.2017.2699184}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, number = {4} }
@misc{ title = {Variational Autoencoders: A Brief Survey}, type = {misc}, year = {2018}, websites = {https://www.semanticscholar.org/paper/Variational-Autoencoders%3A-A-Brief-Survey-Mittal-Behl/c1630a31e3aa24c9876aa956907a1ea86e9934f4}, id = {166c102d-9453-31a3-82f4-b2c96d0973ab}, created = {2022-03-28T09:45:06.480Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T07:59:32.370Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {mittalVariationalAutoencodersBrief2018}, source_type = {misc}, short_title = {Variational Autoencoders}, private_publication = {false}, abstract = {This work aims to develop a thorough understanding of variational autoencoders, look at some of the recent advances in VAEs, and highlight the drawbacks of VAEs, particularly in text generation. After the whopping success of deep neural networks in machine learning problems, deep generative modeling has come into the limelight. Generative modeling is the task of learning the underlying complex distribution which generated a given set of data. One of the popular approaches for generative modeling is the Variational Autoencoder (VAE) [8], which has received a lot of attention in the past few years following the success of neural networks. Variational Autoencoders are a class of deep generative models based on variational methods [3]. In this work, we aim to develop a thorough understanding of variational autoencoders, look at some of the recent advances in VAEs, and highlight the drawbacks of VAEs, particularly in text generation.}, bibtype = {misc}, author = {Mittal, Mayank and Behl, Harkirat Singh} }
@article{ title = {Motion removal for reliable RGB-D SLAM in dynamic environments}, type = {article}, year = {2018}, keywords = {Codebook model,Dynamic environments,Motion removal,RGB-D SLAM}, pages = {115-128}, volume = {108}, websites = {https://doi.org/10.1016/j.robot.2018.07.002}, publisher = {Elsevier B.V.}, id = {390afb21-36e9-318e-b4ee-071a70ee4d0c}, created = {2022-04-05T05:35:07.922Z}, file_attached = {false}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-05T05:35:07.922Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {RGB-D data-based Simultaneous Localization and Mapping (RGB-D SLAM) aims to concurrently estimate robot poses and reconstruct traversed environments using RGB-D sensors. Many effective and impressive RGB-D SLAM algorithms have been proposed over the past years. However, virtually all the RGB-D SLAM systems developed so far rely on the static-world assumption. This is because the SLAM performance is prone to be degraded by the moving objects in dynamic environments. In this paper, we propose a novel RGB-D data-based motion removal approach to address this problem. The approach is on-line and does not require prior-known moving-object information, such as semantics or visual appearances. We integrate the approach into the front end of an RGB-D SLAM system. It acts as a pre-processing stage to filter out data that are associated with moving objects. Experimental results demonstrate that our approach is able to improve RGB-D SLAM in various challenging scenarios.}, bibtype = {article}, author = {Sun, Yuxiang and Liu, Ming and Meng, Max Q.H.}, doi = {10.1016/j.robot.2018.07.002}, journal = {Robotics and Autonomous Systems} }
@article{ title = {SLAM-driven robotic mapping and registration of 3D point clouds}, type = {article}, year = {2018}, keywords = {Laser scanning,Mobile robot,Point cloud registration,Simultaneous Localization and Mapping (SLAM)}, pages = {38-48}, volume = {89}, month = {5}, publisher = {Elsevier}, day = {1}, id = {92da360e-5301-3113-8011-a6a337f88548}, created = {2022-06-06T05:48:37.953Z}, accessed = {2022-06-06}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-06-07T04:36:41.520Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {6075c81a-cded-4bc7-822e-6d5f3181ca0d}, private_publication = {false}, abstract = {With the rapid advancement of laser scanning and photogrammetry technologies, frequent geometric data collection at construction sites by contractors has been increased for the purpose of improving constructability, productivity, and onsite safety. However, the conventional static laser scanning method suffers from operational limitations due to the presence of many occlusions commonly found in a typical construction site. Obtaining a complete scan of a construction site without information loss requires that laser scans are obtained from multiple scanning locations around the site, which also necessitates extra work for registering each scanned point cloud. As an alternate solution to this problem, this paper introduces an autonomous mobile robot which navigates a scan site based on a continuously updated point cloud map. This mobile robot system utilizes the 2D Hector Simultaneous Localization and Mapping (SLAM) technique to estimate real-time positions and orientations of the robot in the x-y plane. Then, the 2D localization information is used to create 3D point clouds of unknown environments in real time to determine its navigation paths as a pre-scanning process. The advantage of this framework is the ability to determine the optimal scan position and scan angle to reduce the scanning time and effort for gathering high resolution point cloud data in real-time. The mobile robot system is able to capture survey-quality RGB-mapped point cloud data, and automatically register the scans for geometric reconstruction of the site. The performance of the overall system was tested in an indoor environment and validated with promising results.}, bibtype = {article}, author = {Kim, Pileun and Chen, Jingdao and Cho, Yong K.}, doi = {10.1016/J.AUTCON.2018.01.009}, journal = {Automation in Construction} }
@article{ title = {iBoW-LCD: An Appearance-based Loop Closure Detection Approach using Incremental Bags of Binary Words}, type = {article}, year = {2018}, id = {79c39406-991d-36df-bb98-4dc6c838a5b7}, created = {2022-06-23T14:41:35.532Z}, accessed = {2022-06-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-06-23T14:41:37.556Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {ed605795-f522-465c-a0b8-5f8a05f7fd5f}, private_publication = {false}, abstract = {In this paper, we introduce iBoW-LCD, a novel appearance-based loop closure detection method. The presented approach makes use of an incremental Bag-of-Words (BoW) scheme based on binary descriptors to retrieve previously seen similar images, avoiding any vocabulary training stage usually required by classic BoW models. In addition, to detect loop closures, iBoW-LCD builds on the concept of dynamic islands, a simple but effective mechanism to group similar images close in time, which reduces the computational times typically associated to Bayesian frameworks. Our approach is validated using several indoor and outdoor public datasets, taken under different environmental conditions, achieving a high accuracy and outperforming other state-of-the-art solutions.}, bibtype = {article}, author = {Garcia-Fidalgo, Emilio and Ortiz, Alberto}, doi = {10.1109/LRA.2018.2849609}, journal = {IEEE Robotics and Automation Letters} }
@article{ title = {Bi-Real Net: Enhancing the performance of 1-bit CNNs with improved representational capability and advanced training algorithm}, type = {article}, year = {2018}, pages = {747-763}, volume = {11219 LNCS}, id = {bc13cae3-8b4f-3c90-ac8d-085b330bc608}, created = {2022-07-05T12:32:33.753Z}, file_attached = {false}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-07-05T12:32:34.420Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {9972d981-f25e-4229-94fb-1c4fc6296c30}, private_publication = {false}, abstract = {In this work, we study the 1-bit convolutional neural networks (CNNs), of which both the weights and activations are binary. While being efficient, the classification accuracy of the current 1-bit CNNs is much worse compared to their counterpart real-valued CNN models on the large-scale dataset, like ImageNet. To minimize the performance gap between the 1-bit and real-valued CNN models, we propose a novel model, dubbed Bi-Real net, which connects the real activations (after the 1-bit convolution and/or BatchNorm layer, before the sign function) to activations of the consecutive block, through an identity shortcut. Consequently, compared to the standard 1-bit CNN, the representational capability of the Bi-Real net is significantly enhanced and the additional cost on computation is negligible. Moreover, we develop a specific training algorithm including three technical novelties for 1-bit CNNs. Firstly, we derive a tight approximation to the derivative of the non-differentiable sign function with respect to activation. Secondly, we propose a magnitude-aware gradient with respect to the weight for updating the weight parameters. Thirdly, we pre-train the real-valued CNN model with a clip function, rather than the ReLU function, to better initialize the Bi-Real net. Experiments on ImageNet show that the Bi-Real net with the proposed training algorithm achieves 56.4% and 62.2% top-1 accuracy with 18 layers and 34 layers, respectively. Compared to the state-of-the-arts (e.g., XNOR Net), Bi-Real net achieves up to 10% higher top-1 accuracy with more memory saving and lower computational cost.}, bibtype = {article}, author = {Liu, Zechun and Wu, Baoyuan and Luo, Wenhan and Yang, Xin and Liu, Wei and Cheng, Kwang Ting}, doi = {10.1007/978-3-030-01267-0_44}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {PPF-FoldNet: Unsupervised Learning of Rotation Invariant 3D Local Descriptors}, type = {article}, year = {2018}, keywords = {3d deep learning,descriptors,local features,rotation invariance}, pages = {1-2}, id = {c3da6018-3352-321e-9edf-46a0641df2d0}, created = {2022-08-18T10:53:48.977Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:54:12.491Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {We present PPF-FoldNet for unsupervised learning of 3D local descriptors on pure point cloud geometry. Based on the folding-based auto-encoding of well known point pair features, PPF-FoldNet offers many desirable properties: it necessitates neither supervision, nor a sensitive local reference frame, benefits from point-set sparsity, is end-to-end, fast, and can extract powerful rotation invariant descriptors. Thanks to a novel feature visualization, its evolution can be monitored to provide interpretable insights. Our extensive experiments demonstrate that despite having six degree-of-freedom invariance and lack of training labels, our network achieves state of the art results in standard benchmark datasets and outperforms its competitors when rotations and varying point densities are present. PPF-FoldNet achieves 9% higher recall on standard benchmarks, 23% higher recall when rotations are introduced into the same datasets and finally, a margin of > 35% is attained when point density is significantly decreased.}, bibtype = {article}, author = {Deng, Haowen and Birdal, Tolga and Ilic, Slobodan}, journal = {ECCV} }
@article{ title = {Learning representations and generative models for 3D point clouds}, type = {article}, year = {2018}, id = {22933794-fe10-32fe-b9b1-3122ba69d545}, created = {2022-09-08T17:25:32.202Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-09T14:10:26.272Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,103fae48-b63f-495b-9265-9049d2927097}, private_publication = {false}, abstract = {Three-dimensional geometric data offer an excellent domain for studying representation learning and generative modeling. In this paper, we look at geometric data represented as point clouds. We introduce a deep autoencoder (AE) network with state-of-the-art reconstruction quality and generalization ability. The learned representations outperform existing methods on 3D recognition tasks and enable basic shape editing via simple algebraic manipulations, such as semantic part editing, shape analogies and shape interpolation. We perform a thorough study of different generative models including: GANs operating on the raw point clouds, significantly improved GANs trained in the fixed latent space of our AEs and, Gaussian mixture models (GMM). For our quantitative evaluation we propose measures of sample fidelity and diversity based on matchings between sets of point clouds. Interestingly, our careful evaluation of generalization, fidelity and diversity reveals that GMMs trained in the latent space of our AEs produce the best results.}, bibtype = {article}, author = {Achlioptas, Panos and Diamanti, Olga and Mitliagkas, Ioannis and Guibas, Leonidas}, journal = {6th International Conference on Learning Representations, ICLR 2018 - Workshop Track Proceedings} }
@article{ title = {Which training methods for GANs do actually converge?}, type = {article}, year = {2018}, pages = {5589-5626}, volume = {8}, id = {54368713-bad8-3e80-89ca-9c73461e833a}, created = {2022-09-08T17:25:32.203Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-21T09:29:25.406Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,103fae48-b63f-495b-9265-9049d2927097}, private_publication = {false}, abstract = {Recent work has shown local convergence of GAN training for absolutely continuous data and generator distributions. In this paper, we show that the requirement of absolute continuity is necessary: we describe a simple yet prototypical counterexample showing that in the more realistic case of distributions that are not absolutely continuous, unregularized GAN training is not always convergent. Furthermore, we discuss regularization strategies that were recently proposed to stabilize GAN training. Our analysis shows that GAN training with instance noise or zero-centered gradient penalties converges. On the other hand, we show that Wasserstein-GANs and WGAN-GP with a finite number of discriminator updates per generator update do not always converge to the equilibrium point. We discuss these results, leading us to a new explanation for the stability problems of GAN training. Based on our analysis, we extend our convergence results to more general GANs and prove local convergence for simplified gradient penalties even if the generator and data distributions lie on lower dimensional manifolds. We find these penalties to work well in practice and use them to learn high-resolution generative image models for a variety of datasets with little hyperparameter tuning.}, bibtype = {article}, author = {Mescheder, Lars and Geiger, Andreas and Nowozin, Sebastian}, journal = {35th International Conference on Machine Learning, ICML 2018} }
@misc{ title = {Point Cloud GAN}, type = {misc}, year = {2018}, id = {06015396-8661-32cd-9f02-8167c2696392}, created = {2022-09-08T17:25:32.208Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-09T14:10:26.275Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,103fae48-b63f-495b-9265-9049d2927097}, private_publication = {false}, bibtype = {misc}, author = {Li, Chun-Liang and Zhang, Yang and Póczos, Barnabás and Salakhutdinov, Ruslan} }
@article{ title = {Weakly supervised 3d reconstruction with adversarial constraint}, type = {article}, year = {2018}, keywords = {3D-reconstruction,GAN,computer-vision,weak-supervision}, pages = {263-272}, id = {2ea06f89-6665-3cb6-bc75-fb5979db7e57}, created = {2022-09-08T17:25:32.450Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T17:25:59.232Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,103fae48-b63f-495b-9265-9049d2927097}, private_publication = {false}, abstract = {Supervised 3D reconstruction has witnessed a significant progress through the use of deep neural networks. However, this increase in performance requires large scale annotations of 2D/3D data. In this paper, we explore inexpensive 2D supervision as an alternative for expensive 3D CAD annotation. Specifically, we use foreground masks as weak supervision through a raytrace pooling layer that enables perspective projection and backpropagation. Additionally, since the 3D reconstruction from masks is an ill posed problem, we propose to constrain the 3D reconstruction to the manifold of unlabeled realistic 3D shapes that match mask observations. We demonstrate that learning a log-barrier solution to this constrained optimization problem resembles the GAN objective, enabling the use of existing tools for training GANs. We evaluate and analyze the manifold constrained reconstruction on various datasets for single and multi-view reconstruction of both synthetic and real images.}, bibtype = {article}, author = {Gwak, Junyoung and Choy, Christopher B. and Chandraker, Manmohan and Garg, Animesh and Savarese, Silvio}, doi = {10.1109/3DV.2017.00038}, journal = {Proceedings - 2017 International Conference on 3D Vision, 3DV 2017} }
@article{ title = {Progressive growing of GANs for improved quality, stability, and variation}, type = {article}, year = {2018}, pages = {1-26}, id = {2bace964-b3a2-3776-a3d4-335bc42f73d4}, created = {2022-09-21T09:29:20.054Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-21T09:29:45.681Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,103fae48-b63f-495b-9265-9049d2927097}, private_publication = {false}, abstract = {We describe a new training methodology for generative adversarial networks. The key idea is to grow both the generator and discriminator progressively: starting from a low resolution, we add new layers that model increasingly fine details as training progresses. This both speeds the training up and greatly stabilizes it, allowing us to produce images of unprecedented quality, e.g., CelebA images at 1024². We also propose a simple way to increase the variation in generated images, and achieve a record inception score of 8.80 in unsupervised CIFAR10. Additionally, we describe several implementation details that are important for discouraging unhealthy competition between the generator and discriminator. Finally, we suggest a new metric for evaluating GAN results, both in terms of image quality and variation. As an additional contribution, we construct a higher-quality version of the CelebA dataset.}, bibtype = {article}, author = {Karras, Tero and Aila, Timo and Laine, Samuli and Lehtinen, Jaakko}, journal = {6th International Conference on Learning Representations, ICLR 2018 - Conference Track Proceedings} }
@article{ title = {Learning efficient point cloud generation for dense 3D object reconstruction}, type = {article}, year = {2018}, keywords = {Vision Track}, pages = {7114-7121}, id = {b30643a6-5e57-3f35-9669-6bc94a70e199}, created = {2022-09-21T09:29:20.055Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-23T09:53:43.658Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {255910b9-b737-4c31-858e-6de1dca0cdb9,b6d75013-efe2-4ddc-b3db-65496bd4db9f,103fae48-b63f-495b-9265-9049d2927097}, private_publication = {false}, abstract = {Conventional methods of 3D object generative modeling learn volumetric predictions using deep networks with 3D convolutional operations, which are direct analogies to classical 2D ones. However, these methods are computationally wasteful in attempt to predict 3D shapes, where information is rich only on the surfaces. In this paper, we propose a novel 3D generative modeling framework to efficiently generate object shapes in the form of dense point clouds. We use 2D convolutional operations to predict the 3D structure from multiple viewpoints and jointly apply geometric reasoning with 2D projection optimization. We introduce the pseudo-renderer, a differentiable module to approximate the true rendering operation, to synthesize novel depth maps for optimization. Experimental results for single-image 3D object reconstruction tasks show that our method outperforms state-of-the-art methods in terms of shape similarity and prediction density.}, bibtype = {article}, author = {Lin, Chen Hsuan and Kong, Chen and Lucey, Simon}, doi = {10.1609/aaai.v32i1.12278}, journal = {32nd AAAI Conference on Artificial Intelligence, AAAI 2018} }
@article{ title = {GAL: Geometric adversarial loss for single-view 3D-object reconstruction}, type = {article}, year = {2018}, keywords = {3D Neural network,3D Reconstruction,Adversarial loss,Geometric consistency,Point cloud}, pages = {820-834}, volume = {11212 LNCS}, id = {8424444d-2bae-3abe-9281-aec82d146c37}, created = {2022-09-21T09:29:20.059Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-21T09:29:48.293Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,103fae48-b63f-495b-9265-9049d2927097}, private_publication = {false}, abstract = {In this paper, we present a framework for reconstructing a point-based 3D model of an object from a single-view image. We found distance metrics, like Chamfer distance, were used in previous work to measure the difference of two point sets and serve as the loss function in point-based reconstruction. However, such point-point loss does not constrain the 3D model from a global perspective. We propose adding geometric adversarial loss (GAL). It is composed of two terms where the geometric loss ensures consistent shape of reconstructed 3D models close to ground-truth from different viewpoints, and the conditional adversarial loss generates a semantically-meaningful point cloud. GAL benefits predicting the obscured part of objects and maintaining geometric structure of the predicted 3D model. Both the qualitative results and quantitative analysis manifest the generality and suitability of our method.}, bibtype = {article}, author = {Jiang, Li and Shi, Shaoshuai and Qi, Xiaojuan and Jia, Jiaya}, doi = {10.1007/978-3-030-01237-3_49}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {Efficient dense point cloud object reconstruction using deformation vector fields}, type = {article}, year = {2018}, keywords = {3D object reconstruction,Deep learning,Dense point clouds}, pages = {508-524}, volume = {11216 LNCS}, id = {9add8e26-9dce-3126-8df6-e76e27373ed4}, created = {2022-10-03T13:31:09.954Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-10-03T13:31:23.115Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e07bebd1-ae76-40ed-b298-edc5ed896e0b}, private_publication = {false}, abstract = {Some existing CNN-based methods for single-view 3D object reconstruction represent a 3D object as either a 3D voxel occupancy grid or multiple depth-mask image pairs. However, these representations are inefficient since empty voxels or background pixels are wasteful. We propose a novel approach that addresses this limitation by replacing masks with “deformation-fields”. Given a single image at an arbitrary viewpoint, a CNN predicts multiple surfaces, each in a canonical location relative to the object. Each surface comprises a depth-map and corresponding deformation-field that ensures every pixel-depth pair in the depth-map lies on the object surface. These surfaces are then fused to form the full 3D shape. During training we use a combination of per-view loss and multi-view losses. The novel multi-view loss encourages the 3D points back-projected from a particular view to be consistent across views. Extensive experiments demonstrate the efficiency and efficacy of our method on single-view 3D object reconstruction.}, bibtype = {article}, author = {Li, Kejie and Pham, Trung and Zhan, Huangying and Reid, Ian}, doi = {10.1007/978-3-030-01258-8_31}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {3D steerable CNNs: Learning rotationally equivariant features in volumetric data}, type = {article}, year = {2018}, pages = {10381-10392}, volume = {2018-Decem}, id = {5c3965e5-7738-3e09-8dfb-5a9765982a58}, created = {2023-04-24T07:38:01.471Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-04-24T15:41:54.823Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Weiler2018}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143,f4d8f1ef-fdcb-4a5b-a626-6e2fea47fb6d}, private_publication = {false}, abstract = {We present a convolutional network that is equivariant to rigid body motions. The model uses scalar-, vector-, and tensor fields over 3D Euclidean space to represent data, and equivariant convolutions to map between such representations. These SE(3)-equivariant convolutions utilize kernels which are parameterized as a linear combination of a complete steerable kernel basis, which is derived analytically in this paper. We prove that equivariant convolutions are the most general equivariant linear maps between fields over R3. Our experimental results confirm the effectiveness of 3D Steerable CNNs for the problem of amino acid propensity prediction and protein structure classification, both of which have inherent SE(3) symmetry.}, bibtype = {article}, author = {Weiler, Maurice and Geiger, Mario and Welling, Max and Boomsma, Wouter and Cohen, Taco}, journal = {Advances in Neural Information Processing Systems}, number = {NeurIPS} }
@article{ title = {Collision Avoidance Using Spherical Harmonics}, type = {article}, year = {2018}, keywords = {collision avoidance,path planning,spherical harmonics}, id = {66cbaed1-ba22-3a00-a0e1-918be5008c96}, created = {2023-05-03T13:16:39.683Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.713Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Patrick2021}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, bibtype = {article}, author = {Patrick, Steven D and Bakolas, Efstathios}, number = {1998} }
@article{ title = {Estimation of rotation parameters of three-dimensional images by spherical harmonics analysis}, type = {article}, year = {2018}, keywords = {Estimation of parameters,Pointcloud,Rotation of three-dimensional objects,Spherical harmonics}, pages = {570-576}, volume = {16}, id = {415aafed-b173-3e78-8b36-287ffec840b3}, created = {2023-05-03T13:16:40.171Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-15T08:10:14.543Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Rozhentsov2018}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {The article describes a method for estimating the rotational parameters of three-dimensional objects defined as a cloud of points in three-dimensional space, which is less complex compared to other methods and it can ensure a single-valued solution. The authors propose an approach of vector-field models to parametrize images of complex three-dimensional objects. The paper discusses the ways for calculating the expansion coefficients in the basis of spherical harmonics for images of three-dimensional pointcloud objects. The authors offer an approach that provides the possibility of estimating the rotation parameters of three-dimensional objects from the values of the expansion coefficients in the basis of spherical harmonics.}, bibtype = {article}, author = {Rozhentsov, Alexey and Egoshina, Irina and Baev, Alexey and Chernishov, Daniil}, doi = {10.5937/jaes16-18157}, journal = {Journal of Applied Engineering Science}, number = {4} }
@article{ title = {Spherical Harmonics Decomposition in inverse acoustic methods involving spherical arrays}, type = {article}, year = {2018}, keywords = {Acoustic measurements,Inverse methods,Microphone arrays,Noise source localization,Spherical Harmonics Decomposition}, pages = {425-460}, volume = {433}, websites = {https://doi.org/10.1016/j.jsv.2018.05.001}, publisher = {Elsevier Ltd}, id = {7c32468a-f9eb-33fe-9d65-f84557701eb7}, created = {2023-05-03T13:16:40.328Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:25.708Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Battista2018}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Inverse methods for acoustic source mapping have gathered the attention of the beamforming community during the last years. Indeed, they provide higher accuracy in both source localization and strength estimation with respect to Conventional Beamforming (CB). One of the main drawbacks of the current formulations is the need of a regularization strategy for tackling the ill-posedness of the problem. Very often, Tikhonov regularization is exploited to face this issue, but different methods for estimating the regularization factor associated to the Tikhonov formulation may lead to different regularization levels and, therefore, to different results. This paper presents a way to face this problem when dealing with spherical arrays. The new approach proposed by the authors exploits Spherical Harmonics Decomposition (SHD) of complex pressure data at microphone locations. SHD performs a spatial filtering that reduces the effect of noise and causes an intrinsic stabilization of the numerical problem associated to the inverse problem formulation. When the source-receiver propagation model is appropriate to describe the acoustic environment in which the test takes place and noise is not spoiling excessively measurement data, the SHD approach is sufficient to obtain a regularized solution. If these conditions are not satisfied, SHD can be exploited as a pre-processing step in a twofold procedure also involving classical Tikhonov regularization. In this paper the SHD approach is tested in the Generalized Inverse Beamforming (GIBF) formulation. Classical Tikhonov approach, in which the regularization factor is estimated using the Generalized Cross-Validation (GCV), L-curve functions and Bayesian regularization, is presented as a way to enhance data processed by SHD. A sensitivity analysis of the approach to measurement noise and source-receiver relative positions is presented on simulated data. Results on experimental data are presented and discussed for both a simplified test case and an application to a real car cabin.}, bibtype = {article}, author = {Battista, G. and Chiariotti, P. and Castellini, P.}, doi = {10.1016/j.jsv.2018.05.001}, journal = {Journal of Sound and Vibration} }
@article{ title = {Semi-analytical models of non-spherical particle shapes using optimised spherical harmonics}, type = {article}, year = {2018}, keywords = {Mathematical modelling,Non-spherical particles,Optimisation,Spherical harmonics,Surface segmentation}, pages = {376-394}, volume = {137}, id = {f9eddbaa-bd5a-3ab9-826d-db8ccfd34845}, created = {2023-05-03T13:16:40.451Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:25.633Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Radvilaite2018}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Determining particle shape is vital for many industrial processes such as those found in the pharmaceutical, agricultural, and bioenergy industries. With modelling being an essential tool to acquire an understanding of the behaviour of particulates in industrial processes, numerical methods such as DEM are needing numerical solutions to formulate and implement particle shape models that overcome current limitations. Whereas pharmaceutical particles have a regular shape, agricultural and biomass particles often are specific, irregular and non-analytic. Because the diversity of real shapes is enormous, a variety of methods for describing particle shapes currently exist. Recently, the series of spherical harmonics (SHs) has gained much interest through their application in many other fields. This paper focuses on the application of the semi-analytical SH technique and addresses the development of a universal modelling tool for describing different particle shapes using a finite number of SHs. The results obtained from modelling pharmaceutical, agricultural, and biomass particles prove the applicability of SHs to regular as well as irregular shapes. In this regard, their optimised description by minimising the number of non-zero expansion coefficients is demonstrated. To proceed with a smaller number of low-order SHs, surface segmentation is introduced. Sufficient accuracy in the shape description of the particles selected was achieved with less than 16 SHs.}, bibtype = {article}, author = {Radvilaitė, Urtė and Ramírez-Gómez, Álvaro and Rusakevičius, Dainius and Kačianauskas, Rimantas}, doi = {10.1016/j.cherd.2018.07.031}, journal = {Chemical Engineering Research and Design}, number = {2013} }
@article{ title = {Spherical harmonics entropy for optimal 3D modeling}, type = {article}, year = {2018}, keywords = {3d images,entropy,multiresolution,spherical harmonics}, websites = {http://arxiv.org/abs/1805.08084}, id = {9cb36d25-df25-343a-a2c9-418a7bfe30df}, created = {2023-05-03T13:16:40.745Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-15T08:10:14.536Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Jallouli2018}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {3D image processing constitutes nowadays a challenging topic in many scientific fields such as medicine, computational physics and informatics. Therefore, the development of suitable tools that guarantee the best treatment is a necessity. Spherical shapes are a large class of 3D images whose processing necessitates adaptable tools. This encourages researchers to develop spherical wavelets and spherical harmonics as special mathematical bases suited to 3D spherical shapes. The present work lies within the broad topic of 3D image processing with the special spherical harmonics bases. A spherical harmonics based approach is proposed for the reconstruction of images, provided with a spherical harmonics Shannon-type entropy to evaluate the order/disorder of the reconstructed image. Efficiency and accuracy of the approach is demonstrated by a simulation study on several spherical models.}, bibtype = {article}, author = {Jallouli, Malika and Khalifa, Wafa Bel Hadj and Mabrouk, Anouar Ben and Mahjoub, Mohamed Ali}, number = {May 2018} }
@article{ title = {FPGA-based high-performance embedded systems for adaptive edge computing in cyber-physical systems: The ARTICo3 framework}, type = {article}, year = {2018}, keywords = {Cyber-physical systems,Dynamic and partial reconfiguration,Edge computing,Energy efficiency,FPGAs,Fault tolerance}, volume = {18}, month = {6}, publisher = {MDPI AG}, day = {8}, id = {53fccb55-92e9-3eb2-ad6d-33c3ee319972}, created = {2023-11-07T09:59:07.868Z}, file_attached = {true}, profile_id = {78e67dcc-28e6-3300-a4ed-85434b13f01f}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-12-06T13:13:33.547Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {1bffc8fa-4e6e-48c8-b694-323af7fbd0e8}, private_publication = {false}, abstract = {Cyber-Physical Systems are experiencing a paradigm shift in which processing has been relocated to the distributed sensing layer and is no longer performed in a centralized manner. This approach, usually referred to as Edge Computing, demands the use of hardware platforms that are able to manage the steadily increasing requirements in computing performance, while keeping energy efficiency and the adaptability imposed by the interaction with the physical world. In this context, SRAM-based FPGAs and their inherent run-time reconfigurability, when coupled with smart power management strategies, are a suitable solution. However, they usually fail in user accessibility and ease of development. In this paper, an integrated framework to develop FPGA-based high-performance embedded systems for Edge Computing in Cyber-Physical Systems is presented. This framework provides a hardware-based processing architecture, an automated toolchain, and a runtime to transparently generate and manage reconfigurable systems from high-level system descriptions without additional user intervention. Moreover, it provides users with support for dynamically adapting the available computing resources to switch the working point of the architecture in a solution space defined by computing performance, energy consumption and fault tolerance. Results show that it is indeed possible to explore this solution space at run time and prove that the proposed framework is a competitive alternative to software-based edge computing platforms, being able to provide not only faster solutions, but also higher energy efficiency for computing-intensive algorithms with significant levels of data-level parallelism.}, bibtype = {article}, author = {Rodríguez, Alfonso and Valverde, Juan and Portilla, Jorge and Otero, Andrés and Riesgo, Teresa and De La Torre, Eduardo}, doi = {10.3390/s18061877}, journal = {Sensors (Switzerland)}, number = {6} }
@inproceedings{ title = {Understanding Performance Differences of FPGAs and GPUs}, type = {inproceedings}, year = {2018}, keywords = {Analytical model,FPGA,GPU,Performance comparison}, pages = {93-96}, month = {9}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, day = {7}, id = {00101d6e-4164-397a-a01f-35d6a399c57f}, created = {2023-11-07T10:04:25.472Z}, file_attached = {true}, profile_id = {78e67dcc-28e6-3300-a4ed-85434b13f01f}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-12-06T13:16:08.507Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {1bffc8fa-4e6e-48c8-b694-323af7fbd0e8}, private_publication = {false}, abstract = {This paper aims to better understand the performance differences between FPGAs and GPUs. We intentionally begin with a widely used GPU-friendly benchmark suite, Rodinia, and port 15 of the kernels onto FPGAs using HLS C. Then we propose an analytical model to compare their performance. We find that for 6 out of the 15 ported kernels, today's FPGAs can provide comparable performance or even achieve better performance than the GPU, while consuming an average of 28% of the GPU power. Besides lower clock frequency, FPGAs usually achieve a higher number of operations per cycle in each customized deep pipeline, but lower effective parallel factor due to the far lower off-chip memory bandwidth. With 4x more memory bandwidth, 8 out of the 15 FPGA kernels are projected to achieve at least half of the GPU kernel performance.}, bibtype = {inproceedings}, author = {Cong, Jason and Fang, Zhenman and Lo, Michael and Wang, Hanrui and Xu, Jingxian and Zhang, Shaochong}, doi = {10.1109/FCCM.2018.00023}, booktitle = {Proceedings - 26th IEEE International Symposium on Field-Programmable Custom Computing Machines, FCCM 2018} }
@article{ title = {Robust Estimation of Object Dimensions and External Defect Detection with a Low-Cost Sensor}, type = {article}, year = {2017}, keywords = {3D reconstruction,Defect detection,Depth camera,Point cloud,Volume measurement}, volume = {36}, id = {14bd3f67-b507-3763-af02-1a1f04ddb31b}, created = {2020-09-14T08:14:53.697Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-10-27T07:13:11.949Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {591145a4-49d3-4baf-a2cc-a1f3832f0e3e}, private_publication = {false}, abstract = {The measurement of object dimensions as well as the detection and localization of external defects are of large importance for many sectors in industry including agriculture, transportation and production. In this paper we investigate the feasibility of using commercial depth-sensing devices, based on a time-of-flight technology, such as the Kinect v2 camera, for the measurement and inspection of cuboidal objects (boxes). This paper presents a simplified system using only one Kinect sensor. At the beginning, object dimensions are roughly estimated by discovering the best-fit planes for a cloud of point based on a modified version of RANSAC (RANdom Sample Consensus). The precise geometry and morphology of the objects are then achieved by a transformation from depth to RGB representation of the points estimated as belonging to the object. RGB representation is finally processed (using scanlines on the RGB plane perpendicular to the initial edge estimate) to approximate at best the contour of the bounding box. In addition to the above, the paper proposes a method to automatically highlight defects on the objects’ surfaces: this inspection task is performed through the analysis of both the 2D object contours and the histogram of the normalized depth values. The proposed methodology takes a few seconds to deliver the results for the monitored object and, it experienced encouraging results in terms of accuracy. Indeed, the system measured the dimensions of a set of cuboidal objects with an average error of 5 mm and it was able to identify and locate defects and holes on lateral and topmost surfaces. The experimental outcomes pointed out that the system could be effectively exploited within industrial inspection applications, even more so if the low cost of the system is taken under consideration.}, bibtype = {article}, author = {Leo, Marco and Natale, Anna and Del-Coco, Marco and Carcagnì, Pierluigi and Distante, Cosimo}, doi = {10.1007/s10921-017-0395-7}, journal = {Journal of Nondestructive Evaluation}, number = {1} }
@article{ title = {Deep Learning for Confidence Information in Stereo and ToF Data Fusion}, type = {article}, year = {2017}, pages = {697-705}, volume = {2018-Janua}, id = {c5e00d1c-d1a2-363e-8473-1520a1b8f4de}, created = {2020-09-14T10:49:26.373Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-10-20T09:48:08.170Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c,6533efe4-7189-42a2-b4b6-a9f175595b19}, private_publication = {false}, abstract = {This paper proposes a novel framework for the fusion of depth data produced by a Time-of-Flight (ToF) camera and a stereo vision system. The key problem of balancing between the two sources of information is solved by extracting confidence maps for both sources using deep learning. We introduce a novel synthetic dataset accurately representing the data acquired by the proposed setup and use it to train a Convolutional Neural Network architecture. The machine learning framework estimates the reliability of both data sources at each pixel location. The two depth fields are finally fused enforcing the local consistency of depth data taking into account the confidence information. Experimental results show that the proposed approach increases the accuracy of the depth estimation.}, bibtype = {article}, author = {Agresti, Gianluca and Minto, Ludovico and Marin, Giulio and Zanuttigh, Pietro}, doi = {10.1109/ICCVW.2017.88}, journal = {Proceedings - 2017 IEEE International Conference on Computer Vision Workshops, ICCVW 2017} }
@article{ title = {Geometric Deep Learning: Going beyond Euclidean data}, type = {article}, year = {2017}, pages = {18-42}, volume = {34}, id = {d177304e-9d5c-323c-bf97-17356b8c44a6}, created = {2020-09-17T08:36:01.288Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-25T14:53:38.749Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f,16688d52-1757-4ef4-badb-f53b700252a9,07e07de9-bcac-4934-a82b-d0aff540e56d}, private_publication = {false}, abstract = {Many scientific fields study data with an underlying structure that is non-Euclidean. Some examples include social networks in computational social sciences, sensor networks in communications, functional networks in brain imaging, regulatory networks in genetics, and meshed surfaces in computer graphics. In many applications, such geometric data are large and complex (in the case of social networks, on the scale of billions) and are natural targets for machine-learning techniques. In particular, we would like to use deep neural networks, which have recently proven to be powerful tools for a broad range of problems from computer vision, natural-language processing, and audio analysis. However, these tools have been most successful on data with an underlying Euclidean or grid-like structure and in cases where the invariances of these structures are built into networks used to model them.}, bibtype = {article}, author = {Bronstein, Michael M. and Bruna, Joan and Lecun, Yann and Szlam, Arthur and Vandergheynst, Pierre}, doi = {10.1109/MSP.2017.2693418}, journal = {IEEE Signal Processing Magazine}, number = {4} }
@article{ title = {Robust Estimation of Object Dimensions and External Defect Detection with a Low-Cost Sensor}, type = {article}, year = {2017}, keywords = {3D reconstruction,Defect detection,Depth camera,Point cloud,Volume measurement}, pages = {1-16}, volume = {36}, publisher = {Springer US}, id = {3bdd5f13-1000-3c82-b78e-a58df42d2a81}, created = {2020-10-05T10:47:09.263Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-25T14:53:36.127Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f,16688d52-1757-4ef4-badb-f53b700252a9}, private_publication = {false}, abstract = {The measurement of object dimensions as well as the detection and localization of external defects are of large importance for many sectors in industry including agriculture, transportation and production. In this paper we investigate the feasibility of using commercial depth-sensing devices, based on a time-of-flight technology, such as the Kinect v2 camera, for the measurement and inspection of cuboidal objects (boxes). This paper presents a simplified system using only one Kinect sensor. At the beginning, object dimensions are roughly estimated by discovering the best-fit planes for a cloud of point based on a modified version of RANSAC (RANdom Sample Consensus). The precise geometry and morphology of the objects are then achieved by a transformation from depth to RGB representation of the points estimated as belonging to the object. RGB representation is finally processed (using scanlines on the RGB plane perpendicular to the initial edge estimate) to approximate at best the contour of the bounding box. In addition to the above, the paper proposes a method to automatically highlight defects on the objects’ surfaces: this inspection task is performed through the analysis of both the 2D object contours and the histogram of the normalized depth values. The proposed methodology takes a few seconds to deliver the results for the monitored object and, it experienced encouraging results in terms of accuracy. Indeed, the system measured the dimensions of a set of cuboidal objects with an average error of 5 mm and it was able to identify and locate defects and holes on lateral and topmost surfaces. The experimental outcomes pointed out that the system could be effectively exploited within industrial inspection applications, even more so if the low cost of the system is taken under consideration.}, bibtype = {article}, author = {Leo, Marco and Natale, Anna and Del-Coco, Marco and Carcagnì, Pierluigi and Distante, Cosimo}, doi = {10.1007/s10921-017-0395-7}, journal = {Journal of Nondestructive Evaluation}, number = {1} }
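The box-measurement pipeline summarized in the abstract above starts from RANSAC plane fitting on the depth point cloud. As a rough illustration of that first step only (not the authors' modified RANSAC; the iteration count, inlier threshold and function name are arbitrary choices), a plain-vanilla RANSAC plane fit over an N x 3 point array could look like this:

import numpy as np

def ransac_plane(points, n_iters=500, inlier_thresh=0.005, rng=None):
    """Fit a plane (unit normal n, offset d with n.p + d = 0) to an (N, 3) array
    by RANSAC: fit a plane to 3 random points, count inliers within
    `inlier_thresh`, and keep the candidate with the largest inlier set."""
    rng = np.random.default_rng(rng)
    best_inliers = np.zeros(len(points), dtype=bool)
    best_model = None
    for _ in range(n_iters):
        sample = points[rng.choice(len(points), size=3, replace=False)]
        normal = np.cross(sample[1] - sample[0], sample[2] - sample[0])
        norm = np.linalg.norm(normal)
        if norm < 1e-12:          # degenerate (collinear) sample, skip it
            continue
        normal /= norm
        d = -normal @ sample[0]
        dist = np.abs(points @ normal + d)   # point-to-plane distances
        inliers = dist < inlier_thresh
        if inliers.sum() > best_inliers.sum():
            best_inliers, best_model = inliers, (normal, d)
    return best_model, best_inliers

The dominant planes found this way would then be intersected to estimate the box edges; the published method additionally refines the contour in the registered RGB image.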
@article{ title = {A Review of Point Clouds Segmentation and Classification Algorithms}, type = {article}, year = {2017}, keywords = {classification,laser scanning,photogrammetry,point clouds,segmentation}, pages = {1-3}, volume = {XLII-2/W3}, id = {8ce9ffc3-0f37-365b-ae6f-59aa7ae5e868}, created = {2020-10-20T09:48:06.311Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T08:00:50.533Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {dc009c1c-5c21-43bd-9c8a-d37db3983b2e,a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, bibtype = {article}, author = {Grilli, E and Menna, F and Remondino, F}, doi = {10.5194/isprs-archives-XLII-2-W3-339-2017}, journal = {The International Archives of the Photogrammetry, Remote Sensing and Spatial Information Sciences}, number = {March} }
@article{ title = {DeepToF: Off-the-shelf real-time correction of multipath interference in time-of-flight imaging}, type = {article}, year = {2017}, keywords = {Depth cameras,Learning,Multipath interference,Time-of-Flight}, volume = {36}, id = {6cd34b5e-226e-3f3c-8b15-c05dbe9ca1c9}, created = {2020-11-05T08:16:34.366Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-10T07:17:52.327Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47,019ca26f-d15a-40ee-aa8d-7a0fbf949316}, private_publication = {false}, abstract = {Time-of-flight (ToF) imaging has become a widespread technique for depth estimation, allowing affordable off-the-shelf cameras to provide depth maps in real time. However, multipath interference (MPI) resulting from indirect illumination significantly degrades the captured depth. Most previous works have tried to solve this problem by means of complex hardware modifications or costly computations. In this work, we avoid these approaches and propose a new technique to correct errors in depth caused by MPI, which requires no camera modifications and takes just 10 milliseconds per frame. Our observations about the nature of MPI suggest that most of its information is available in image space; this allows us to formulate the depth imaging process as a spatially-varying convolution and use a convolutional neural network to correct MPI errors. Since the input and output data present similar structure, we base our network on an autoencoder, which we train in two stages. First, we use the encoder (convolution filters) to learn a suitable basis to represent MPI-corrupted depth images; then, we train the decoder (deconvolution filters) to correct depth from synthetic scenes, generated by using a physically-based, time-resolved renderer. This approach allows us to tackle a key problem in ToF, the lack of ground-truth data, by using a large-scale captured training set with MPI-corrupted depth to train the encoder, and a smaller synthetic training set with ground truth depth to train the decoder stage of the network. We demonstrate and validate our method on both synthetic and real complex scenarios, using an off-the-shelf ToF camera, and with only the captured, incorrect depth as input.}, bibtype = {article}, author = {Marco, Julio and Hernandez, Quercus and Muñoz, Adolfo and Dong, Yue and Jarabo, Adrian and Kim, Min H. and Tong, Xin and Gutierrez, Diego}, doi = {10.1145/3130800.3130884}, journal = {ACM Transactions on Graphics}, number = {6} }
@article{ title = {3D Point Cloud Classification and Segmentation using 3D Modified Fisher Vector Representation for Convolutional Neural Networks}, type = {article}, year = {2017}, websites = {http://arxiv.org/abs/1711.08241}, id = {a04e66ed-8f92-38ad-95f4-dbffef651b92}, created = {2020-11-09T09:00:51.422Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-09T09:22:05.831Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {The point cloud is gaining prominence as a method for representing 3D shapes, but its irregular format poses a challenge for deep learning methods. The common solution of transforming the data into a 3D voxel grid introduces its own challenges, mainly large memory size. In this paper we propose a novel 3D point cloud representation called 3D Modified Fisher Vectors (3DmFV). Our representation is hybrid as it combines the discrete structure of a grid with continuous generalization of Fisher vectors, in a compact and computationally efficient way. Using the grid enables us to design a new CNN architecture for point cloud classification and part segmentation. In a series of experiments we demonstrate competitive performance or even better than state-of-the-art on challenging benchmark datasets.}, bibtype = {article}, author = {Ben-Shabat, Yizhak and Lindenbaum, Michael and Fischer, Anath} }
@article{ title = {Deep projective 3D semantic segmentation}, type = {article}, year = {2017}, keywords = {Deep learning,Multi-stream deep networks,Point clouds,Semantic segmentation}, pages = {95-107}, volume = {10424 LNCS}, id = {cebbd3fa-3194-3a8a-82be-c6f363128e02}, created = {2020-11-13T11:34:36.779Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-13T11:35:44.753Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {Semantic segmentation of 3D point clouds is a challenging problem with numerous real-world applications. While deep learning has revolutionized the field of image semantic segmentation, its impact on point cloud data has been limited so far. Recent attempts, based on 3D deep learning approaches (3D-CNNs), have achieved below-expected results. Such methods require voxelizations of the underlying point cloud data, leading to decreased spatial resolution and increased memory consumption. Additionally, 3D-CNNs greatly suffer from the limited availability of annotated datasets. In this paper, we propose an alternative framework that avoids the limitations of 3D-CNNs. Instead of directly solving the problem in 3D, we first project the point cloud onto a set of synthetic 2D-images. These images are then used as input to a 2D-CNN, designed for semantic segmentation. Finally, the obtained prediction scores are re-projected to the point cloud to obtain the segmentation results. We further investigate the impact of multiple modalities, such as color, depth and surface normals, in a multi-stream network architecture. Experiments are performed on the recent Semantic3D dataset. Our approach sets a new state-of-the-art by achieving a relative gain of 7.9%, compared to the previous best approach.}, bibtype = {article}, author = {Lawin, Felix Järemo and Danelljan, Martin and Tosteberg, Patrik and Bhat, Goutam and Khan, Fahad Shahbaz and Felsberg, Michael}, doi = {10.1007/978-3-319-64689-3_8}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {Infrared colorization using deep convolutional neural networks}, type = {article}, year = {2017}, pages = {61-68}, id = {f2063758-8af4-32df-ac2c-fb2f8f71fa3c}, created = {2020-11-16T11:26:16.159Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-16T11:26:20.435Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {6533efe4-7189-42a2-b4b6-a9f175595b19}, private_publication = {false}, abstract = {This paper proposes a method for transferring the RGB color spectrum to near-infrared (NIR) images using deep multi-scale convolutional neural networks. A direct and integrated transfer between NIR and RGB pixels is trained. The trained model does not require any user guidance or a reference image database in the recall phase to produce images with a natural appearance. To preserve the rich details of the NIR image, its high frequency features are transferred to the estimated RGB image. The presented approach is trained and evaluated on a real-world dataset containing a large amount of road scene images in summer. The dataset was captured by a multi-CCD NIR/RGB camera, which ensures a perfect pixel to pixel registration.}, bibtype = {article}, author = {Limmer, Matthias and Lensch, Hendrik P.A.}, doi = {10.1109/ICMLA.2016.114}, journal = {Proceedings - 2016 15th IEEE International Conference on Machine Learning and Applications, ICMLA 2016} }
@article{ title = {Fusion of stereo vision for pedestrian recognition using convolutional neural networks}, type = {article}, year = {2017}, pages = {47-52}, id = {96c176a4-3a10-33cf-9dd1-af1fa93d9c82}, created = {2020-11-16T11:56:20.667Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-24T11:29:17.008Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {f3937f49-b3bb-4fad-a335-4fb2125beca8,71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {Pedestrian detection is a highly debated issue in the scientific community due to its outstanding importance for a large number of applications, especially in the fields of automotive safety, robotics and surveillance. In spite of the widely varying methods developed in recent years, pedestrian detection is still an open challenge whose accuracy and robustness has to be improved. Therefore, in this paper, we focus on improving the classification component in the pedestrian detection task on the Daimler stereo vision data set by adopting two approaches: 1) by combining three image modalities (intensity, depth and flow) to feed a unique convolutional neural network (CNN) and 2) by fusing the results of three independent CNNs.}, bibtype = {article}, author = {Pop, Dǎnuţ Ovidiu and Rogozan, Alexandrina and Nashashibi, Fawzi and Bensrhair, Abdelaziz}, journal = {ESANN 2017 - Proceedings, 25th European Symposium on Artificial Neural Networks, Computational Intelligence and Machine Learning} }
@article{ title = {View/state planning for three-dimensional object reconstruction under uncertainty}, type = {article}, year = {2017}, keywords = {Motion planning,Next best view,Object reconstruction,Uncertainty}, pages = {89-109}, volume = {41}, publisher = {Springer US}, id = {db0170a1-6c49-39cf-bdcc-28d8a6b1e815}, created = {2021-01-25T14:53:33.640Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-22T14:53:30.368Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f,08ef6744-0b10-438e-84da-dc07f1f8ab47}, private_publication = {false}, abstract = {We propose a holistic approach for three-dimensional (3D) object reconstruction with a mobile manipulator robot with an eye-in-hand sensor; considering the plan to reach the desired view/state, and the uncertainty in both observations and controls. This is one of the first methods that determines the next best view/state in the state space, following a methodology in which a set of candidate views/states is directly generated in the state space, and later only a subset of these views is kept by filtering the original set. It also determines the controls that yield a collision free trajectory to reach a state using rapidly-exploring random trees. To decrease the processing time we propose an efficient evaluation strategy based on filters, and a 3D visibility calculation with hierarchical ray tracing. The next best view/state is selected based on the expected utility, generating samples in the control space based on an error distribution according to the dynamics of the robot. This makes the method robust to positioning error, significantly reducing the collision rate and increasing the coverage, as shown in the experiments. Several experiments in simulation and with a real mobile manipulator robot with 8 degrees of freedom show that the proposed method provides an effective and fast method for a mobile manipulator to build 3D models of unknown objects. To our knowledge, this is one of the first works that demonstrates the reconstruction of complex objects with a real mobile manipulator considering uncertainty in the controls.}, bibtype = {article}, author = {Vasquez-Gomez, J. Irving and Sucar, L. Enrique and Murrieta-Cid, Rafael}, doi = {10.1007/s10514-015-9531-3}, journal = {Autonomous Robots}, number = {1} }
@article{ title = {Improving multi-view object recognition by detecting changes in point clouds}, type = {article}, year = {2017}, id = {8551249c-0781-372d-a317-9d32513f101a}, created = {2021-01-26T15:00:57.223Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-28T07:55:33.126Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f,13d43b82-d9b4-40a8-9031-8e926a718ef0}, private_publication = {false}, abstract = {This paper proposes the use of change detection in a multi-view object recognition system in order to improve its flexibility and effectiveness in dynamic environments. Multi-view recognition approaches are essential to overcome problems related to clutter, occlusion or camera noise, but the existing systems usually assume a static environment. The presence of dynamic objects raises another issue - the inconsistencies introduced to the internal scene model. We show that by incorporating the change detection and correction of the inherent scene inconsistencies, we reduce false positive detections by 70% in average for moving objects when tested on the publicly available TUW dataset. To reduce time required for verifying a large set of accumulated object pose hypotheses, we further integrate a clustering approach into the original multi-view object recognition system and show that this reduces computation time by 16%.}, bibtype = {article}, author = {Velas, Martin and Faulhammer, Thomas and Spanel, Michal and Zillich, Michael and Vincze, Markus}, doi = {10.1109/SSCI.2016.7850045}, journal = {2016 IEEE Symposium Series on Computational Intelligence, SSCI 2016} }
@article{ title = {ScanNet: Richly-annotated 3D reconstructions of indoor scenes}, type = {article}, year = {2017}, pages = {2432-2443}, volume = {2017-Janua}, id = {0edc9ce5-bde5-3914-8e02-9a4097a5380b}, created = {2021-01-27T10:09:33.305Z}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-18T10:02:59.816Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c,1e7b477c-c241-48c3-a542-ad06e3d39dd5}, private_publication = {false}, abstract = {A key requirement for leveraging supervised deep learning methods is the availability of large, labeled datasets. Unfortunately, in the context of RGB-D scene understanding, very little data is available - current datasets cover a small range of scene views and have limited semantic annotations. To address this issue, we introduce ScanNet, an RGB-D video dataset containing 2.5M views in 1513 scenes annotated with 3D camera poses, surface reconstructions, and semantic segmentations. To collect this data, we designed an easy-to-use and scalable RGB-D capture system that includes automated surface reconstruction and crowd-sourced semantic annotation. We show that using this data helps achieve state-of-the-art performance on several 3D scene understanding tasks, including 3D object classification, semantic voxel labeling, and CAD model retrieval.}, bibtype = {article}, author = {Dai, Angela and Chang, Angel X. and Savva, Manolis and Halber, Maciej and Funkhouser, Thomas and Nießner, Matthias}, doi = {10.1109/CVPR.2017.261}, journal = {Proceedings - 30th IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017} }
@article{ title = {Semantic3D.net: A New Large-Scale Point Cloud Classification Benchmark}, type = {article}, year = {2017}, pages = {91-98}, volume = {4}, id = {4010f74e-ecfc-3008-be58-f629d5eaed4b}, created = {2021-01-27T10:09:33.341Z}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-27T10:09:40.156Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {This paper presents a new 3D point cloud classification benchmark data set with over four billion manually labelled points, meant as input for data-hungry (deep) learning methods. We also discuss first submissions to the benchmark that use deep convolutional neural networks (CNNs) as a work horse, which already show remarkable performance improvements over state-of-the-art. CNNs have become the de-facto standard for many tasks in computer vision and machine learning like semantic segmentation or object detection in images, but have not yet led to a true breakthrough for 3D point cloud labelling tasks due to lack of training data. With the massive data set presented in this paper, we aim at closing this data gap to help unleash the full potential of deep learning methods for 3D labelling tasks. Our semantic3D.net data set consists of dense point clouds acquired with static terrestrial laser scanners. It contains 8 semantic classes and covers a wide range of urban outdoor scenes: churches, streets, railroad tracks, squares, villages, soccer fields and castles. We describe our labelling interface and show that our data set provides more dense and complete point clouds with much higher overall number of labelled points compared to those already available to the research community. We further provide baseline method descriptions and comparison between methods submitted to our online system. We hope semantic3D.net will pave the way for deep learning methods in 3D point cloud labelling to learn richer, more general 3D representations, and first submissions after only a few months indicate that this might indeed be the case.}, bibtype = {article}, author = {Hackel, T. and Savinov, N. and Ladicky, L. and Wegner, J. D. and Schindler, K. and Pollefeys, M.}, doi = {10.5194/isprs-annals-IV-1-W1-91-2017}, journal = {ISPRS Annals of the Photogrammetry, Remote Sensing and Spatial Information Sciences}, number = {1W1} }
@inproceedings{ title = {PointNet: Deep learning on point sets for 3D classification and segmentation}, type = {inproceedings}, year = {2017}, pages = {77-85}, volume = {2017-Janua}, websites = {https://arxiv.org/abs/1612.00593v2}, month = {11}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, day = {6}, id = {afbbe462-c855-3857-8c79-104411801bfc}, created = {2021-01-27T10:29:02.549Z}, accessed = {2021-01-27}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:15:24.433Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Qi2017}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {Point cloud is an important type of geometric data structure. Due to its irregular format, most researchers transform such data to regular 3D voxel grids or collections of images. This, however, renders data unnecessarily voluminous and causes issues. In this paper, we design a novel type of neural network that directly consumes point clouds, which well respects the permutation invariance of points in the input. Our network, named PointNet, provides a unified architecture for applications ranging from object classification, part segmentation, to scene semantic parsing. Though simple, PointNet is highly efficient and effective. Empirically, it shows strong performance on par or even better than state of the art. Theoretically, we provide analysis towards understanding of what the network has learnt and why the network is robust with respect to input perturbation and corruption.}, bibtype = {inproceedings}, author = {Qi, Charles R. and Su, Hao and Mo, Kaichun and Guibas, Leonidas J.}, doi = {10.1109/CVPR.2017.16}, booktitle = {Proceedings - 30th IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017} }
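The key property claimed in the PointNet abstract, invariance to the ordering of the input points, comes from applying one shared function to every point and aggregating with a symmetric operation such as max pooling. The NumPy sketch below only illustrates that symmetric-function idea with made-up layer sizes and random weights; it is not the published architecture or training procedure:

import numpy as np

def shared_mlp(points, weights, biases):
    """Apply the same small MLP to every point of an (N, 3) cloud independently."""
    h = points
    for W, b in zip(weights, biases):
        h = np.maximum(h @ W + b, 0.0)   # ReLU, applied point-wise
    return h                              # (N, feature_dim)

def global_feature(points, weights, biases):
    """Symmetric aggregation: a max over points makes the result order-invariant."""
    return shared_mlp(points, weights, biases).max(axis=0)

# Toy check: shuffling the points leaves the global feature unchanged.
rng = np.random.default_rng(0)
Ws = [rng.standard_normal((3, 64)), rng.standard_normal((64, 128))]
bs = [np.zeros(64), np.zeros(128)]
cloud = rng.standard_normal((1024, 3))
assert np.allclose(global_feature(cloud, Ws, bs),
                   global_feature(cloud[rng.permutation(1024)], Ws, bs))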
@article{ title = {PointNet++: Deep Hierarchical Feature Learning on Point Sets in a Metric Space}, type = {article}, year = {2017}, pages = {5100-5109}, volume = {2017-Decem}, websites = {http://arxiv.org/abs/1706.02413}, month = {6}, publisher = {Neural information processing systems foundation}, day = {7}, id = {6a50c7f1-f6a3-3b68-8ca4-5aac273a1638}, created = {2021-01-27T10:30:25.247Z}, accessed = {2021-01-27}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-03-31T07:21:16.757Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Qi2017}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {Few prior works study deep learning on point sets. PointNet by Qi et al. is a pioneer in this direction. However, by design PointNet does not capture local structures induced by the metric space points live in, limiting its ability to recognize fine-grained patterns and generalizability to complex scenes. In this work, we introduce a hierarchical neural network that applies PointNet recursively on a nested partitioning of the input point set. By exploiting metric space distances, our network is able to learn local features with increasing contextual scales. With further observation that point sets are usually sampled with varying densities, which results in greatly decreased performance for networks trained on uniform densities, we propose novel set learning layers to adaptively combine features from multiple scales. Experiments show that our network called PointNet++ is able to learn deep point set features efficiently and robustly. In particular, results significantly better than state-of-the-art have been obtained on challenging benchmarks of 3D point clouds.}, bibtype = {article}, author = {Qi, Charles R. and Yi, Li and Su, Hao and Guibas, Leonidas J.}, journal = {Advances in Neural Information Processing Systems} }
@article{ title = {3D bounding box estimation using deep learning and geometry}, type = {article}, year = {2017}, pages = {5632-5640}, volume = {2017-Janua}, id = {a8fbaaf9-a4ec-3050-b9d6-a3b9099ce76e}, created = {2021-01-28T07:55:30.085Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-28T07:55:49.166Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f,471f331d-8560-4b9e-b910-e5f849b6fcfd}, private_publication = {false}, abstract = {We present a method for 3D object detection and pose estimation from a single image. In contrast to current techniques that only regress the 3D orientation of an object, our method first regresses relatively stable 3D object properties using a deep convolutional neural network and then combines these estimates with geometric constraints provided by a 2D object bounding box to produce a complete 3D bounding box. The first network output estimates the 3D object orientation using a novel hybrid discrete-continuous loss, which significantly outperforms the L2 loss. The second output regresses the 3D object dimensions, which have relatively little variance compared to alternatives and can often be predicted for many object types. These estimates, combined with the geometric constraints on translation imposed by the 2D bounding box, enable us to recover a stable and accurate 3D object pose. We evaluate our method on the challenging KITTI object detection benchmark [2] both on the official metric of 3D orientation estimation and also on the accuracy of the obtained 3D bounding boxes. Although conceptually simple, our method outperforms more complex and computationally expensive approaches that leverage semantic segmentation, instance level segmentation and flat ground priors [4] and sub-category detection [23][24]. Our discrete-continuous loss also produces state of the art results for 3D viewpoint estimation on the Pascal 3D+ dataset[26].}, bibtype = {article}, author = {Mousavian, Arsalan and Anguelov, Dragomir and Košecká, Jana and Flynn, John}, doi = {10.1109/CVPR.2017.597}, journal = {Proceedings - 30th IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017} }
@inproceedings{ title = {OctNet: Learning deep 3D representations at high resolutions}, type = {inproceedings}, year = {2017}, pages = {6620-6629}, volume = {2017-January}, websites = {http://arxiv.org/abs/1611.05009}, month = {11}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, day = {6}, id = {04229a18-4ec0-3d72-a644-3be3e58cc226}, created = {2021-01-28T08:55:31.674Z}, accessed = {2021-01-28}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-28T08:55:43.420Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {We present OctNet, a representation for deep learning with sparse 3D data. In contrast to existing models, our representation enables 3D convolutional networks which are both deep and high resolution. Towards this goal, we exploit the sparsity in the input data to hierarchically partition the space using a set of unbalanced octrees where each leaf node stores a pooled feature representation. This allows to focus memory allocation and computation to the relevant dense regions and enables deeper networks without compromising resolution. We demonstrate the utility of our OctNet representation by analyzing the impact of resolution on several 3D tasks including 3D object classification, orientation estimation and point cloud labeling.}, bibtype = {inproceedings}, author = {Riegler, Gernot and Ulusoy, Ali Osman and Geiger, Andreas}, doi = {10.1109/CVPR.2017.701}, booktitle = {Proceedings - 30th IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017} }
@inproceedings{ title = {Multi-view 3D object detection network for autonomous driving}, type = {inproceedings}, year = {2017}, pages = {6526-6534}, volume = {2017-January}, websites = {http://arxiv.org/abs/1611.07759}, month = {11}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, day = {6}, id = {0abe1875-3bba-3441-a5bb-e31af8ec858f}, created = {2021-01-28T08:59:46.649Z}, accessed = {2021-01-28}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-28T08:59:50.385Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {This paper aims at high-accuracy 3D object detection in autonomous driving scenario. We propose Multi-View 3D networks (MV3D), a sensory-fusion framework that takes both LIDAR point cloud and RGB images as input and predicts oriented 3D bounding boxes. We encode the sparse 3D point cloud with a compact multi-view representation. The network is composed of two subnetworks: one for 3D object proposal generation and another for multi-view feature fusion. The proposal network generates 3D candidate boxes efficiently from the bird's eye view representation of 3D point cloud. We design a deep fusion scheme to combine region-wise features from multiple views and enable interactions between intermediate layers of different paths. Experiments on the challenging KITTI benchmark show that our approach outperforms the state-of-the-art by around 25% and 30% AP on the tasks of 3D localization and 3D detection. In addition, for 2D detection, our approach obtains 14.9% higher AP than the state-of-the-art on the hard data among the LIDAR-based methods.}, bibtype = {inproceedings}, author = {Chen, Xiaozhi and Ma, Huimin and Wan, Ji and Li, Bo and Xia, Tian}, doi = {10.1109/CVPR.2017.691}, booktitle = {Proceedings - 30th IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017} }
@article{ title = {O-CNN: Octree-based Convolutional Neural Networks for 3D Shape Analysis}, type = {article}, year = {2017}, keywords = {Convolutional neural network,Object classification,Octree,Shape retrieval,Shape segmentation}, volume = {36}, websites = {http://arxiv.org/abs/1712.01537,http://dx.doi.org/10.1145/3072959.3073608}, month = {12}, publisher = {Association for Computing Machinery}, day = {5}, id = {260015da-b6e0-3aa0-b376-c584721154a2}, created = {2021-02-09T08:21:04.894Z}, accessed = {2021-02-09}, file_attached = {false}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T08:21:05.127Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {dc009c1c-5c21-43bd-9c8a-d37db3983b2e}, private_publication = {false}, abstract = {We present O-CNN, an Octree-based Convolutional Neural Network (CNN) for 3D shape analysis. Built upon the octree representation of 3D shapes, our method takes the average normal vectors of a 3D model sampled in the finest leaf octants as input and performs 3D CNN operations on the octants occupied by the 3D shape surface. We design a novel octree data structure to efficiently store the octant information and CNN features into the graphics memory and execute the entire O-CNN training and evaluation on the GPU. O-CNN supports various CNN structures and works for 3D shapes in different representations. By restraining the computations on the octants occupied by 3D surfaces, the memory and computational costs of the O-CNN grow quadratically as the depth of the octree increases, which makes the 3D CNN feasible for high-resolution 3D models. We compare the performance of the O-CNN with other existing 3D CNN solutions and demonstrate the efficiency and efficacy of O-CNN in three shape analysis tasks, including object classification, shape retrieval, and shape segmentation.}, bibtype = {article}, author = {Wang, Peng-Shuai and Liu, Yang and Guo, Yu-Xiao and Sun, Chun-Yu and Tong, Xin}, doi = {10.1145/3072959.3073608}, journal = {ACM Transactions on Graphics}, number = {4} }
@article{ title = {An Adaptable, Probabilistic, Next-Best View Algorithm for Reconstruction of Unknown 3-D Objects}, type = {article}, year = {2017}, keywords = {Autonomous agents,Motion and path planning,Probability and statistical methods}, pages = {1540-1547}, volume = {2}, publisher = {IEEE}, id = {628416b7-e04a-388b-ab81-f00fc7eeed0c}, created = {2021-02-09T08:36:10.557Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T08:37:01.826Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {08ef6744-0b10-438e-84da-dc07f1f8ab47}, private_publication = {false}, abstract = {Autonomous mobile robots perform many tasks, such as grasping and inspection, that may require complete models of three-dimensional (3-D) objects in the environment. If little or no knowledge about an object is known a priori, the robot must take sensor measurements from strategically determined viewpoints in order to reconstruct a 3-D model of the object. We propose an autonomous object reconstruction approach for mobile robots that is very general, with no assumptions about object shape or size, such as a bounding box or predetermined set of candidate viewpoints. A probabilistic, volumetric method for determining the optimal next-best view is developed based on a partial model of a 3-D object of unknown shape and size. The proposed method integrates an object probability characteristic to determine sensor views that incrementally reconstruct a 3-D model of the object. Experiments in simulation and on a real-world robot validate the work and compare it to the state of the art.}, bibtype = {article}, author = {Daudelin, Jonathan and Campbell, Mark}, doi = {10.1109/LRA.2017.2660769}, journal = {IEEE Robotics and Automation Letters}, number = {3} }
@article{ title = {Online inspection path planning for autonomous 3D modeling using a micro-aerial vehicle}, type = {article}, year = {2017}, pages = {6217-6224}, publisher = {IEEE}, id = {7e9d5bf4-1811-38bc-94b8-664dd345e761}, created = {2021-02-09T08:36:10.645Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T08:36:59.365Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {08ef6744-0b10-438e-84da-dc07f1f8ab47}, private_publication = {false}, abstract = {In this paper, we propose a novel algorithm for planning exploration paths to generate 3D models of unknown environments by using a micro-aerial vehicle (MAV). Our algorithm initially determines a next-best-view (NBV) that maximizes information gain and plans a collision-free path to reach the NBV. Along the path, the MAV explores the greatest unknown area although it sometimes misses minor unreconstructed region, such as a hole or a sparse surface. To cover such a region, we propose an online inspection algorithm that consistently provides an optimal coverage path toward the NBV in real time. The algorithm iteratively refines an inspection path according to the acquired information until the modeling of a specific local area is complete. We evaluated the proposed algorithm by comparing it with other state-of-the-art approaches through simulated experiments. The results show that our algorithm outperforms the other approaches in both exploration and 3D modeling scenarios.}, bibtype = {article}, author = {Song, Soohwan and Jo, Sungho}, doi = {10.1109/ICRA.2017.7989737}, journal = {Proceedings - IEEE International Conference on Robotics and Automation} }
@article{ title = {A reinforcement learning approach to the view planning problem}, type = {article}, year = {2017}, pages = {5094-5102}, volume = {2017-Janua}, id = {a5e1358a-1010-3de8-b597-9f6633bbd207}, created = {2021-02-09T17:05:47.289Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T17:06:19.185Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {08ef6744-0b10-438e-84da-dc07f1f8ab47}, private_publication = {false}, abstract = {We present a Reinforcement Learning (RL) solution to the view planning problem (VPP), which generates a sequence of view points that are capable of sensing all accessible area of a given object represented as a 3D model. In doing so, the goal is to minimize the number of view points, making the VPP a class of set covering optimization problem (SCOP). The SCOP is NP-hard, and the inapproximability results tell us that the greedy algorithm provides the best approximation that runs in polynomial time. In order to find a solution that is better than the greedy algorithm, (i) we introduce a novel score function by exploiting the geometry of the 3D model, (ii) we devise an intuitive approach to VPP using this score function, and (iii) we cast VPP as a Markovian Decision Process (MDP), and solve the MDP in RL framework using well-known RL algorithms. In particular, we use SARSA, Watkins-Q and TD with function approximation to solve the MDP. We compare the results of our method with the baseline greedy algorithm in an extensive set of test objects, and show that we can outperform the baseline in almost all cases.}, bibtype = {article}, author = {Kaba, Mustafa Devrim and Uzunbas, Mustafa Gokhan and Lim, Ser Nam}, doi = {10.1109/CVPR.2017.541}, journal = {Proceedings - 30th IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017} }
@article{ title = {Attention is all you need}, type = {article}, year = {2017}, pages = {5999-6009}, volume = {2017-Decem}, id = {4b5899be-bf05-3461-9911-4912cdf288af}, created = {2021-02-15T14:11:32.377Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-26T12:19:40.131Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Vaswani2017}, folder_uuids = {20ccb950-fef9-4ee1-800c-a60ba9f1df16,4f36a0a5-b08a-4f70-b020-4daf83cb0507,2f2b519d-56f0-4e04-b335-d8e25f087073}, private_publication = {false}, abstract = {The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.0 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature.}, bibtype = {article}, author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, Łukasz and Polosukhin, Illia}, journal = {Advances in Neural Information Processing Systems}, number = {Nips} }
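The building block behind the Transformer described above is scaled dot-product attention, Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) V. A minimal single-head sketch without masking, batching or learned projections (the function name is chosen here for illustration):

import numpy as np

def scaled_dot_product_attention(Q, K, V):
    """Q: (n_q, d_k), K: (n_k, d_k), V: (n_k, d_v).
    Each query attends to all keys; the weights are a softmax over the
    scaled dot products, and the output is the weighted sum of values."""
    d_k = Q.shape[-1]
    scores = Q @ K.T / np.sqrt(d_k)               # (n_q, n_k)
    scores -= scores.max(axis=-1, keepdims=True)  # subtract row max for numerical stability
    weights = np.exp(scores)
    weights /= weights.sum(axis=-1, keepdims=True)
    return weights @ V                            # (n_q, d_v)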
@article{ title = {PCPNET learning local shape properties from raw point clouds}, type = {article}, year = {2017}, volume = {37}, id = {3fefd21c-1a9a-3874-96de-0aab1c0d4ea4}, created = {2021-02-17T09:22:19.390Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-15T12:45:00.176Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {In this paper, we propose PCPNET, a deep-learning based approach for estimating local 3D shape properties in point clouds. In contrast to the majority of prior techniques that concentrate on global or mid-level attributes, e.g., for shape classification or semantic labeling, we suggest a patch-based learning method, in which a series of local patches at multiple scales around each point is encoded in a structured manner. Our approach is especially well-adapted for estimating local shape properties such as normals (both unoriented and oriented) and curvature from raw point clouds in the presence of strong noise and multi-scale features. Our main contributions include both a novel multi-scale variant of the recently proposed PointNet architecture with emphasis on local shape information, and a series of novel applications in which we demonstrate how learning from training data arising from well-structured triangle meshes, and applying the trained model to noisy point clouds can produce superior results compared to specialized state-of-the-art techniques. Finally, we demonstrate the utility of our approach in the context of shape reconstruction, by showing how it can be used to extract normal orientation information from point clouds.}, bibtype = {article}, author = {Guerrero, Paul and Kleiman, Yanir and Ovsjanikov, Maks and Mitra, Niloy J.}, journal = {arXiv}, number = {2} }
@article{ title = {Feature pyramid networks for object detection}, type = {article}, year = {2017}, pages = {936-944}, volume = {2017-Janua}, id = {cf953958-61ed-3272-a04f-60f968a7ec10}, created = {2021-02-24T11:29:14.412Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.171Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Lin2017}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47,019ca26f-d15a-40ee-aa8d-7a0fbf949316,71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {Feature pyramids are a basic component in recognition systems for detecting objects at different scales. But recent deep learning object detectors have avoided pyramid representations, in part because they are compute and memory intensive. In this paper, we exploit the inherent multi-scale, pyramidal hierarchy of deep convolutional networks to construct feature pyramids with marginal extra cost. A top-down architecture with lateral connections is developed for building high-level semantic feature maps at all scales. This architecture, called a Feature Pyramid Network (FPN), shows significant improvement as a generic feature extractor in several applications. Using FPN in a basic Faster R-CNN system, our method achieves state-of-the-art single-model results on the COCO detection benchmark without bells and whistles, surpassing all existing single-model entries including those from the COCO 2016 challenge winners. In addition, our method can run at 5 FPS on a GPU and thus is a practical and accurate solution to multi-scale object detection. Code will be made publicly available.}, bibtype = {article}, author = {Lin, Tsung Yi and Dollár, Piotr and Girshick, Ross and He, Kaiming and Hariharan, Bharath and Belongie, Serge}, doi = {10.1109/CVPR.2017.106}, journal = {Proceedings - 30th IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017} }
@article{ title = {A review of algorithms for filtering the 3D point cloud}, type = {article}, year = {2017}, keywords = {3D point cloud,Feature-preserving,Filtering methods,Noise reduction}, pages = {103-112}, volume = {57}, websites = {http://dx.doi.org/10.1016/j.image.2017.05.009}, publisher = {Elsevier Ltd}, id = {be3c54ce-f02c-3458-8c19-989013800d29}, created = {2021-03-08T09:43:04.287Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-03-31T07:21:16.765Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Han2017}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {In recent years, 3D point cloud has gained increasing attention as a new representation for objects. However, the raw point cloud is often noisy and contains outliers. Therefore, it is crucial to remove the noise and outliers from the point cloud while preserving the features, in particular, its fine details. This paper makes an attempt to present a comprehensive analysis of the state-of-the-art methods for filtering point cloud. The existing methods are categorized into seven classes, which concentrate on their common and obvious traits. An experimental evaluation is also performed to demonstrate robustness, effectiveness and computational efficiency of several methods used widely in practice.}, bibtype = {article}, author = {Han, Xian Feng and Jin, Jesse S. and Wang, Ming Jie and Jiang, Wei and Gao, Lei and Xiao, Liping}, doi = {10.1016/j.image.2017.05.009}, journal = {Signal Processing: Image Communication}, number = {February} }
@article{ title = {Depth errors analysis and correction for time-of-flight (ToF) cameras}, type = {article}, year = {2017}, keywords = {Depth error,Error correction,Error modeling,Particle filter,SVM,ToF camera}, volume = {17}, id = {3f40f117-8150-35eb-a0f9-9c3c2f6acb76}, created = {2021-03-25T08:15:16.116Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-10T07:17:52.907Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47,019ca26f-d15a-40ee-aa8d-7a0fbf949316}, private_publication = {false}, abstract = {Time-of-Flight (ToF) cameras, a technology which has developed rapidly in recent years, are 3D imaging sensors providing a depth image as well as an amplitude image with a high frame rate. As a ToF camera is limited by the imaging conditions and external environment, its captured data are always subject to certain errors. This paper analyzes the influence of typical external distractions including material, color, distance, lighting, etc. on the depth error of ToF cameras. Our experiments indicated that factors such as lighting, color, material, and distance could cause different influences on the depth error of ToF cameras. However, since the forms of errors are uncertain, it’s difficult to summarize them in a unified law. To further improve the measurement accuracy, this paper proposes an error correction method based on Particle Filter-Support Vector Machine (PF-SVM). Moreover, the experiment results showed that this method can effectively reduce the depth error of ToF cameras to 4.6 mm within its full measurement range (0.5–5 m).}, bibtype = {article}, author = {He, Ying and Liang, Bin and Zou, Yu and He, Jin and Yang, Jun}, doi = {10.3390/s17010092}, journal = {Sensors (Switzerland)}, number = {1} }
@article{ title = {US009767598B2}, type = {article}, year = {2017}, id = {bc24ebbc-a7a1-39a2-a33d-3e7a6d95e65d}, created = {2021-04-08T11:26:53.260Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-16T05:21:11.207Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {ffa7aa64-dc15-4667-8778-6ff9b9800bbb}, private_publication = {false}, bibtype = {article}, author = {Watson, Mark A} }
@article{ title = {US20170227942A1}, type = {article}, year = {2017}, pages = {2015-2018}, volume = {1}, id = {dd97cd5e-615c-3e15-8c5e-c4d114d78bf7}, created = {2021-04-08T11:26:53.378Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-15T08:24:38.063Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ffa7aa64-dc15-4667-8778-6ff9b9800bbb}, private_publication = {false}, abstract = {An actuator (12) includes a moving member (18) pivotally connected to a base unit (14) for rotation about an axis. The driving force for rotation of the moving member (18) relative to the base unit (14) is provided by a pair of antagonistically operating shape memory alloy (SMA) wires (48. 50) and transmitted via a torsional spring (56). An endoscope, or a Snake-like robot (66), may include one or more of the actuators (12).}, bibtype = {article}, author = {David, Mountain}, number = {19} }
@article{ title = {US009626767B2}, type = {article}, year = {2017}, volume = {2}, id = {d90fed66-263b-3719-b926-085fffb9de11}, created = {2021-04-14T07:42:10.388Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-16T05:21:11.214Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {ffa7aa64-dc15-4667-8778-6ff9b9800bbb}, private_publication = {false}, bibtype = {article}, author = {Apparatus, Producing and Apparatus, Capturing and Medium, Storage}, number = {12} }
@article{ title = {Incremental Network Quantization: Towards Lossless CNNs with Low-Precision Weights}, type = {article}, year = {2017}, websites = {http://arxiv.org/abs/1702.03044}, month = {2}, publisher = {International Conference on Learning Representations, ICLR}, day = {9}, id = {756ce1dd-51db-32cd-a9e4-92a9f64b7218}, created = {2021-06-14T08:29:03.438Z}, accessed = {2021-06-14}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-06-14T08:31:45.984Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {c9e2a751-ce83-45dd-9c0e-bdac57df3cf4,cf9189f6-f354-4337-8aaf-a5f12cbf8660}, private_publication = {false}, abstract = {This paper presents incremental network quantization (INQ), a novel method, targeting to efficiently convert any pre-trained full-precision convolutional neural network (CNN) model into a low-precision version whose weights are constrained to be either powers of two or zero. Unlike existing methods which are struggled in noticeable accuracy loss, our INQ has the potential to resolve this issue, as benefiting from two innovations. On one hand, we introduce three interdependent operations, namely weight partition, group-wise quantization and re-training. A well-proven measure is employed to divide the weights in each layer of a pre-trained CNN model into two disjoint groups. The weights in the first group are responsible to form a low-precision base, thus they are quantized by a variable-length encoding method. The weights in the other group are responsible to compensate for the accuracy loss from the quantization, thus they are the ones to be re-trained. On the other hand, these three operations are repeated on the latest re-trained group in an iterative manner until all the weights are converted into low-precision ones, acting as an incremental network quantization and accuracy enhancement procedure. Extensive experiments on the ImageNet classification task using almost all known deep CNN architectures including AlexNet, VGG-16, GoogleNet and ResNets well testify the efficacy of the proposed method. Specifically, at 5-bit quantization, our models have improved accuracy than the 32-bit floating-point references. Taking ResNet-18 as an example, we further show that our quantized models with 4-bit, 3-bit and 2-bit ternary weights have improved or very similar accuracy against its 32-bit floating-point baseline. Besides, impressive results with the combination of network pruning and INQ are also reported. The code is available at https://github.com/Zhouaojun/Incremental-Network-Quantization.}, bibtype = {article}, author = {Zhou, Aojun and Yao, Anbang and Guo, Yiwen and Xu, Lin and Chen, Yurong}, journal = {5th International Conference on Learning Representations, ICLR 2017 - Conference Track Proceedings} }
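As a rough illustration of the low-precision constraint described in the entry above, the sketch below projects a weight matrix onto powers of two or zero. The bit-width handling and level choices are assumptions made for demonstration only; they are not the paper's incremental partition-quantize-retrain procedure.

import numpy as np

def quantize_pow2(w, n_bits=5):
    # Candidate levels: zero plus a range of powers of two derived from the weight magnitudes.
    max_exp = int(np.floor(np.log2(np.abs(w).max())))
    exps = np.arange(max_exp, max_exp - (2 ** (n_bits - 1) - 1), -1)
    levels = np.concatenate(([0.0], 2.0 ** exps))
    # Snap each weight magnitude to the nearest level, keeping the sign.
    idx = np.argmin(np.abs(np.abs(w)[..., None] - levels), axis=-1)
    return np.sign(w) * levels[idx]

w = np.random.randn(4, 4) * 0.1
print(quantize_pow2(w))   # every entry is now zero or a signed power of two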
@article{ title = {MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications}, type = {article}, year = {2017}, websites = {http://arxiv.org/abs/1704.04861}, month = {4}, day = {16}, id = {4510d9b0-ba51-3dcc-b37c-b3c7a698a6fb}, created = {2021-06-14T08:50:08.167Z}, accessed = {2021-06-14}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-06-14T08:50:13.870Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {908f2b91-eba2-44e9-9028-4350c78aceb0}, private_publication = {false}, abstract = {We present a class of efficient models called MobileNets for mobile and embedded vision applications. MobileNets are based on a streamlined architecture that uses depth-wise separable convolutions to build light weight deep neural networks. We introduce two simple global hyper-parameters that efficiently trade off between latency and accuracy. These hyper-parameters allow the model builder to choose the right sized model for their application based on the constraints of the problem. We present extensive experiments on resource and accuracy tradeoffs and show strong performance compared to other popular models on ImageNet classification. We then demonstrate the effectiveness of MobileNets across a wide range of applications and use cases including object detection, finegrain classification, face attributes and large scale geo-localization.}, bibtype = {article}, author = {Howard, Andrew G. and Zhu, Menglong and Chen, Bo and Kalenichenko, Dmitry and Wang, Weijun and Weyand, Tobias and Andreetto, Marco and Adam, Hartwig} }
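A minimal sketch of the depthwise separable convolution the MobileNets abstract describes, written in PyTorch; the module name, layer sizes, and BatchNorm/ReLU placement are illustrative assumptions rather than the authors' implementation.

import torch
import torch.nn as nn

class DepthwiseSeparableConv(nn.Module):
    def __init__(self, in_ch, out_ch, kernel_size=3, stride=1):
        super().__init__()
        # groups=in_ch applies one filter per input channel (depthwise convolution)
        self.depthwise = nn.Conv2d(in_ch, in_ch, kernel_size, stride,
                                   padding=kernel_size // 2, groups=in_ch, bias=False)
        # 1x1 convolution mixes the channels (pointwise convolution)
        self.pointwise = nn.Conv2d(in_ch, out_ch, kernel_size=1, bias=False)
        self.bn1, self.bn2 = nn.BatchNorm2d(in_ch), nn.BatchNorm2d(out_ch)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        x = self.relu(self.bn1(self.depthwise(x)))
        return self.relu(self.bn2(self.pointwise(x)))

x = torch.randn(1, 32, 56, 56)
print(DepthwiseSeparableConv(32, 64)(x).shape)   # torch.Size([1, 64, 56, 56])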
@article{ title = {A Survey of Model Compression and Acceleration for Deep Neural Networks}, type = {article}, year = {2017}, keywords = {Convolutional Neural Networks,Index Terms-Deep Learning,Model Compression and Acceleration}, websites = {http://arxiv.org/abs/1710.09282}, month = {10}, day = {23}, id = {7295b7c3-2916-3c4a-a7aa-42ce35ed2be9}, created = {2021-06-14T08:58:31.705Z}, accessed = {2021-06-14}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-19T13:02:25.263Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {908f2b91-eba2-44e9-9028-4350c78aceb0}, private_publication = {false}, abstract = {Deep neural networks (DNNs) have recently achieved great success in many visual recognition tasks. However, existing deep neural network models are computationally expensive and memory intensive, hindering their deployment in devices with low memory resources or in applications with strict latency requirements. Therefore, a natural thought is to perform model compression and acceleration in deep networks without significantly decreasing the model performance. During the past five years, tremendous progress has been made in this area. In this paper, we review the recent techniques for compacting and accelerating DNN models. In general, these techniques are divided into four categories: parameter pruning and quantization, low-rank factorization, transferred/compact convolutional filters, and knowledge distillation. Methods of parameter pruning and quantization are described first, after that the other techniques are introduced. For each category, we also provide insightful analysis about the performance, related applications, advantages, and drawbacks. Then we go through some very recent successful methods, for example, dynamic capacity networks and stochastic depths networks. After that, we survey the evaluation matrices, the main datasets used for evaluating the model performance, and recent benchmark efforts. Finally, we conclude this paper, discuss remaining the challenges and possible directions for future work.}, bibtype = {article}, author = {Cheng, Yu and Wang, Duo and Zhou, Pan and Zhang, Tao} }
@article{ title = {SegNet: A Deep Convolutional Encoder-Decoder Architecture for Image Segmentation}, type = {article}, year = {2017}, keywords = {Deep convolutional neural networks,decoder,encoder,indoor scenes,pooling,road scenes,semantic pixel-wise segmentation,upsampling}, pages = {2481-2495}, volume = {39}, id = {e8a9cc12-5898-36d6-a570-779efe51475e}, created = {2021-07-01T07:40:22.878Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-01T07:40:31.052Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {We present a novel and practical deep fully convolutional neural network architecture for semantic pixel-wise segmentation termed SegNet. This core trainable segmentation engine consists of an encoder network, a corresponding decoder network followed by a pixel-wise classification layer. The architecture of the encoder network is topologically identical to the 13 convolutional layers in the VGG16 network [1]. The role of the decoder network is to map the low resolution encoder feature maps to full input resolution feature maps for pixel-wise classification. The novelty of SegNet lies is in the manner in which the decoder upsamples its lower resolution input feature map(s). Specifically, the decoder uses pooling indices computed in the max-pooling step of the corresponding encoder to perform non-linear upsampling. This eliminates the need for learning to upsample. The upsampled maps are sparse and are then convolved with trainable filters to produce dense feature maps. We compare our proposed architecture with the widely adopted FCN [2] and also with the well known DeepLab-LargeFOV [3] , DeconvNet [4] architectures. This comparison reveals the memory versus accuracy trade-off involved in achieving good segmentation performance. SegNet was primarily motivated by scene understanding applications. Hence, it is designed to be efficient both in terms of memory and computational time during inference. It is also significantly smaller in the number of trainable parameters than other competing architectures and can be trained end-to-end using stochastic gradient descent. We also performed a controlled benchmark of SegNet and other architectures on both road scenes and SUN RGB-D indoor scene segmentation tasks. These quantitative assessments show that SegNet provides good performance with competitive inference time and most efficient inference memory-wise as compared to other architectures. We also provide a Caffe implementation of SegNet and a web demo at http://mi.eng.cam.ac.uk/projects/segnet/.}, bibtype = {article}, author = {Badrinarayanan, Vijay and Kendall, Alex and Cipolla, Roberto}, doi = {10.1109/TPAMI.2016.2644615}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, number = {12} }
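A small sketch of the decoder trick summarized in the SegNet abstract: reusing the encoder's max-pooling indices for non-linear upsampling instead of learning to upsample. PyTorch's return_indices/MaxUnpool2d pair is used here purely for illustration.

import torch
import torch.nn as nn

pool = nn.MaxPool2d(2, stride=2, return_indices=True)
unpool = nn.MaxUnpool2d(2, stride=2)

x = torch.randn(1, 64, 32, 32)
pooled, indices = pool(x)             # encoder keeps the argmax locations
upsampled = unpool(pooled, indices)   # decoder places values back at those locations; result is sparse
print(pooled.shape, upsampled.shape)  # (1, 64, 16, 16) (1, 64, 32, 32)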
@article{ title = {Geometric deep learning on graphs and manifolds using mixture model CNNs}, type = {article}, year = {2017}, pages = {5425-5434}, volume = {2017-Janua}, id = {ca7a75c6-4e45-3096-922a-fe6ddea6a354}, created = {2021-07-12T10:19:36.481Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T10:19:50.224Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {20ccb950-fef9-4ee1-800c-a60ba9f1df16}, private_publication = {false}, abstract = {Deep learning has achieved a remarkable performance breakthrough in several fields, most notably in speech recognition, natural language processing, and computer vision. In particular, convolutional neural network (CNN) architectures currently produce state-of-the-art performance on a variety of image analysis tasks such as object detection and recognition. Most of deep learning research has so far focused on dealing with 1D, 2D, or 3D Euclidean-structured data such as acoustic signals, images, or videos. Recently, there has been an increasing interest in geometric deep learning, attempting to generalize deep learning methods to non-Euclidean structured data such as graphs and manifolds, with a variety of applications from the domains of network analysis, computational social science, or computer graphics. In this paper, we propose a unified framework allowing to generalize CNN architectures to non-Euclidean domains (graphs and manifolds) and learn local, stationary, and compositional task-specific features. We show that various non-Euclidean CNN methods previously proposed in the literature can be considered as particular instances of our framework. We test the proposed method on standard tasks from the realms of image-, graph-and 3D shape analysis and show that it consistently outperforms previous approaches.}, bibtype = {article}, author = {Monti, Federico and Boscaini, Davide and Masci, Jonathan and Rodolà, Emanuele and Svoboda, Jan and Bronstein, Michael M.}, doi = {10.1109/CVPR.2017.576}, journal = {Proceedings - 30th IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017} }
@article{ title = {A simple neural network module for relational reasoning}, type = {article}, year = {2017}, pages = {4968-4977}, volume = {2017-Decem}, id = {6308a55b-5aaf-3182-bf5d-6af502f206f5}, created = {2021-07-12T10:19:36.485Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T10:19:51.044Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {20ccb950-fef9-4ee1-800c-a60ba9f1df16}, private_publication = {false}, abstract = {Relational reasoning is a central component of generally intelligent behavior, but has proven difficult for neural networks to learn. In this paper we describe how to use Relation Networks (RNs) as a simple plug-and-play module to solve problems that fundamentally hinge on relational reasoning. We tested RN-augmented networks on three tasks: visual question answering using a challenging dataset called CLEVR, on which we achieve state-of-the-art, super-human performance; text-based question answering using the bAbI suite of tasks; and complex reasoning about dynamic physical systems. Then, using a curated dataset called Sort-of-CLEVR we show that powerful convolutional networks do not have a general capacity to solve relational questions, but can gain this capacity when augmented with RNs. Thus, by simply augmenting convolutions, LSTMs, and MLPs with RNs, we can remove computational burden from network components that are not well-suited to handle relational reasoning, reduce overall network complexity, and gain a general ability to reason about the relations between entities and their properties.}, bibtype = {article}, author = {Santoro, Adam and Raposo, David and Barrett, David G.T. and Malinowski, Mateusz and Pascanu, Razvan and Battaglia, Peter and Lillicrap, Timothy}, journal = {Advances in Neural Information Processing Systems}, number = {Nips} }
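A compact sketch of the relational module described in the abstract above, RN(O) = f_phi( sum over pairs (i, j) of g_theta(o_i, o_j) ); the MLP sizes are assumptions chosen only to make the pairwise structure visible.

import torch
import torch.nn as nn

class RelationNetwork(nn.Module):
    def __init__(self, obj_dim=8, hid=64, out_dim=10):
        super().__init__()
        self.g = nn.Sequential(nn.Linear(2 * obj_dim, hid), nn.ReLU(), nn.Linear(hid, hid))
        self.f = nn.Sequential(nn.Linear(hid, hid), nn.ReLU(), nn.Linear(hid, out_dim))

    def forward(self, objects):                            # objects: (batch, n, obj_dim)
        n = objects.shape[1]
        oi = objects.unsqueeze(2).expand(-1, -1, n, -1)    # (batch, n, n, d)
        oj = objects.unsqueeze(1).expand(-1, n, -1, -1)    # (batch, n, n, d)
        pairs = torch.cat([oi, oj], dim=-1)                # all ordered object pairs
        return self.f(self.g(pairs).sum(dim=(1, 2)))       # sum over pairs, then f_phi

print(RelationNetwork()(torch.randn(2, 5, 8)).shape)       # torch.Size([2, 10])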
@article{ title = {Semi-supervised classification with graph convolutional networks}, type = {article}, year = {2017}, pages = {1-14}, id = {4f64a519-c19b-3a44-9dbe-d4cce6b45de9}, created = {2021-07-12T10:19:36.588Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-21T13:25:20.636Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kipf2017}, folder_uuids = {dbd9a6d6-88f6-4a62-9acd-402fb473145a,20ccb950-fef9-4ee1-800c-a60ba9f1df16}, private_publication = {false}, abstract = {We present a scalable approach for semi-supervised learning on graph-structured data that is based on an efficient variant of convolutional neural networks which operate directly on graphs. We motivate the choice of our convolutional architecture via a localized first-order approximation of spectral graph convolutions. Our model scales linearly in the number of graph edges and learns hidden layer representations that encode both local graph structure and features of nodes. In a number of experiments on citation networks and on a knowledge graph dataset we demonstrate that our approach outperforms related methods by a significant margin.}, bibtype = {article}, author = {Kipf, Thomas N. and Welling, Max}, journal = {5th International Conference on Learning Representations, ICLR 2017 - Conference Track Proceedings} }
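A toy NumPy sketch of the first-order propagation rule behind this model, H' = sigma(D^-1/2 (A + I) D^-1/2 H W); the graph, feature sizes, and variable names are illustrative assumptions, not the authors' code.

import numpy as np

A = np.array([[0, 1, 0, 0],
              [1, 0, 1, 1],
              [0, 1, 0, 0],
              [0, 1, 0, 0]], dtype=float)    # toy adjacency matrix (4 nodes)
X = np.random.randn(4, 8)                    # node features
W = np.random.randn(8, 16)                   # layer weights

A_hat = A + np.eye(4)                        # add self-loops
d_inv_sqrt = 1.0 / np.sqrt(A_hat.sum(axis=1))
A_norm = A_hat * d_inv_sqrt[:, None] * d_inv_sqrt[None, :]   # D^-1/2 (A + I) D^-1/2

H = np.maximum(A_norm @ X @ W, 0.0)          # one graph-convolution layer with ReLU
print(H.shape)                               # (4, 16)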
@article{ title = {Deep sets}, type = {article}, year = {2017}, pages = {3392-3402}, volume = {2017-Decem}, id = {2651a36a-0f6e-3f03-a807-5e1e1ef23798}, created = {2021-07-12T10:19:36.610Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T10:19:47.524Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {20ccb950-fef9-4ee1-800c-a60ba9f1df16}, private_publication = {false}, abstract = {We study the problem of designing models for machine learning tasks defined on sets. In contrast to traditional approach of operating on fixed dimensional vectors, we consider objective functions defined on sets that are invariant to permutations. Such problems are widespread, ranging from estimation of population statistics [1], to anomaly detection in piezometer data of embankment dams [2], to cosmology [3, 4]. Our main theorem characterizes the permutation invariant functions and provides a family of functions to which any permutation invariant objective function must belong. This family of functions has a special structure which enables us to design a deep network architecture that can operate on sets and which can be deployed on a variety of scenarios including both unsupervised and supervised learning tasks. We also derive the necessary and sufficient conditions for permutation equivariance in deep models. We demonstrate the applicability of our method on population statistic estimation, point cloud classification, set expansion, and outlier detection.}, bibtype = {article}, author = {Zaheer, Manzil and Kottur, Satwik and Ravanbhakhsh, Siamak and Póczos, Barnabás and Salakhutdinov, Ruslan and Smola, Alexander J.}, journal = {Advances in Neural Information Processing Systems}, number = {ii} }
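A minimal sketch of the sum-decomposition structure the paper characterizes, f({x_1, ..., x_n}) = rho( sum_i phi(x_i) ); the small MLPs used for phi and rho are assumptions chosen only to show the permutation invariance.

import torch
import torch.nn as nn

class DeepSet(nn.Module):
    def __init__(self, in_dim=3, hid=64, out_dim=10):
        super().__init__()
        self.phi = nn.Sequential(nn.Linear(in_dim, hid), nn.ReLU(), nn.Linear(hid, hid))
        self.rho = nn.Sequential(nn.Linear(hid, hid), nn.ReLU(), nn.Linear(hid, out_dim))

    def forward(self, x):                                  # x: (batch, n_points, in_dim)
        return self.rho(self.phi(x).sum(dim=1))            # sum pooling gives permutation invariance

x = torch.randn(2, 100, 3)
model = DeepSet()
perm = torch.randperm(100)
print(torch.allclose(model(x), model(x[:, perm]), atol=1e-5))   # True (up to float rounding)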
@article{ title = {Tracking the world state with recurrent entity networks}, type = {article}, year = {2017}, pages = {1-15}, id = {dbbe0d5d-43fa-3c75-bcdd-59ecb1cd0548}, created = {2021-07-12T14:15:35.410Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:17:11.448Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {We introduce a new model, the Recurrent Entity Network (EntNet). It is equipped with a dynamic long-term memory which allows it to maintain and update a representation of the state of the world as it receives new data. For language understanding tasks, it can reason on-the-fly as it reads text, not just when it is required to answer a question or respond as is the case for a Memory Network (Sukhbaatar et al., 2015). Like a Neural Turing Machine or Differentiable Neural Computer (Graves et al., 2014; 2016) it maintains a fixed size memory and can learn to perform location and content-based read and write operations. However, unlike those models it has a simple parallel architecture in which several memory locations can be updated simultaneously. The EntNet sets a new state-of-the-art on the bAbI tasks, and is the first method to solve all the tasks in the 10k training examples setting. We also demonstrate that it can solve a reasoning task which requires a large number of supporting facts, which other methods are not able to solve, and can generalize past its training horizon. It can also be practically used on large scale datasets such as Children's Book Test, where it obtains competitive performance, reading the story in a single pass.}, bibtype = {article}, author = {Henaff, Mikael and Weston, Jason and Szlam, Arthur and Bordes, Antoine and LeCun, Yann}, journal = {5th International Conference on Learning Representations, ICLR 2017 - Conference Track Proceedings} }
@article{ title = {On large-batch training for deep learning: Generalization gap and sharp minima}, type = {article}, year = {2017}, pages = {1-16}, id = {b6d3e8bb-dfe9-3ae1-a518-e9224f5329f8}, created = {2021-07-12T14:15:36.047Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:16:58.353Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {The stochastic gradient descent (SGD) method and its variants are algorithms of choice for many Deep Learning tasks. These methods operate in a small-batch regime wherein a fraction of the training data, say 32-512 data points, is sampled to compute an approximation to the gradient. It has been observed in practice that when using a larger batch there is a degradation in the quality of the model, as measured by its ability to generalize. We investigate the cause for this generalization drop in the large-batch regime and present numerical evidence that supports the view that large-batch methods tend to converge to sharp minimizers of the training and testing functions-and as is well known, sharp minima lead to poorer generalization. In contrast, small-batch methods consistently converge to flat minimizers, and our experiments support a commonly held view that this is due to the inherent noise in the gradient estimation. We discuss several strategies to attempt to help large-batch methods eliminate this generalization gap.}, bibtype = {article}, author = {Keskar, Nitish Shirish and Nocedal, Jorge and Tang, Ping Tak Peter and Mudigere, Dheevatsa and Smelyanskiy, Mikhail}, journal = {5th International Conference on Learning Representations, ICLR 2017 - Conference Track Proceedings} }
@article{ title = {PointNet++: Deep hierarchical feature learning on point sets in a metric space}, type = {article}, year = {2017}, pages = {5100-5109}, volume = {2017-Decem}, id = {3af6e6fa-0ba5-30d8-b1a8-d5bd5d4fdf00}, created = {2021-07-21T12:55:19.304Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-01T06:54:09.967Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Qi2017}, folder_uuids = {dbd9a6d6-88f6-4a62-9acd-402fb473145a,4f36a0a5-b08a-4f70-b020-4daf83cb0507}, private_publication = {false}, abstract = {Few prior works study deep learning on point sets. PointNet [20] is a pioneer in this direction. However, by design PointNet does not capture local structures induced by the metric space points live in, limiting its ability to recognize fine-grained patterns and generalizability to complex scenes. In this work, we introduce a hierarchical neural network that applies PointNet recursively on a nested partitioning of the input point set. By exploiting metric space distances, our network is able to learn local features with increasing contextual scales. With further observation that point sets are usually sampled with varying densities, which results in greatly decreased performance for networks trained on uniform densities, we propose novel set learning layers to adaptively combine features from multiple scales. Experiments show that our network called PointNet++ is able to learn deep point set features efficiently and robustly. In particular, results significantly better than state-of-the-art have been obtained on challenging benchmarks of 3D point clouds.}, bibtype = {article}, author = {Qi, Charles R. and Yi, Li and Su, Hao and Guibas, Leonidas J.}, journal = {Advances in Neural Information Processing Systems} }
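An illustrative NumPy version of the farthest-point sampling step used to build the nested partitioning in this kind of hierarchical set abstraction; a plain sketch for clarity, not the authors' implementation.

import numpy as np

def farthest_point_sampling(points, k):
    """points: (n, 3) array; returns indices of k well-spread centroids."""
    chosen = [0]                                              # start from an arbitrary point
    dists = np.linalg.norm(points - points[0], axis=1)
    for _ in range(k - 1):
        idx = int(np.argmax(dists))                           # farthest from the chosen set so far
        chosen.append(idx)
        dists = np.minimum(dists, np.linalg.norm(points - points[idx], axis=1))
    return np.array(chosen)

pts = np.random.rand(1024, 3)
print(farthest_point_sampling(pts, 16))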
@article{ title = {Spectral graph convolutions for population-based disease prediction}, type = {article}, year = {2017}, pages = {177-185}, volume = {10435 LNCS}, id = {cde2b6b6-dd91-3968-ad67-6f9e754c1d36}, created = {2021-08-04T13:05:08.311Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-04T13:06:02.039Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Exploiting the wealth of imaging and non-imaging information for disease prediction tasks requires models capable of representing, at the same time, individual features as well as data associations between subjects from potentially large populations. Graphs provide a natural framework for such tasks, yet previous graph-based approaches focus on pairwise similarities without modelling the subjects’ individual characteristics and features. On the other hand, relying solely on subject-specific imaging feature vectors fails to model the interaction and similarity between subjects, which can reduce performance. In this paper, we introduce the novel concept of Graph Convolutional Networks (GCN) for brain analysis in populations, combining imaging and non-imaging data. We represent populations as a sparse graph where its vertices are associated with image-based feature vectors and the edges encode phenotypic information. This structure was used to train a GCN model on partially labelled graphs, aiming to infer the classes of unlabelled nodes from the node features and pairwise associations between subjects. We demonstrate the potential of the method on the challenging ADNI and ABIDE databases, as a proof of concept of the benefit from integrating contextual information in classification tasks. This has a clear impact on the quality of the predictions, leading to 69.5% accuracy for ABIDE (outperforming the current state of the art of 66.8%) and 77% for ADNI for prediction of MCI conversion, significantly outperforming standard linear classifiers where only individual features are considered.}, bibtype = {article}, author = {Parisot, Sarah and Ktena, Sofia Ira and Ferrante, Enzo and Lee, Matthew and Moreno, Ricardo Guerrerro and Glocker, Ben and Rueckert, Daniel}, doi = {10.1007/978-3-319-66179-7_21}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, number = {319456} }
@article{ title = {Geometric Deep Learning: Going beyond Euclidean data}, type = {article}, year = {2017}, pages = {18-42}, volume = {34}, publisher = {IEEE}, id = {90008209-cf76-3161-b955-64eacb7e62ba}, created = {2021-08-18T13:14:09.644Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-21T13:25:20.666Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Bronstein2017}, folder_uuids = {dbd9a6d6-88f6-4a62-9acd-402fb473145a}, private_publication = {false}, abstract = {Many scientific fields study data with an underlying structure that is non-Euclidean. Some examples include social networks in computational social sciences, sensor networks in communications, functional networks in brain imaging, regulatory networks in genetics, and meshed surfaces in computer graphics. In many applications, such geometric data are large and complex (in the case of social networks, on the scale of billions) and are natural targets for machine-learning techniques. In particular, we would like to use deep neural networks, which have recently proven to be powerful tools for a broad range of problems from computer vision, natural-language processing, and audio analysis. However, these tools have been most successful on data with an underlying Euclidean or grid-like structure and in cases where the invariances of these structures are built into networks used to model them.}, bibtype = {article}, author = {Bronstein, Michael M. and Bruna, Joan and Lecun, Yann and Szlam, Arthur and Vandergheynst, Pierre}, doi = {10.1109/MSP.2017.2693418}, journal = {IEEE Signal Processing Magazine}, number = {4} }
@article{ title = {Attention Is All You Need}, type = {article}, year = {2017}, pages = {5999-6009}, volume = {2017-December}, websites = {https://arxiv.org/abs/1706.03762v5}, month = {6}, publisher = {Neural information processing systems foundation}, day = {12}, id = {23383771-e6f2-3c3c-9dce-d8cb4fcdf66f}, created = {2021-08-24T07:12:27.272Z}, accessed = {2021-08-24}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-10-06T08:07:38.040Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {70eb910f-9399-46d8-a4d0-ade5435237b7,5cd4d7ce-c2fb-4e91-ab80-35deeb123df5}, private_publication = {false}, abstract = {The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.}, bibtype = {article}, author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, Lukasz and Polosukhin, Illia}, journal = {Advances in Neural Information Processing Systems} }
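A small NumPy sketch of scaled dot-product attention, softmax(Q K^T / sqrt(d_k)) V, the core operation the abstract refers to; the toy shapes are assumptions.

import numpy as np

def scaled_dot_product_attention(Q, K, V):
    d_k = Q.shape[-1]
    scores = Q @ K.T / np.sqrt(d_k)                          # (n_q, n_k) similarity scores
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)           # row-wise softmax
    return weights @ V                                       # weighted sum of the values

Q = np.random.randn(4, 8); K = np.random.randn(6, 8); V = np.random.randn(6, 16)
print(scaled_dot_product_attention(Q, K, V).shape)           # (4, 16)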
@article{ title = {Comparison of fine-tuning and extension strategies for deep convolutional neural networks}, type = {article}, year = {2017}, keywords = {Concept detection,Deep learning,Visual analysis}, pages = {102-114}, volume = {10132 LNCS}, id = {ad371e03-399c-3caf-81b5-dc2565d5d840}, created = {2021-08-28T19:32:57.423Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-28T19:33:14.933Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {In this study we compare three different fine-tuning strategies in order to investigate the best way to transfer the parameters of popular deep convolutional neural networks that were trained for a visual annotation task on one dataset, to a new, considerably different dataset. We focus on the concept-based image/video annotation problem and use ImageNet as the source dataset, while the TRECVID SIN 2013 and PAS-CAL VOC-2012 classification datasets are used as the target datasets. A large set of experiments examines the effectiveness of three fine-tuning strategies on each of three different pre-trained DCNNs and each target dataset. The reported results give rise to guidelines for effectively fine-tuning a DCNN for concept-based visual annotation.}, bibtype = {article}, author = {Pittaras, Nikiforos and Markatopoulou, Foteini and Mezaris, Vasileios and Patras, Ioannis}, doi = {10.1007/978-3-319-51811-4_9}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {Dynamic Edge-Conditioned Filters in Convolutional Neural Networks on Graphs}, type = {article}, year = {2017}, pages = {3693-3702}, websites = {http://openaccess.thecvf.com/content_cvpr_2017/papers/Simonovsky_Dynamic_Edge-Conditioned_Filters_CVPR_2017_paper.pdf}, id = {8bd7b731-6765-3bb4-853e-5584cab569cf}, created = {2021-08-29T21:49:16.609Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:20.294Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {07e07de9-bcac-4934-a82b-d0aff540e56d}, private_publication = {false}, abstract = {A number of problems can be formulated as prediction on graph-structured data. In this work, we generalize the convolution operator from regular grids to arbitrary graphs while avoiding the spectral domain, which allows us to handle graphs of varying size and connectivity. To move beyond a simple diffusion, filter weights are conditioned on the specific edge labels in the neighborhood of a vertex. Together with the proper choice of graph coarsening, we explore constructing deep neural networks for graph classification. In particular, we demonstrate the generality of our formulation in point cloud classification, where we set the new state of the art, and on a graph classification dataset, where we outperform other deep learning approaches.}, bibtype = {article}, author = {Simonovsky, Martin and Komodakis, Nikos}, journal = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)} }
@article{ title = {SyncSpecCNN: Synchronized spectral CNN for 3D shape segmentation}, type = {article}, year = {2017}, pages = {6584-6592}, volume = {2017-Janua}, id = {8f67796d-36f4-343f-9f36-38a71801d156}, created = {2021-08-29T21:49:16.610Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:20.987Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {In this paper, we study the problem of semantic annotation on 3D models that are represented as shape graphs. A functional view is taken to represent localized information on graphs, so that annotations such as part segment or keypoint are nothing but 0-1 indicator vertex functions. Compared with images that are 2D grids, shape graphs are irregular and non-isomorphic data structures. To enable the prediction of vertex functions on them by convolutional neural networks, we resort to spectral CNN method that enables weight sharing by parametrizing kernels in the spectral domain spanned by graph Laplacian eigenbases. Under this setting, our network, named SyncSpecCNN, strives to overcome two key challenges: how to share coefficients and conduct multi-scale analysis in different parts of the graph for a single shape, and how to share information across related but different shapes that may be represented by very different graphs. Towards these goals, we introduce a spectral parametrization of dilated convolutional kernels and a spectral transformer network. Experimentally we tested SyncSpecCNN on various tasks, including 3D shape part segmentation and keypoint prediction. State-of-the-art performance has been achieved on all benchmark datasets.}, bibtype = {article}, author = {Yi, Li and Su, Hao and Guo, Xingwen and Guibas, Leonidas}, doi = {10.1109/CVPR.2017.697}, journal = {Proceedings - 30th IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017} }
@article{ title = {Contour-Enhanced Resampling of 3D Point Clouds via Graphs}, type = {article}, year = {2017}, pages = {2941-2945}, id = {d1522a73-260c-391e-a7fb-9a3d07cce656}, created = {2021-08-29T22:27:22.309Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-10T13:41:14.967Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {}, journal = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP) 2017} }
@article{ title = {Augmented variational autoencoders for collaborative filtering with auxiliary information}, type = {article}, year = {2017}, keywords = {Collaborative filtering,Deep learning,Generative adversarial networks,Recommender systems,Variational autoencoders}, pages = {1139-1148}, volume = {Part F1318}, id = {099054e4-f921-3450-97b2-11a6d6581df3}, created = {2021-09-09T14:35:21.233Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.878Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Lee2017}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, abstract = {Recommender systems offer critical services in the age of mass information. A good recommender system selects a certain item for a specific user by recognizing why the user might like the item This awareness implies that the system should model the background of the items and the users .This background modeling for recommendation is tackled through the various models of collaborative filtering with auxiliary information This paper presents variational approaches for collaborative filtering to deal with auxiliary information The proposed methods encompass variational autoencoders through augmenting structures to model the auxiliary information and to model the implicit user feedback This augmentation includes the ladder network and the generative adversarial network to extract the low-dimensional representations influenced by the auxiliary information These two augmentations are the first trial in the venue of the variational autoencoders, and we demonstrate their significant improvement on the performances in the applications of the collaborative filtering.}, bibtype = {article}, author = {Lee, Wonsung and Song, Kyungwoo and Moon, Il Chul}, doi = {10.1145/3132847.3132972}, journal = {International Conference on Information and Knowledge Management, Proceedings} }
@article{ title = {Fully perceptual-based 3D spatial sound individualization with an adaptive variational autoencoder}, type = {article}, year = {2017}, keywords = {3d spatial sound rendering,Deep neural network,Optimization,Sound design in a virtual environment}, volume = {36}, id = {34804194-dfc2-3ac2-b839-6641c7c87094}, created = {2021-09-09T14:35:21.236Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:10.164Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Yamamoto2017}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {To realize 3D spatial sound rendering with a two-channel headphone, one needs head-related transfer functions (HRTFs) tailored for a specific user. However, measurement of HRTFs requires a tedious and expensive procedure. To address this, we propose a fully perceptual-based HRTF fitting method for individual users using machine learning techniques. The user only needs to answer pairwise comparisons of test signals presented by the system during calibration. This reduces the efforts necessary for the user to obtain individualized HRTFs. Technically, we present a novel adaptive variational AutoEncoder with a convolutional neural network. In the training, this AutoEncoder analyzes publicly available HRTFs dataset and identifies factors that depend on the individuality of users in a nonlinear space. In calibration, the AutoEncoder generates high-quality HRTFs fitted to a specific user by blending the factors. We validate the feasibilities of our method through several quantitative experiments and a user study.}, bibtype = {article}, author = {Yamamoto, Kazuhiko and Igarashi, Takeo}, doi = {10.1145/3130800.3130838}, journal = {ACM Transactions on Graphics}, number = {6} }
@article{ title = {Deformable Convolutional Networks}, type = {article}, year = {2017}, pages = {764-773}, volume = {2017-Octob}, id = {7245b395-e44e-3f87-9c82-da8191ec86d7}, created = {2021-10-14T06:49:26.004Z}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-14T06:49:32.317Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d54ba66b-a8cf-41de-8e2d-c3256f322e07}, private_publication = {false}, abstract = {Convolutional neural networks (CNNs) are inherently limited to model geometric transformations due to the fixed geometric structures in their building modules. In this work, we introduce two new modules to enhance the transformation modeling capability of CNNs, namely, deformable convolution and deformable RoI pooling. Both are based on the idea of augmenting the spatial sampling locations in the modules with additional offsets and learning the offsets from the target tasks, without additional supervision. The new modules can readily replace their plain counterparts in existing CNNs and can be easily trained end-to-end by standard back-propagation, giving rise to deformable convolutional networks. Extensive experiments validate the performance of our approach. For the first time, we show that learning dense spatial transformation in deep CNNs is effective for sophisticated vision tasks such as object detection and semantic segmentation. The code is released at https://github.com/msracver/Deformable-ConvNets.}, bibtype = {article}, author = {Dai, Jifeng and Qi, Haozhi and Xiong, Yuwen and Li, Yi and Zhang, Guodong and Hu, Han and Wei, Yichen}, doi = {10.1109/ICCV.2017.89}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {Escape from Cells: Deep Kd-Networks for the Recognition of 3D Point Cloud Models}, type = {article}, year = {2017}, pages = {863-872}, volume = {2017-Octob}, id = {77b58ce4-a232-3174-b882-6ed4f6938639}, created = {2021-10-26T08:17:02.624Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.577Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Klokov2017}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,a6db5ca6-7f95-48a4-bc40-9e41eea78434,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {We present a new deep learning architecture (called Kdnetwork) that is designed for 3D model recognition tasks and works with unstructured point clouds. The new architecture performs multiplicative transformations and shares parameters of these transformations according to the subdivisions of the point clouds imposed onto them by kdtrees. Unlike the currently dominant convolutional architectures that usually require rasterization on uniform twodimensional or three-dimensional grids, Kd-networks do not rely on such grids in any way and therefore avoid poor scaling behavior. In a series of experiments with popular shape recognition benchmarks, Kd-networks demonstrate competitive performance in a number of shape recognition tasks such as shape classification, shape retrieval and shape part segmentation.}, bibtype = {article}, author = {Klokov, Roman and Lempitsky, Victor}, doi = {10.1109/ICCV.2017.99}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
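A rough sketch of the balanced kd-tree subdivision that this architecture computes over the input point cloud before sharing transformation parameters along the tree; the split rule used here (median along the widest axis) is an illustrative assumption, and the network layers built on top of the tree are omitted.

import numpy as np

def build_kdtree(points, max_leaf=1):
    """points: (n, 3) array; returns a nested dict describing the balanced subdivision."""
    if len(points) <= max_leaf:
        return {"leaf": points}
    axis = int(np.argmax(points.max(axis=0) - points.min(axis=0)))   # split along the widest axis
    order = np.argsort(points[:, axis])
    mid = len(points) // 2
    return {"axis": axis,
            "left": build_kdtree(points[order[:mid]], max_leaf),
            "right": build_kdtree(points[order[mid:]], max_leaf)}

tree = build_kdtree(np.random.rand(16, 3))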
@article{ title = {A Point Set Generation Network for 3D Object Reconstruction from a Single Image}, type = {article}, year = {2017}, pages = {605-613}, id = {714c9627-25a2-3ec0-ba9d-2e3607c09f01}, created = {2021-11-01T10:14:38.921Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T17:25:38.901Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Fan2017}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,1853f94b-7af1-40fa-b068-4758e9a02bc4,a6db5ca6-7f95-48a4-bc40-9e41eea78434,6653d553-53a7-432f-aedf-c70d99c1c5a5,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, bibtype = {article}, author = {Fan, Haoqiang and Su, Hao and Guibas, Leonidas J.}, journal = {IEEE Conference on Computer Vision and Pattern Recognition} }
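An illustrative NumPy version of the symmetric Chamfer distance commonly used as the training loss for point-set generation networks such as this one; written for clarity rather than efficiency.

import numpy as np

def chamfer_distance(P, Q):
    """P: (n, 3), Q: (m, 3); average nearest-neighbour distance in both directions."""
    d = np.linalg.norm(P[:, None, :] - Q[None, :, :], axis=-1)   # (n, m) pairwise distances
    return d.min(axis=1).mean() + d.min(axis=0).mean()

P = np.random.rand(128, 3)
Q = np.random.rand(256, 3)
print(chamfer_distance(P, Q))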
@article{ title = {Multigrid neural architectures}, type = {article}, year = {2017}, pages = {4067-4075}, volume = {2017-Janua}, id = {e5b96ec6-45e1-3e32-b4f3-26bde454b2e0}, created = {2021-11-01T10:14:38.922Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.668Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Ke2017}, folder_uuids = {cd02f564-0123-4236-a320-b339927f085a}, private_publication = {false}, abstract = {We propose a multigrid extension of convolutional neural networks (CNNs). Rather than manipulating representations living on a single spatial grid, our network layers operate across scale space, on a pyramid of grids. They consume multigrid inputs and produce multigrid outputs; convolutional filters themselves have both within-scale and cross-scale extent. This aspect is distinct from simple multiscale designs, which only process the input at different scales. Viewed in terms of information flow, a multigrid network passes messages across a spatial pyramid. As a consequence, receptive field size grows exponentially with depth, facilitating rapid integration of context. Most critically, multigrid structure enables networks to learn internal attention and dynamic routing mechanisms, and use them to accomplish tasks on which modern CNNs fail. Experiments demonstrate wide-ranging performance advantages of multigrid. On CIFAR and ImageNet classification tasks, flipping from a single grid to multigrid within the standard CNN paradigm improves accuracy, while being compute and parameter efficient. Multigrid is independent of other architectural choices; we show synergy in combination with residual connections. Multigrid yields dramatic improvement on a synthetic semantic segmentation dataset. Most strikingly, relatively shallow multigrid networks can learn to directly perform spatial transformation tasks, where, in contrast, current CNNs fail. Together, our results suggest that continuous evolution of features on a multigrid pyramid is a more powerful alternative to existing CNN designs on a flat grid.}, bibtype = {article}, author = {Ke, Tsung Wei and Maire, Michael and Yu, Stella X.}, doi = {10.1109/CVPR.2017.433}, journal = {Proceedings - 30th IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017} }
@article{ title = {β-VAE: Learning Basic Visual Concepts with a Constrained Variational Framework}, type = {article}, year = {2017}, pages = {1-22}, id = {3ce3ad5b-5eb0-3da5-9c1f-9c0bbf9b989b}, created = {2022-01-03T10:32:08.904Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-05T05:35:08.442Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Higgins2017}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, bibtype = {article}, author = {Higgins, Irina and Matthey, Loic and Pal, Arka and Burgess, Christopher and Glorot, Xavier and Botvinick, Matthew and Mohamed, Shakir and Lerchner, Alexander} }
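A minimal sketch of the β-VAE training objective: a reconstruction term plus a KL term scaled by β > 1, which the paper uses to encourage disentangled latents. The MSE reconstruction term and the toy shapes in the usage line are illustrative assumptions.

import torch
import torch.nn.functional as F

def beta_vae_loss(x, x_recon, mu, logvar, beta=4.0):
    recon = F.mse_loss(x_recon, x, reduction="sum")                  # reconstruction term (up to constants)
    kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())     # KL(q(z|x) || N(0, I))
    return recon + beta * kl

mu, logvar = torch.zeros(8, 10), torch.zeros(8, 10)
x = torch.rand(8, 784)
print(beta_vae_loss(x, x.clone(), mu, logvar))   # tensor(0.) when reconstruction is perfect and q equals the prior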
@article{ title = {Understanding disentangling in β-VAE}, type = {article}, year = {2017}, websites = {http://arxiv.org/abs/1804.03599}, id = {c4185ab3-a4dd-3907-be2f-1e63537d3047}, created = {2022-01-03T10:32:09.004Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.224Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Burgess2017}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, bibtype = {article}, author = {Burgess, Christopher P and Higgins, Irina and Pal, Arka and Matthey, Loic and Watters, Nick and Desjardins, Guillaume and Lerchner, Alexander}, number = {Nips} }
@article{ title = {SurfNet: Generating 3D shape surfaces using deep residual networks}, type = {article}, year = {2017}, pages = {791-800}, volume = {2017-Janua}, id = {a2e2ff2c-6fc6-3919-bd7c-6dfa28b6dc29}, created = {2022-01-19T09:08:51.184Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.365Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Sinha2017}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a}, private_publication = {false}, abstract = {3D shape models are naturally parameterized using vertices and faces, i.e., composed of polygons forming a surface. However, current 3D learning paradigms for predictive and generative tasks using convolutional neural networks focus on a voxelized representation of the object. Lifting convolution operators from the traditional 2D to 3D results in high computational overhead with little additional benefit as most of the geometry information is contained on the surface boundary. Here we study the problem of directly generating the 3D shape surface of rigid and non-rigid shapes using deep convolutional neural networks. We develop a procedure to create consistent ‘geometry images’ representing the shape surface of a category of 3D objects. We then use this consistent representation for category-specific shape surface generation from a parametric representation or an image by developing novel extensions of deep residual networks for the task of geometry image generation. Our experiments indicate that our network learns a meaningful representation of shape surfaces allowing it to interpolate between shape orientations and poses, invent new shape surfaces and reconstruct 3D shape surfaces from previously unseen images. Our code is available at https://github.com/sinhayan/surfnet.}, bibtype = {article}, author = {Sinha, Ayan and Unmesh, Asim and Huang, Qixing and Ramani, Karthik}, doi = {10.1109/CVPR.2017.91}, journal = {Proceedings - 30th IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017} }
@article{ title = {Deep Learning}, type = {article}, year = {2017}, pages = {800}, publisher = {MIT Press}, id = {158a716a-ec51-3952-86e9-f676641cc3a9}, created = {2022-01-19T09:08:51.297Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.230Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Goodfellow2017}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a}, private_publication = {false}, bibtype = {article}, author = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron} }
@article{ title = {Adaptive geometry images for remeshing}, type = {article}, year = {2017}, volume = {2017}, id = {890ec0e3-8d6d-3aa8-b30c-cd4774e16eea}, created = {2022-01-25T12:51:06.346Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.682Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Shi2017}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a}, private_publication = {false}, abstract = {Geometry images are a kind of completely regular remeshing methods for mesh representation. Traditional geometry images have difficulties in achieving optimal reconstruction errors and preserving manually selected geometric details, due to the limitations of parametrization methods. To solve two issues, we propose two adaptive geometry images for remeshing triangular meshes. The first scheme produces geometry images with the minimum Hausdorff error by finding the optimization direction for sampling points based on the Hausdorff distance between the original mesh and the reconstructed mesh. The second scheme produces geometry images with higher reconstruction precision over themanually selected region-of-interest of the inputmesh, by increasing the number of sampling points over the region-of-interest. Experimental results show that both schemes give promising results compared with traditional parametrization-based geometry images.}, bibtype = {article}, author = {Shi, Lina and Kong, Dehui and Wang, Shaofan and Yin, Baocai}, doi = {10.1155/2017/2724184}, journal = {International Journal of Digital Multimedia Broadcasting} }
@article{ title = {B-SHOT: a binary 3D feature descriptor for fast Keypoint matching on 3D point clouds}, type = {article}, year = {2017}, keywords = {3D Binary feature descriptors,3D keypoint matching,Binarization,Point cloud registration}, pages = {1501-1520}, volume = {41}, publisher = {IEEE}, id = {33268443-beb7-376c-a49f-c0f486ce5d20}, created = {2022-02-08T07:56:57.276Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.019Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Prakhya2017}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a}, private_publication = {false}, abstract = {We present the first attempt in creating a binary 3D feature descriptor for fast and efficient keypoint matching on 3D point clouds. Specifically, we propose a binarization technique and apply it on the state-of-the-art 3D feature descriptor, SHOT (Salti et al., Comput Vision Image Underst 125:251–264, 2014) to create the first binary 3D feature descriptor, which we call B-SHOT. B-SHOT requires 32 times lesser memory for its representation while being six times faster in feature descriptor matching, when compared to the SHOT feature descriptor. Next, we propose a robust evaluation metric, specifically for 3D feature descriptors. A comprehensive evaluation on standard benchmarks reveals that B-SHOT offers comparable keypoint matching performance to that of the state-of-the-art real valued 3D feature descriptors, albeit at dramatically lower computational and memory costs.}, bibtype = {article}, author = {Prakhya, Sai Manoj and Liu, Bingbing and Lin, Weisi and Jakhetiya, Vinit and Guntuku, Sharath Chandra}, doi = {10.1007/s10514-016-9612-y}, journal = {Autonomous Robots}, number = {7} }
@article{ title = {Graph-based Point Cloud Compression}, type = {article}, year = {2017}, id = {83c15a1a-8635-372d-a22b-357225c6687d}, created = {2022-03-02T07:02:50.234Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-02T07:03:01.655Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {84eaadea-8864-4baf-9a7a-b5a2f5b96449}, private_publication = {false}, bibtype = {article}, author = {Jorge, Paulo and Oliveira, Robert De} }
@article{ title = {The shape variational autoencoder: A deep generative model of part-segmented 3D objects}, type = {article}, year = {2017}, keywords = {Categories and Subject Descriptors (according to A,I.3.5 [Computer Graphics]: Computational Geometry}, pages = {1-12}, volume = {36}, id = {beed866b-8e8c-3683-af09-b618012f2321}, created = {2022-03-23T06:17:59.346Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:12.073Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Nash2017}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {We introduce a generative model of part-segmented 3D objects: the shape variational auto-encoder (ShapeVAE). The ShapeVAE describes a joint distribution over the existence of object parts, the locations of a dense set of surface points, and over surface normals associated with these points. Our model makes use of a deep encoder-decoder architecture that leverages the part-decomposability of 3D objects to embed high-dimensional shape representations and sample novel instances. Given an input collection of part-segmented objects with dense point correspondences the ShapeVAE is capable of synthesizing novel, realistic shapes, and by performing conditional inference enables imputation of missing parts or surface normals. In addition, by generating both points and surface normals, our model allows for the use of powerful surface-reconstruction methods for mesh synthesis. We provide a quantitative evaluation of the ShapeVAE on shape-completion and test-set log-likelihood tasks and demonstrate that the model performs favourably against strong baselines. We demonstrate qualitatively that the ShapeVAE produces plausible shape samples, and that it captures a semantically meaningful shape-embedding. In addition we show that the ShapeVAE facilitates mesh reconstruction by sampling consistent surface normals.}, bibtype = {article}, author = {Nash, C. and Williams, C. K.I.}, doi = {10.1111/cgf.13240}, journal = {Computer Graphics Forum}, number = {5} }
@article{ title = {3D Object Reconstruction from a Single Depth View with Adversarial Learning}, type = {article}, year = {2017}, pages = {679-688}, volume = {2018-Janua}, id = {2fc00067-309d-3c28-b870-cb18467edb10}, created = {2022-03-25T07:01:04.927Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.376Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Yang2017}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, abstract = {In this paper, we propose a novel 3D-RecGAN approach, which reconstructs the complete 3D structure of a given object from a single arbitrary depth view using generative adversarial networks. Unlike the existing work which typically requires multiple views of the same object or class labels to recover the full 3D geometry, the proposed 3D-RecGAN only takes the voxel grid representation of a depth view of the object as input, and is able to generate the complete 3D occupancy grid by filling in the occluded/missing regions. The key idea is to combine the generative capabilities of autoencoders and the conditional Generative Adversarial Networks (GAN) framework, to infer accurate and fine-grained 3D structures of objects in high-dimensional voxel space. Extensive experiments on large synthetic datasets show that the proposed 3D-RecGAN significantly outperforms the state of the art in single view 3D object reconstruction, and is able to reconstruct unseen types of objects. Our code and data are available at: https://github.com/Yang7879/3D-RecGAN.}, bibtype = {article}, author = {Yang, Bo and Wen, Hongkai and Wang, Sen and Clark, Ronald and Markham, Andrew and Trigoni, Niki}, doi = {10.1109/ICCVW.2017.86}, journal = {Proceedings - 2017 IEEE International Conference on Computer Vision Workshops, ICCVW 2017} }
@inproceedings{ title = {Improved Training of Wasserstein GANs}, type = {inproceedings}, year = {2017}, volume = {30}, websites = {https://proceedings.neurips.cc/paper/2017/hash/892c3b1c6dccd52936e27cbd0ff683d6-Abstract.html}, publisher = {Curran Associates, Inc.}, id = {66dff937-d760-3be0-a2c1-320669125985}, created = {2022-03-28T09:45:00.683Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:01:02.579Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {gulrajaniImprovedTrainingWasserstein2017}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Gulrajani, Ishaan and Ahmed, Faruk and Arjovsky, Martin and Dumoulin, Vincent and Courville, Aaron C}, booktitle = {Advances in Neural Information Processing Systems} }
@article{ title = {FractalNet: Ultra-Deep Neural Networks without Residuals}, type = {article}, year = {2017}, keywords = {Computer Science - Computer Vision and Pattern Rec}, websites = {http://arxiv.org/abs/1605.07648}, month = {5}, id = {57c38d62-04e4-3b30-93fc-08691526dfb9}, created = {2022-03-28T09:45:01.253Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:01:36.889Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {larssonFractalNetUltraDeepNeural2017}, source_type = {article}, short_title = {FractalNet}, notes = {arXiv: 1605.07648}, private_publication = {false}, abstract = {We introduce a design strategy for neural network macro-architecture based on self-similarity. Repeated application of a simple expansion rule generates deep networks whose structural layouts are precisely truncated fractals. These networks contain interacting subpaths of different lengths, but do not include any pass-through or residual connections; every internal signal is transformed by a filter and nonlinearity before being seen by subsequent layers. In experiments, fractal networks match the excellent performance of standard residual networks on both CIFAR and ImageNet classification tasks, thereby demonstrating that residual representations may not be fundamental to the success of extremely deep convolutional neural networks. Rather, the key may be the ability to transition, during training, from effectively shallow to deep. We note similarities with student-teacher behavior and develop drop-path, a natural extension of dropout, to regularize co-adaptation of subpaths in fractal architectures. Such regularization allows extraction of high-performance fixed-depth subnetworks. Additionally, fractal networks exhibit an anytime property: shallow subnetworks provide a quick answer, while deeper subnetworks, with higher latency, provide a more accurate answer.}, bibtype = {article}, author = {Larsson, Gustav and Maire, Michael and Shakhnarovich, Gregory}, journal = {arXiv:1605.07648 [cs]} }
@inproceedings{ title = {Regressing Robust and Discriminative 3D Morphable Models With a Very Deep Neural Network}, type = {inproceedings}, year = {2017}, pages = {5163-5172}, websites = {https://openaccess.thecvf.com/content_cvpr_2017/html/Tran_Regressing_Robust_and_CVPR_2017_paper.html}, id = {f73e9f5c-673c-3cf8-b377-5728e5ee3182}, created = {2022-03-28T09:45:01.475Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:02:18.067Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {tuantranRegressingRobustDiscriminative2017}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Tuan Tran, Anh and Hassner, Tal and Masi, Iacopo and Medioni, Gerard} }
@inproceedings{ title = {Deep Feature Consistent Variational Autoencoder}, type = {inproceedings}, year = {2017}, keywords = {Correlation,Decoding,Face,Feature extraction,Image reconstruction,Loss measurement,Training}, pages = {1133-1141}, month = {3}, id = {1a2e6c52-8abc-3b6f-8922-50c7db47507c}, created = {2022-03-28T09:45:01.609Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:02:10.350Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {houDeepFeatureConsistent2017}, source_type = {inproceedings}, private_publication = {false}, abstract = {We present a novel method for constructing Variational Autoencoder (VAE). Instead of using pixel-by-pixel loss, we enforce deep feature consistency between the input and the output of a VAE, which ensures the VAE's output to preserve the spatial correlation characteristics of the input, thus leading the output to have a more natural visual appearance and better perceptual quality. Based on recent deep learning works such as style transfer, we employ a pre-trained deep convolutional neural network (CNN) and use its hidden features to define a feature perceptual loss for VAE training. Evaluated on the CelebA face dataset, we show that our model produces better results than other methods in the literature. We also show that our method can produce latent vectors that can capture the semantic information of face expressions and can be used to achieve state-of-the-art performance in facial attribute prediction.}, bibtype = {inproceedings}, author = {Hou, Xianxu and Shen, Linlin and Sun, Ke and Qiu, Guoping}, doi = {10.1109/WACV.2017.131}, booktitle = {2017 IEEE Winter Conference on Applications of Computer Vision (WACV)} }
@article{ title = {Adam: A Method for Stochastic Optimization}, type = {article}, year = {2017}, keywords = {Computer Science - Machine Learning}, websites = {http://arxiv.org/abs/1412.6980}, month = {1}, id = {c8029bae-44ee-3b42-b243-8f353e323584}, created = {2022-03-28T09:45:01.812Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:02:35.181Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {kingmaAdamMethodStochastic2017}, source_type = {article}, short_title = {Adam}, notes = {arXiv: 1412.6980}, private_publication = {false}, abstract = {We introduce Adam, an algorithm for first-order gradient-based optimization of stochastic objective functions, based on adaptive estimates of lower-order moments. The method is straightforward to implement, is computationally efficient, has little memory requirements, is invariant to diagonal rescaling of the gradients, and is well suited for problems that are large in terms of data and/or parameters. The method is also appropriate for non-stationary objectives and problems with very noisy and/or sparse gradients. The hyper-parameters have intuitive interpretations and typically require little tuning. Some connections to related algorithms, on which Adam was inspired, are discussed. We also analyze the theoretical convergence properties of the algorithm and provide a regret bound on the convergence rate that is comparable to the best known results under the online convex optimization framework. Empirical results demonstrate that Adam works well in practice and compares favorably to other stochastic optimization methods. Finally, we discuss AdaMax, a variant of Adam based on the infinity norm.}, bibtype = {article}, author = {Kingma, Diederik P and Ba, Jimmy}, journal = {arXiv:1412.6980 [cs]} }
@inproceedings{ title = {3D Object Reconstruction from a Single Depth View with Adversarial Learning}, type = {inproceedings}, year = {2017}, pages = {679-688}, websites = {https://openaccess.thecvf.com/content_ICCV_2017_workshops/w13/html/Yang_3D_Object_Reconstruction_ICCV_2017_paper.html}, id = {46cf61f2-fa0d-32c1-aa96-74b2b34ae3dd}, created = {2022-03-28T09:45:02.100Z}, accessed = {2021-09-29}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:02:50.701Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {yang3DObjectReconstruction2017}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Yang, Bo and Wen, Hongkai and Wang, Sen and Clark, Ronald and Markham, Andrew and Trigoni, Niki} }
@inproceedings{ title = {Aff-Wild: Valence and Arousal 'In-The-Wild' Challenge}, type = {inproceedings}, year = {2017}, pages = {34-41}, websites = {https://openaccess.thecvf.com/content_cvpr_2017_workshops/w33/html/Zafeiriou_Aff-Wild_Valence_and_CVPR_2017_paper.html}, id = {93419c7a-7dfc-3783-b160-00849f0c801e}, created = {2022-03-28T09:45:02.439Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:04:02.736Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {zafeiriouAffWildValenceArousal2017}, source_type = {inproceedings}, short_title = {Aff-Wild}, private_publication = {false}, bibtype = {inproceedings}, author = {Zafeiriou, Stefanos and Kollias, Dimitrios and Nicolaou, Mihalis A and Papaioannou, Athanasios and Zhao, Guoying and Kotsia, Irene} }
@inproceedings{ title = {ScanNet: Richly-Annotated 3D Reconstructions of Indoor Scenes}, type = {inproceedings}, year = {2017}, pages = {5828-5839}, websites = {https://openaccess.thecvf.com/content_cvpr_2017/html/Dai_ScanNet_Richly-Annotated_3D_CVPR_2017_paper.html}, id = {e9607a6b-52b2-3106-b684-30cf35513709}, created = {2022-03-28T09:45:02.630Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:03:40.022Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {daiScanNetRichlyAnnotated3D2017}, source_type = {inproceedings}, short_title = {ScanNet}, private_publication = {false}, bibtype = {inproceedings}, author = {Dai, Angela and Chang, Angel X and Savva, Manolis and Halber, Maciej and Funkhouser, Thomas and Niessner, Matthias} }
@inproceedings{ title = {Learning to Estimate 3D Hand Pose From Single RGB Images}, type = {inproceedings}, year = {2017}, pages = {4903-4911}, websites = {https://openaccess.thecvf.com/content_iccv_2017/html/Zimmermann_Learning_to_Estimate_ICCV_2017_paper.html}, id = {2bf057de-6ca5-356d-8521-cdfe38d3fbb2}, created = {2022-03-28T09:45:02.765Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:04:18.287Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {zimmermannLearningEstimate3D2017}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Zimmermann, Christian and Brox, Thomas} }
@inproceedings{ title = {Unsupervised Anomaly Detection with Generative Adversarial Networks to Guide Marker Discovery}, type = {inproceedings}, year = {2017}, keywords = {Anomaly Detection,Image Patch,Latent Space,Optical Coherence Tomography,Query Image}, pages = {146-157}, publisher = {Springer International Publishing}, city = {Cham}, series = {Lecture Notes in Computer Science}, id = {eb8a47e0-b67e-3944-acb4-239483911c67}, created = {2022-03-28T09:45:03.909Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:06:29.186Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {schleglUnsupervisedAnomalyDetection2017}, source_type = {inproceedings}, private_publication = {false}, abstract = {Obtaining models that capture imaging markers relevant for disease progression and treatment monitoring is challenging. Models are typically based on large amounts of data with annotated examples of known markers aiming at automating detection. High annotation effort and the limitation to a vocabulary of known markers limit the power of such approaches. Here, we perform unsupervised learning to identify anomalies in imaging data as candidates for markers. We propose AnoGAN, a deep convolutional generative adversarial network to learn a manifold of normal anatomical variability, accompanying a novel anomaly scoring scheme based on the mapping from image space to a latent space. Applied to new data, the model labels anomalies, and scores image patches indicating their fit into the learned distribution. Results on optical coherence tomography images of the retina demonstrate that the approach correctly identifies anomalous images, such as images containing retinal fluid or hyperreflective foci.}, bibtype = {inproceedings}, author = {Schlegl, Thomas and Seeböck, Philipp and Waldstein, Sebastian M and Schmidt-Erfurth, Ursula and Langs, Georg}, editor = {Niethammer, Marc and Styner, Martin and Aylward, Stephen and Zhu, Hongtu and Oguz, Ipek and Yap, Pew-Thian and Shen, Dinggang}, doi = {10.1007/978-3-319-59050-9_12}, booktitle = {Information Processing in Medical Imaging} }
@article{ title = {The shape variational autoencoder: A deep generative model of part-segmented 3D objects}, type = {article}, year = {2017}, keywords = {Categories and Subject Descriptors (according to A,I.3.5 Computer Graphics: Computational Geometry a}, pages = {1-12}, volume = {36}, websites = {https://onlinelibrary.wiley.com/doi/abs/10.1111/cgf.13240}, id = {c33defb7-4c9f-37e3-96eb-d2995a752e87}, created = {2022-03-28T09:45:03.982Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:06:34.675Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {nashShapeVariationalAutoencoder2017}, source_type = {article}, short_title = {The shape variational autoencoder}, notes = {\_eprint: https://onlinelibrary.wiley.com/doi/pdf/10.1111/cgf.13240}, private_publication = {false}, abstract = {We introduce a generative model of part-segmented 3D objects: the shape variational auto-encoder (ShapeVAE). The ShapeVAE describes a joint distribution over the existence of object parts, the locations of a dense set of surface points, and over surface normals associated with these points. Our model makes use of a deep encoder-decoder architecture that leverages the part-decomposability of 3D objects to embed high-dimensional shape representations and sample novel instances. Given an input collection of part-segmented objects with dense point correspondences the ShapeVAE is capable of synthesizing novel, realistic shapes, and by performing conditional inference enables imputation of missing parts or surface normals. In addition, by generating both points and surface normals, our model allows for the use of powerful surface-reconstruction methods for mesh synthesis. We provide a quantitative evaluation of the ShapeVAE on shape-completion and test-set log-likelihood tasks and demonstrate that the model performs favourably against strong baselines. We demonstrate qualitatively that the ShapeVAE produces plausible shape samples, and that it captures a semantically meaningful shape-embedding. In addition we show that the ShapeVAE facilitates mesh reconstruction by sampling consistent surface normals.}, bibtype = {article}, author = {Nash, C and Williams, C K I}, doi = {10.1111/cgf.13240}, journal = {Computer Graphics Forum}, number = {5} }
@inproceedings{ title = {Least Squares Generative Adversarial Networks}, type = {inproceedings}, year = {2017}, pages = {2794-2802}, websites = {https://openaccess.thecvf.com/content_iccv_2017/html/Mao_Least_Squares_Generative_ICCV_2017_paper.html}, id = {490431ce-8778-3862-9765-b9f3a43b0d8f}, created = {2022-03-28T09:45:03.989Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:06:15.365Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {maoLeastSquaresGenerative2017}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Mao, Xudong and Li, Qing and Xie, Haoran and Lau, Raymond Y K and Wang, Zhen and Paul Smolley, Stephen} }
@inproceedings{ title = {CVAE-GAN: Fine-Grained Image Generation Through Asymmetric Training}, type = {inproceedings}, year = {2017}, pages = {2745-2754}, websites = {https://openaccess.thecvf.com/content_iccv_2017/html/Bao_CVAE-GAN_Fine-Grained_Image_ICCV_2017_paper.html}, id = {5d9fe24a-7ef8-3573-90ea-ed62834c6177}, created = {2022-03-28T09:45:04.095Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:06:56.113Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {baoCVAEGANFineGrainedImage2017}, source_type = {inproceedings}, short_title = {CVAE-GAN}, private_publication = {false}, bibtype = {inproceedings}, author = {Bao, Jianmin and Chen, Dong and Wen, Fang and Li, Houqiang and Hua, Gang} }
@article{ title = {The heat method for distance computation}, type = {article}, year = {2017}, pages = {90-99}, volume = {60}, websites = {https://dl.acm.org/doi/10.1145/3131280}, month = {10}, id = {77d00f52-ed76-369c-a45c-5177eea06990}, created = {2022-03-28T09:45:04.223Z}, accessed = {2022-03-17}, file_attached = {false}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:04.223Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {craneHeatMethodDistance2017}, source_type = {article}, private_publication = {false}, abstract = {We introduce the heat method for solving the single- or multiple-source shortest path problem on both flat and curved domains. A key insight is that distance computation can be split into two stages: first find the direction along which distance is increasing, then compute the distance itself. The heat method is robust, efficient, and simple to implement since it is based on solving a pair of standard sparse linear systems. These systems can be factored once and subsequently solved in near-linear time, substantially reducing amortized cost. Real-world performance is an order of magnitude faster than state-of-the-art methods, while maintaining a comparable level of accuracy. The method can be applied in any dimension, and on any domain that admits a gradient and inner product---including regular grids, triangle meshes, and point clouds. Numerical evidence indicates that the method converges to the exact distance in the limit of refinement; we also explore smoothed approximations of distance suitable for applications where greater regularity is desired.}, bibtype = {article}, author = {Crane, Keenan and Weischedel, Clarisse and Wardetzky, Max}, doi = {10.1145/3131280}, journal = {Communications of the ACM}, number = {11} }
@book{ title = {A Mathematical Introduction to Robotic Manipulation}, type = {book}, year = {2017}, month = {9}, publisher = {CRC Press}, city = {Boca Raton}, id = {ecb881bb-6361-3e82-b881-cb85456b501a}, created = {2022-03-28T09:45:04.688Z}, file_attached = {false}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:04.688Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {murrayMathematicalIntroductionRobotic2017}, source_type = {book}, private_publication = {false}, abstract = {A Mathematical Introduction to Robotic Manipulation presents a mathematical formulation of the kinematics, dynamics, and control of robot manipulators. It uses an elegant set of mathematical tools that emphasizes the geometry of robot motion and allows a large class of robotic manipulation problems to be analyzed within a unified framework. The foundation of the book is a derivation of robot kinematics using the product of the exponentials formula. The authors explore the kinematics of open-chain manipulators and multifingered robot hands, present an analysis of the dynamics and control of robot systems, discuss the specification and control of internal forces and internal motions, and address the implications of the nonholonomic nature of rolling contact are addressed, as well. The wealth of information, numerous examples, and exercises make A Mathematical Introduction to Robotic Manipulation valuable as both a reference for robotics researchers and a text for students in advanced robotics courses.}, bibtype = {book}, author = {Murray, Richard M and Li, Zexiang and Sastry, S Shankar}, doi = {10.1201/9781315136370} }
@inproceedings{ title = {Multigrid Neural Architectures}, type = {inproceedings}, year = {2017}, pages = {6665-6673}, websites = {https://openaccess.thecvf.com/content_cvpr_2017/html/Ke_Multigrid_Neural_Architectures_CVPR_2017_paper.html}, id = {ebb49055-a878-3cbc-8452-a8e22311f5aa}, created = {2022-03-28T09:45:05.005Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:08:17.635Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {keMultigridNeuralArchitectures2017a}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Ke, Tsung-Wei and Maire, Michael and Yu, Stella X} }
@inproceedings{ title = {CLEVR: A Diagnostic Dataset for Compositional Language and Elementary Visual Reasoning}, type = {inproceedings}, year = {2017}, pages = {2901-2910}, websites = {https://openaccess.thecvf.com/content_cvpr_2017/html/Johnson_CLEVR_A_Diagnostic_CVPR_2017_paper.html}, id = {6c26e9a8-1e61-32dd-9905-82d2aaca8e65}, created = {2022-03-28T09:45:05.092Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:08:30.283Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {johnsonCLEVRDiagnosticDataset2017}, source_type = {inproceedings}, short_title = {CLEVR}, private_publication = {false}, bibtype = {inproceedings}, author = {Johnson, Justin and Hariharan, Bharath and van der Maaten, Laurens and Fei-Fei, Li and Lawrence Zitnick, C and Girshick, Ross} }
@article{ title = {Variational Inference: A Review for Statisticians}, type = {article}, year = {2017}, keywords = {Algorithms,Computationally intensive methods,Statistical computing}, pages = {859-877}, volume = {112}, websites = {https://doi.org/10.1080/01621459.2017.1285773}, month = {4}, id = {f08c8d33-cad3-37d5-8f05-3b79261c883e}, created = {2022-03-28T09:45:05.234Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:21:44.743Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {bleiVariationalInferenceReview2017}, source_type = {article}, short_title = {Variational Inference}, notes = {Publisher: Taylor \& Francis<br/>\_eprint: https://doi.org/10.1080/01621459.2017.1285773}, private_publication = {false}, abstract = {One of the core problems of modern statistics is to approximate difficult-to-compute probability densities. This problem is especially important in Bayesian statistics, which frames all inference about unknown quantities as a calculation involving the posterior density. In this article, we review variational inference (VI), a method from machine learning that approximates probability densities through optimization. VI has been used in many applications and tends to be faster than classical methods, such as Markov chain Monte Carlo sampling. The idea behind VI is to first posit a family of densities and then to find a member of that family which is close to the target density. Closeness is measured by Kullback–Leibler divergence. We review the ideas behind mean-field variational inference, discuss the special case of VI applied to exponential family models, present a full example with a Bayesian mixture of Gaussians, and derive a variant that uses stochastic optimization to scale up to massive data. We discuss modern research in VI and highlight important open problems. VI is powerful, but it is not yet well understood. Our hope in writing this article is to catalyze statistical research on this class of algorithms. Supplementary materials for this article are available online.}, bibtype = {article}, author = {Blei, David M and Kucukelbir, Alp and McAuliffe, Jon D}, doi = {10.1080/01621459.2017.1285773}, journal = {Journal of the American Statistical Association}, number = {518} }
@inproceedings{ title = {Attention is All you Need}, type = {inproceedings}, year = {2017}, volume = {30}, websites = {https://proceedings.neurips.cc/paper/2017/hash/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html}, publisher = {Curran Associates, Inc.}, id = {30565572-3a86-3c7c-9327-d152abfed72e}, created = {2022-03-28T09:45:05.577Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:22:11.727Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {vaswaniAttentionAllYou2017}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, Łukasz and Polosukhin, Illia}, booktitle = {Advances in Neural Information Processing Systems} }
@inproceedings{ title = {Wasserstein Generative Adversarial Networks}, type = {inproceedings}, year = {2017}, pages = {214-223}, websites = {https://proceedings.mlr.press/v70/arjovsky17a.html}, month = {7}, publisher = {PMLR}, id = {cfb3971a-a108-3b14-bb1c-edd300d4ff29}, created = {2022-03-28T09:45:05.919Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-09T14:10:26.517Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {arjovskyWassersteinGenerativeAdversarial2017}, source_type = {inproceedings}, notes = {ISSN: 2640-3498}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,6653d553-53a7-432f-aedf-c70d99c1c5a5}, private_publication = {false}, abstract = {We introduce a new algorithm named WGAN, an alternative to traditional GAN training. In this new model, we show that we can improve the stability of learning, get rid of problems like mode collapse, and provide meaningful learning curves useful for debugging and hyperparameter searches. Furthermore, we show that the corresponding optimization problem is sound, and provide extensive theoretical work highlighting the deep connections to different distances between distributions.}, bibtype = {inproceedings}, author = {Arjovsky, Martin and Chintala, Soumith and Bottou, Léon}, booktitle = {Proceedings of the 34th International Conference on Machine Learning} }
@article{ title = {Variational Deep Embedding: An Unsupervised and Generative Approach to Clustering}, type = {article}, year = {2017}, keywords = {Computer Science - Computer Vision and Pattern Rec}, websites = {http://arxiv.org/abs/1611.05148}, month = {6}, id = {8f2d3ddd-27e0-34bd-bd05-a43228872ca6}, created = {2022-03-28T09:45:06.219Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-01T09:16:18.141Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {jiangVariationalDeepEmbedding2017}, source_type = {article}, short_title = {Variational Deep Embedding}, notes = {arXiv: 1611.05148}, private_publication = {false}, abstract = {Clustering is among the most fundamental tasks in computer vision and machine learning. In this paper, we propose Variational Deep Embedding (VaDE), a novel unsupervised generative clustering approach within the framework of Variational Auto-Encoder (VAE). Specifically, VaDE models the data generative procedure with a Gaussian Mixture Model (GMM) and a deep neural network (DNN): 1) the GMM picks a cluster; 2) from which a latent embedding is generated; 3) then the DNN decodes the latent embedding into observables. Inference in VaDE is done in a variational way: a different DNN is used to encode observables to latent embeddings, so that the evidence lower bound (ELBO) can be optimized using Stochastic Gradient Variational Bayes (SGVB) estimator and the reparameterization trick. Quantitative comparisons with strong baselines are included in this paper, and experimental results show that VaDE significantly outperforms the state-of-the-art clustering methods on 4 benchmarks from various modalities. Moreover, by VaDE's generative nature, we show its capability of generating highly realistic samples for any specified cluster, without using supervised information during training. Lastly, VaDE is a flexible and extensible framework for unsupervised generative clustering, more general mixture models than GMM can be easily plugged in.}, bibtype = {article}, author = {Jiang, Zhuxi and Zheng, Yin and Tan, Huachun and Tang, Bangsheng and Zhou, Hanning}, journal = {arXiv:1611.05148 [cs]} }
@inproceedings{ title = {Variational methods for conditional multimodal deep learning}, type = {inproceedings}, year = {2017}, keywords = {Computational modeling,Correlation,Encoding,Gallium nitride,Mathematical model,Neural networks,Training}, pages = {308-315}, month = {5}, id = {c4664424-3d17-34d7-b198-251b9113a1cd}, created = {2022-03-28T09:45:06.342Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-01T09:16:33.704Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {pandeyVariationalMethodsConditional2017}, source_type = {inproceedings}, notes = {ISSN: 2161-4407}, private_publication = {false}, abstract = {In this paper, we address the problem of conditional modality learning, whereby one is interested in generating one modality given the other. While it is straightforward to learn a joint distribution over multiple modalities using a deep multi-modal architecture, we observe that such models are not very effective at conditional generation. Hence, we address the problem by learning conditional distributions between the modalities. We use variational methods for maximizing the corresponding conditional log-likelihood. The resultant deep model, which we refer to as conditional multimodal autoencoder (CMMA), forces the latent representation obtained from a single modality alone to be `close' to the joint representation obtained from multiple modalities. We use the proposed model to generate faces from attributes. We show that the faces generated from attributes using the proposed model are qualitatively and quantitatively more representative of the attributes from which they were generated, than those obtained by other deep generative models. We also propose a secondary task, whereby the existing faces are modified by modifying the corresponding attributes. We observe that the modifications in face introduced by the proposed model are representative of the corresponding modifications in attributes.}, bibtype = {inproceedings}, author = {Pandey, Gaurav and Dukkipati, Ambedkar}, doi = {10.1109/IJCNN.2017.7965870}, booktitle = {2017 International Joint Conference on Neural Networks (IJCNN)} }
@article{ title = {Embodied Hands: Modeling and Capturing Hands and Bodies Together}, type = {article}, year = {2017}, keywords = {Computer Science - Computer Vision and Pattern Rec,Computer Science - Graphics}, pages = {1-17}, volume = {36}, websites = {http://arxiv.org/abs/2201.02610}, month = {11}, id = {93cba662-8d32-336e-92c7-b709e642dc07}, created = {2022-03-28T09:45:06.829Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:00:02.301Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {romeroEmbodiedHandsModeling2017}, source_type = {article}, short_title = {Embodied Hands}, notes = {arXiv: 2201.02610}, private_publication = {false}, abstract = {Humans move their hands and bodies together to communicate and solve tasks. Capturing and replicating such coordinated activity is critical for virtual characters that behave realistically. Surprisingly, most methods treat the 3D modeling and tracking of bodies and hands separately. Here we formulate a model of hands and bodies interacting together and fit it to full-body 4D sequences. When scanning or capturing the full body in 3D, hands are small and often partially occluded, making their shape and pose hard to recover. To cope with low-resolution, occlusion, and noise, we develop a new model called MANO (hand Model with Articulated and Non-rigid defOrmations). MANO is learned from around 1000 high-resolution 3D scans of hands of 31 subjects in a wide variety of hand poses. The model is realistic, low-dimensional, captures non-rigid shape changes with pose, is compatible with standard graphics packages, and can fit any human hand. MANO provides a compact mapping from hand poses to pose blend shape corrections and a linear manifold of pose synergies. We attach MANO to a standard parameterized 3D body shape model (SMPL), resulting in a fully articulated body and hand model (SMPL+H). We illustrate SMPL+H by fitting complex, natural, activities of subjects captured with a 4D scanner. The fitting is fully automatic and results in full body models that move naturally with detailed hand motions and a realism not seen before in full body performance capture. The models and data are freely available for research purposes in our website (http://mano.is.tue.mpg.de).}, bibtype = {article}, author = {Romero, Javier and Tzionas, Dimitrios and Black, Michael J}, doi = {10.1145/3130800.3130883}, journal = {ACM Transactions on Graphics}, number = {6} }
@article{ title = {Feature axes orthogonalization in semantic face editing}, type = {article}, year = {2017}, pages = {163-169}, id = {574947b5-b7fc-3c0c-a9b3-e5bce5e991b6}, created = {2022-04-06T05:19:55.970Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-07T06:10:54.762Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {cd02f564-0123-4236-a320-b339927f085a}, private_publication = {false}, bibtype = {article}, author = {Antal, László and Bodó, Zalán} }
@article{ title = {On Convergence and Stability of GANs}, type = {article}, year = {2017}, websites = {http://arxiv.org/abs/1705.07215}, id = {bb5e83de-be82-306e-9cd1-d6c013b4ce1d}, created = {2022-09-08T17:25:32.210Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-13T13:57:40.656Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kodali2017}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,6653d553-53a7-432f-aedf-c70d99c1c5a5}, private_publication = {false}, abstract = {We propose studying GAN training dynamics as regret minimization, which is in contrast to the popular view that there is consistent minimization of a divergence between real and generated distributions. We analyze the convergence of GAN training from this new point of view to understand why mode collapse happens. We hypothesize the existence of undesirable local equilibria in this non-convex game to be responsible for mode collapse. We observe that these local equilibria often exhibit sharp gradients of the discriminator function around some real data points. We demonstrate that these degenerate local equilibria can be avoided with a gradient penalty scheme called DRAGAN. We show that DRAGAN enables faster training, achieves improved stability with fewer mode collapses, and leads to generator networks with better modeling performance across a variety of architectures and objective functions.}, bibtype = {article}, author = {Kodali, Naveen and Abernethy, Jacob and Hays, James and Kira, Zsolt} }
@article{ title = {Stabilizing training of generative adversarial networks through regularization}, type = {article}, year = {2017}, pages = {2019-2029}, volume = {2017-Decem}, id = {1f0cb62c-a8d2-305c-bb42-b5a9ff4cf20c}, created = {2022-09-08T17:25:32.328Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-13T13:57:40.893Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Roth2017}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,6653d553-53a7-432f-aedf-c70d99c1c5a5}, private_publication = {false}, abstract = {Deep generative models based on Generative Adversarial Networks (GANs) have demonstrated impressive sample quality but in order to work they require a careful choice of architecture, parameter initialization, and selection of hyper-parameters. This fragility is in part due to a dimensional mismatch or non-overlapping support between the model distribution and the data distribution, causing their density ratio and the associated f-divergence to be undefined. We overcome this fundamental limitation and propose a new regularization approach with low computational cost that yields a stable GAN training procedure. We demonstrate the effectiveness of this regularizer across several architectures trained on common benchmark image generation tasks. Our regularization turns GAN models into reliable building blocks for deep learning.}, bibtype = {article}, author = {Roth, Kevin and Lucchi, Aurelien and Nowozin, Sebastian and Hofmann, Thomas}, journal = {Advances in Neural Information Processing Systems}, number = {2} }
@article{ title = {Improved Adversarial Systems for 3D Object Generation and Reconstruction}, type = {article}, year = {2017}, keywords = {model learning,multimodal perception,robotic vision}, pages = {1-10}, websites = {http://arxiv.org/abs/1707.09557}, id = {a0fbe967-b3bb-35ff-bd8c-50467123d2a8}, created = {2022-09-08T17:25:32.348Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-13T13:57:40.663Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Smith2017}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,6653d553-53a7-432f-aedf-c70d99c1c5a5}, private_publication = {false}, abstract = {This paper describes a new approach for training generative adversarial networks (GAN) to understand the detailed 3D shape of objects. While GANs have been used in this domain previously, they are notoriously hard to train, especially for the complex joint data distribution over 3D objects of many categories and orientations. Our method extends previous work by employing the Wasserstein distance normalized with gradient penalization as a training objective. This enables improved generation from the joint object shape distribution. Our system can also reconstruct 3D shape from 2D images and perform shape completion from occluded 2.5D range scans. We achieve notable quantitative improvements in comparison to existing baselines}, bibtype = {article}, author = {Smith, Edward and Meger, David}, number = {CoRL} }
@article{ title = {Shape Inpainting Using 3D Generative Adversarial Network and Recurrent Convolutional Networks}, type = {article}, year = {2017}, pages = {2317-2325}, volume = {2017-Octob}, id = {57266201-56c1-3460-8e35-8aca71c7d004}, created = {2022-09-08T17:25:32.470Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-13T13:57:40.814Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Wang2017}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,6653d553-53a7-432f-aedf-c70d99c1c5a5}, private_publication = {false}, abstract = {Recent advances in convolutional neural networks have shown promising results in 3D shape completion. But due to GPU memory limitations, these methods can only produce low-resolution outputs. To inpaint 3D models with semantic plausibility and contextual details, we introduce a hybrid framework that combines a 3D Encoder-Decoder Generative Adversarial Network (3D-ED-GAN) and a Long-term Recurrent Convolutional Network (LRCN). The 3D-ED-GAN is a 3D convolutional neural network trained with a generative adversarial paradigm to fill missing 3D data in low-resolution. LRCN adopts a recurrent neural network architecture to minimize GPU memory usage and incorporates an Encoder-Decoder pair into a Long Short-term Memory Network. By handling the 3D model as a sequence of 2D slices, LRCN transforms a coarse 3D shape into a more complete and higher resolution volume. While 3D-ED-GAN captures global contextual structure of the 3D shape, LRCN localizes the fine-grained details. Experimental results on both real-world and synthetic data show reconstructions from corrupted models result in complete and high-resolution 3D objects.}, bibtype = {article}, author = {Wang, Weiyue and Huang, Qiangui and You, Suya and Yang, Chao and Neumann, Ulrich}, doi = {10.1109/ICCV.2017.252}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {Visual SLAM algorithms: A survey from 2010 to 2016}, type = {article}, year = {2017}, keywords = {Augmented reality,Computer vision,Robotics,Survey,Visual SLAM}, volume = {9}, publisher = {IPSJ Transactions on Computer Vision and Applications}, id = {a07b889a-1bbc-3c04-9b96-046bf391a109}, created = {2022-09-13T08:14:28.113Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-13T08:14:34.199Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {SLAM is an abbreviation for simultaneous localization and mapping, which is a technique for estimating sensor motion and reconstructing structure in an unknown environment. Especially, Simultaneous Localization and Mapping (SLAM) using cameras is referred to as visual SLAM (vSLAM) because it is based on visual information only. vSLAM can be used as a fundamental technology for various types of applications and has been discussed in the field of computer vision, augmented reality, and robotics in the literature. This paper aims to categorize and summarize recent vSLAM algorithms proposed in different research communities from both technical and historical points of views. Especially, we focus on vSLAM algorithms proposed mainly from 2010 to 2016 because major advance occurred in that period. The technical categories are summarized as follows: feature-based, direct, and RGB-D camera-based approaches.}, bibtype = {article}, author = {Taketomi, Takafumi and Uchiyama, Hideaki and Ikeda, Sei}, doi = {10.1186/s41074-017-0027-2}, journal = {IPSJ Transactions on Computer Vision and Applications} }
@article{ title = {Shape generation using spatially partitioned point clouds}, type = {article}, year = {2017}, pages = {1-12}, id = {d3927a18-68ab-3d91-a991-2d296b487802}, created = {2022-09-21T09:29:20.054Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-21T09:29:42.236Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,6653d553-53a7-432f-aedf-c70d99c1c5a5}, private_publication = {false}, abstract = {We propose a method to generate 3D shapes using point clouds. Given a point-cloud representation of a 3D shape, our method builds a kd-tree to spatially partition the points. This orders them consistently across all shapes, resulting in reasonably good correspondences across all shapes. We then use PCA analysis to derive a linear shape basis across the spatially partitioned points, and optimize the point ordering by iteratively minimizing the PCA reconstruction error. Even with the spatial sorting, the point clouds are inherently noisy and the resulting distribution over the shape coefficients can be highly multi-modal. We propose to use the expressive power of neural networks to learn a distribution over the shape coefficients in a generative-adversarial framework. Compared to 3D shape generative models trained on voxel-representations, our point-based method is considerably more light-weight and scalable, with little loss of quality. It also outperforms simpler linear factor models such as Probabilistic PCA, both qualitatively and quantitatively, on a number of categories from the ShapeNet dataset. Furthermore, our method can easily incorporate other point attributes such as normal and color information, an additional advantage over voxel-based representations.}, bibtype = {article}, author = {Gadelha, Matheus and Maji, Subhransu and Wang, Rui}, doi = {10.5244/c.31.54}, journal = {British Machine Vision Conference 2017, BMVC 2017} }
@article{ title = {A novel scan registration method based on the feature-less global descriptor - Spherical entropy image}, type = {article}, year = {2017}, keywords = {Scan registration,Shape descriptor,Spherical entropy image}, pages = {552-563}, volume = {44}, id = {d110b7f5-121a-38d1-b886-d319db311bfd}, created = {2023-05-03T13:16:39.077Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-15T08:10:14.420Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Sun2017}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Purpose: This paper aims to present the spherical entropy image (SEI), a novel global descriptor for the scan registration of three-dimensional (3D) point clouds. This paper also introduces a global feature-less scan registration strategy based on SEI. It is advantageous for 3D data processing in the scenarios such as mobile robotics and reverse engineering. Design/methodology/approach: The descriptor works through representing the scan by a spherical function named SEI, whose properties allow to decompose the six-dimensional transformation into 3D rotation and 3D translation. The 3D rotation is estimated by the generalized convolution theorem based on the spherical Fourier transform of SEI. Then, the translation recovery is determined by phase only matched filtering. Findings: No explicit features and planar segments should be contained in the input data of the method. The experimental results illustrate the parameter independence, high reliability and efficiency of the novel algorithm in registration of feature-less scans. Originality/value: A novel global descriptor (SEI) for the scan registration of 3D point clouds is presented. It inherits both descriptive power of signature-based methods and robustness of histogram-based methods. A high reliability and efficiency registration method of scans based on SEI is also demonstrated.}, bibtype = {article}, author = {Sun, Bo and Zeng, Yadan and Dai, Houde and Xiao, Junhao and Zhang, Jianwei}, doi = {10.1108/IR-11-2016-0329}, journal = {Industrial Robot}, number = {4} }
@article{ title = {An iterative closest points algorithm for registration of 3D laser scanner point clouds with geometric features}, type = {article}, year = {2017}, keywords = {Geometric features,ICP registration,Point clouds}, volume = {17}, id = {41e94725-c588-33c2-8c5d-cf977e2ee2cc}, created = {2023-05-03T13:16:39.265Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:27.038Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {He2017}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {The Iterative Closest Points (ICP) algorithm is the mainstream algorithm used in the process of accurate registration of 3D point cloud data. The algorithm requires a proper initial value and the approximate registration of two point clouds to prevent the algorithm from falling into local extremes, but in the actual point cloud matching process, it is difficult to ensure compliance with this requirement. In this paper, we proposed the ICP algorithm based on point cloud features (GF-ICP). This method uses the geometrical features of the point cloud to be registered, such as curvature, surface normal and point cloud density, to search for the correspondence relationships between two point clouds and introduces the geometric features into the error function to realize the accurate registration of two point clouds. The experimental results showed that the algorithm can improve the convergence speed and the interval of convergence without setting a proper initial value.}, bibtype = {article}, author = {He, Ying and Liang, Bin and Yang, Jun and Li, Shunzhi and He, Jin}, doi = {10.3390/s17081862}, journal = {Sensors (Switzerland)}, number = {8} }
@article{ title = {Deep Cuboid Detection: Beyond 2D Bounding Boxes}, type = {article}, year = {2016}, websites = {http://arxiv.org/abs/1611.10010}, id = {84d98484-ec63-3494-aae3-1ca7c2ab08b3}, created = {2020-09-14T08:14:53.498Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-10-27T07:13:09.698Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {16688d52-1757-4ef4-badb-f53b700252a9,591145a4-49d3-4baf-a2cc-a1f3832f0e3e}, private_publication = {false}, abstract = {We present a Deep Cuboid Detector which takes a consumer-quality RGB image of a cluttered scene and localizes all 3D cuboids (box-like objects). Contrary to classical approaches which fit a 3D model from low-level cues like corners, edges, and vanishing points, we propose an end-to-end deep learning system to detect cuboids across many semantic categories (e.g., ovens, shipping boxes, and furniture). We localize cuboids with a 2D bounding box, and simultaneously localize the cuboid's corners, effectively producing a 3D interpretation of box-like objects. We refine keypoints by pooling convolutional features iteratively, improving the baseline method significantly. Our deep learning cuboid detector is trained in an end-to-end fashion and is suitable for real-time applications in augmented reality (AR) and robotics.}, bibtype = {article}, author = {Dwibedi, Debidatta and Malisiewicz, Tomasz and Badrinarayanan, Vijay and Rabinovich, Andrew} }
@article{ title = {Automatic scene parsing for generic object descriptions using shape primitives}, type = {article}, year = {2016}, keywords = {3D models,Sample consensus,Shape primitives,Task specification}, pages = {93-112}, volume = {76}, websites = {http://dx.doi.org/10.1016/j.robot.2015.11.003}, publisher = {Elsevier B.V.}, id = {70c2929f-c375-3ab9-85c1-1912d7f7e38d}, created = {2020-09-14T08:14:53.647Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-10-27T07:13:10.059Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {16688d52-1757-4ef4-badb-f53b700252a9,591145a4-49d3-4baf-a2cc-a1f3832f0e3e}, private_publication = {false}, abstract = {Autonomous robots need to generate complete 3D models from a limited range of view when trying to manipulate objects for which no model is known a priori. This can be achieved by detecting symmetrical parts of an object, thus, creating an estimate of the invisible back sides. These symmetrical parts are typically modeled as primitive shapes (cylinders, spheres, cones, etc.), and fitted to noisy sensor data using sample consensus methods. This has the advantage that feasible grasps can be chosen from a precomputed set based on the estimated model, instead of a time-consuming random sampling approach. This article will look at fitting such analytic models to noisy 3D data in the context of robotic manipulation. State of the art methods from the Point Cloud Library (PCL) were extended to include additional relevant shapes (e.g. boxes), constraints (e.g. on size and orientation), and to consider additional information like knowledge about free space or proprioceptive information. A core part of the approach is the development of a scene parsing language, that allows for an easy-to-use pipeline specification during autonomous operation as well as shared-autonomy scenarios. Experiments will be presented based on scenes captured using an Xtion sensor.}, bibtype = {article}, author = {Büttner, Stefan and Márton, Zoltán Csaba and Hertkorn, Katharina}, doi = {10.1016/j.robot.2015.11.003}, journal = {Robotics and Autonomous Systems} }
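The sample-consensus primitive fitting mentioned above is done with PCL in the paper; as a self-contained illustration of the underlying idea, here is a bare-bones RANSAC sphere fit in NumPy (four sampled points define a candidate sphere, inliers are counted by distance to its surface). Function names and thresholds are ours, not the paper's.

import numpy as np

def sphere_from_points(p):
    # Rewrite |x - c|^2 = r^2 as a linear system in (c, r^2 - |c|^2).
    A = np.hstack([2 * p, np.ones((4, 1))])
    b = (p ** 2).sum(axis=1)
    sol, *_ = np.linalg.lstsq(A, b, rcond=None)
    center, d = sol[:3], sol[3]
    return center, np.sqrt(max(d + center @ center, 0.0))

def ransac_sphere(points, n_iter=200, tol=0.01):
    rng = np.random.default_rng(0)
    best, best_count = None, 0
    for _ in range(n_iter):
        sample = points[rng.choice(len(points), 4, replace=False)]
        c, r = sphere_from_points(sample)
        inliers = np.abs(np.linalg.norm(points - c, axis=1) - r) < tol
        if inliers.sum() > best_count:
            best, best_count = (c, r), int(inliers.sum())
    return best, best_count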
@article{ title = {Learning to remove multipath distortions in Time-of-Flight range images for a robotic arm setup}, type = {article}, year = {2016}, pages = {3390-3397}, volume = {2016-June}, id = {366ab6bd-861d-3a0d-babb-f769951bc074}, created = {2020-11-05T08:16:34.349Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-10T07:17:54.114Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47,019ca26f-d15a-40ee-aa8d-7a0fbf949316}, private_publication = {false}, abstract = {Range images captured by Time-of-Flight (ToF) cameras are corrupted with multipath distortions due to interaction between modulated light signals and scenes. The interaction is often complicated, which makes a model-based solution elusive. We propose a learning-based approach for removing the multipath distortions for a ToF camera in a robotic arm setup. Our approach is based on deep learning. We use the robotic arm to automatically collect a large amount of ToF range images containing various multipath distortions. The training images are automatically labeled by leveraging a high precision structured light sensor available only in the training time. In the test time, we apply the learned model to remove the multipath distortions. This allows our robotic arm setup to enjoy the speed and compact form of the ToF camera without compromising with its range measurement errors. We conduct extensive experimental validations and compare the proposed method to several baseline algorithms. The experiment results show that our method achieves 55% error reduction in range estimation and largely outperforms the baseline algorithms.}, bibtype = {article}, author = {Son, Kilho and Liu, Ming Yu and Taguchi, Yuichi}, doi = {10.1109/ICRA.2016.7487515}, journal = {Proceedings - IEEE International Conference on Robotics and Automation} }
@article{ title = {Convolutional oriented boundaries}, type = {article}, year = {2016}, keywords = {Contour detection,Contour orientation estimation,Hierarchical image segmentation,Object proposals}, pages = {580-596}, volume = {9905 LNCS}, id = {6a111334-8b9d-32cb-8ea1-04b80b1eaa14}, created = {2020-11-13T11:34:36.740Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T07:51:59.145Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {We present Convolutional Oriented Boundaries (COB), which produces multiscale oriented contours and region hierarchies starting from generic image classification Convolutional Neural Networks (CNNs). COB is computationally efficient, because it requires a single CNN forward pass for contour detection and it uses a novel sparse boundary representation for hierarchical segmentation; it gives a significant leap in performance over the state-of-the-art, and it generalizes very well to unseen categories and datasets. Particularly, we show that learning to estimate not only contour strength but also orientation provides more accurate results. We perform extensive experiments on BSDS, PASCAL Context, PASCAL Segmentation, and MS-COCO, showing that COB provides state-of-the-art contours, region hierarchies, and object proposals in all datasets.}, bibtype = {article}, author = {Maninis, Kevis Kokitsi and Pont-Tuset, Jordi and Arbeláez, Pablo and Van Gool, Luc}, doi = {10.1007/978-3-319-46448-0_35}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {Deep learning for robust normal estimation in unstructured point clouds}, type = {article}, year = {2016}, pages = {281-290}, volume = {35}, id = {2573766f-f90d-3007-8ddc-ce84cc0d7ee6}, created = {2020-11-13T11:34:36.859Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-13T11:35:28.289Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {Normal estimation in point clouds is a crucial first step for numerous algorithms, from surface reconstruction and scene understanding to rendering. A recurrent issue when estimating normals is to make appropriate decisions close to sharp features, not to smooth edges, or when the sampling density is not uniform, to prevent bias. Rather than resorting to manually-designed geometric priors, we propose to learn how to make these decisions, using ground-truth data made from synthetic scenes. For this, we project a discretized Hough space representing normal directions onto a structure amenable to deep learning. The resulting normal estimation method outperforms most of the time the state of the art regarding robustness to outliers, to noise and to point density variation, in the presence of sharp edges, while remaining fast, scaling up to millions of points.}, bibtype = {article}, author = {Boulch, Alexandre and Marlet, Renaud}, doi = {10.1111/cgf.12983}, journal = {Eurographics Symposium on Geometry Processing}, number = {5} }
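For context, the classical baseline that learning-based normal estimators like the one above are usually compared against is local PCA over k nearest neighbours; a minimal NumPy sketch follows (this is the baseline, not the paper's Hough-transform-based CNN; normal orientation/sign is left unresolved).

import numpy as np
from scipy.spatial import cKDTree

def pca_normals(points, k=16):
    _, idx = cKDTree(points).query(points, k=k)
    normals = np.empty_like(points)
    for i, nbrs in enumerate(idx):
        centered = points[nbrs] - points[nbrs].mean(axis=0)
        # The normal is the singular vector of the smallest singular value.
        _, _, vt = np.linalg.svd(centered, full_matrices=False)
        normals[i] = vt[-1]
    return normals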
@article{ title = {Deeper depth prediction with fully convolutional residual networks}, type = {article}, year = {2016}, keywords = {CNN,Depth prediction}, pages = {239-248}, id = {728305d5-6e7d-3893-a111-d47ab223136a}, created = {2020-11-24T10:01:25.868Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-25T07:49:20.038Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {This paper addresses the problem of estimating the depth map of a scene given a single RGB image. We propose a fully convolutional architecture, encompassing residual learning, to model the ambiguous mapping between monocular images and depth maps. In order to improve the output resolution, we present a novel way to efficiently learn feature map up-sampling within the network. For optimization, we introduce the reverse Huber loss that is particularly suited for the task at hand and driven by the value distributions commonly present in depth maps. Our model is composed of a single architecture that is trained end-to-end and does not rely on post-processing techniques, such as CRFs or other additional refinement steps. As a result, it runs in real-time on images or videos. In the evaluation, we show that the proposed model contains fewer parameters and requires fewer training data than the current state of the art, while outperforming all approaches on depth estimation. Code and models are publicly available.}, bibtype = {article}, author = {Laina, Iro and Rupprecht, Christian and Belagiannis, Vasileios and Tombari, Federico and Navab, Nassir}, doi = {10.1109/3DV.2016.32}, journal = {Proceedings - 2016 4th International Conference on 3D Vision, 3DV 2016} }
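The reverse Huber (berHu) loss mentioned in the abstract is compact enough to write out. The sketch below follows the commonly cited formulation with the threshold tied to the largest current residual (c = 0.2 * max|r|); this is our reading of the formulation, not the authors' exact code.

import numpy as np

def berhu_loss(pred, target):
    r = np.abs(pred - target)
    c = 0.2 * r.max() + 1e-12
    # L1 for small residuals, scaled L2 beyond the threshold c.
    return np.where(r <= c, r, (r ** 2 + c ** 2) / (2.0 * c)).mean()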
@article{ title = {Volumetric and multi-view CNNs for object classification on 3D data}, type = {article}, year = {2016}, pages = {5648-5656}, volume = {2016-Decem}, id = {6531dd49-7da4-3bcc-aaca-ff8f6ac2d4ee}, created = {2021-01-25T08:45:25.224Z}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-03T10:12:46.627Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {3D shape models are becoming widely available and easier to capture, making available 3D information crucial for progress in object classification. Current state-of-the-art methods rely on CNNs to address this problem. Recently, we witness two types of CNNs being developed: CNNs based upon volumetric representations versus CNNs based upon multi-view representations. Empirical results from these two types of CNNs exhibit a large gap, indicating that existing volumetric CNN architectures and approaches are unable to fully exploit the power of 3D representations. In this paper, we aim to improve both volumetric CNNs and multi-view CNNs according to extensive analysis of existing approaches. To this end, we introduce two distinct network architectures of volumetric CNNs. In addition, we examine multi-view CNNs, where we introduce multiresolution filtering in 3D. Overall, we are able to outperform current state-of-the-art methods for both volumetric CNNs and multi-view CNNs. We provide extensive experiments designed to evaluate underlying design choices, thus providing a better understanding of the space of methods available for object classification on 3D data.}, bibtype = {article}, author = {Qi, Charles R. and Su, Hao and Nießner, Matthias and Dai, Angela and Yan, Mengyuan and Guibas, Leonidas J.}, doi = {10.1109/CVPR.2016.609}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {FusionNet: 3D Object Classification Using Multiple Data Representations}, type = {article}, year = {2016}, websites = {http://arxiv.org/abs/1607.05695}, id = {fe148041-386d-3e2f-a157-a8a2414c122d}, created = {2021-01-25T08:45:25.228Z}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-27T09:02:50.626Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {High-quality 3D object recognition is an important component of many vision and robotics systems. We tackle the object recognition problem using two data representations, to achieve leading results on the Princeton ModelNet challenge. The two representations: 1. Volumetric representation: the 3D object is discretized spatially as binary voxels - $1$ if the voxel is occupied and $0$ otherwise. 2. Pixel representation: the 3D object is represented as a set of projected 2D pixel images. Current leading submissions to the ModelNet Challenge use Convolutional Neural Networks (CNNs) on pixel representations. However, we diverge from this trend and additionally, use Volumetric CNNs to bridge the gap between the efficiency of the above two representations. We combine both representations and exploit them to learn new features, which yield a significantly better classifier than using either of the representations in isolation. To do this, we introduce new Volumetric CNN (V-CNN) architectures.}, bibtype = {article}, author = {Hegde, Vishakh and Zadeh, Reza} }
@inproceedings{ title = {3D U-net: Learning dense volumetric segmentation from sparse annotation}, type = {inproceedings}, year = {2016}, keywords = {3D,Biomedical volumetric image segmentation,Convolutional neural networks,Fully-automated,Semi-automated,Sparse annotation,Xenopus kidney}, pages = {424-432}, volume = {9901 LNCS}, websites = {http://arxiv.org/abs/1606.06650}, month = {6}, publisher = {Springer Verlag}, day = {21}, id = {de1e5f73-7259-33b0-a147-71ea87b0f50b}, created = {2021-01-29T12:20:34.183Z}, accessed = {2021-01-29}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-29T12:20:36.794Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {This paper introduces a network for volumetric segmentation that learns from sparsely annotated volumetric images. We outline two attractive use cases of this method: (1) In a semi-automated setup, the user annotates some slices in the volume to be segmented. The network learns from these sparse annotations and provides a dense 3D segmentation. (2) In a fully-automated setup, we assume that a representative, sparsely annotated training set exists. Trained on this data set, the network densely segments new volumetric images. The proposed network extends the previous u-net architecture from Ronneberger et al. by replacing all 2D operations with their 3D counterparts. The implementation performs on-the-fly elastic deformations for efficient data augmentation during training. It is trained end-to-end from scratch, i.e., no pre-trained network is required. We test the performance of the proposed method on a complex, highly variable 3D structure, the Xenopus kidney, and achieve good results for both use cases.}, bibtype = {inproceedings}, author = {Çiçek, Özgün and Abdulkadir, Ahmed and Lienkamp, Soeren S. and Brox, Thomas and Ronneberger, Olaf}, doi = {10.1007/978-3-319-46723-8_49}, booktitle = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@inproceedings{ title = {Deep compression: Compressing deep neural networks with pruning, trained quantization and Huffman coding}, type = {inproceedings}, year = {2016}, websites = {https://arxiv.org/abs/1510.00149v5}, month = {10}, publisher = {International Conference on Learning Representations, ICLR}, day = {1}, id = {230d1a3f-7e86-301c-ab9c-88873a61011d}, created = {2021-02-09T07:24:51.581Z}, accessed = {2021-02-09}, file_attached = {false}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T07:38:56.698Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, abstract = {Neural networks are both computationally intensive and memory intensive, making them difficult to deploy on embedded systems with limited hardware resources. To address this limitation, we introduce “deep compression”, a three stage pipeline: pruning, trained quantization and Huffman coding, that work together to reduce the storage requirement of neural networks by 35× to 49× without affecting their accuracy. Our method first prunes the network by learning only the important connections. Next, we quantize the weights to enforce weight sharing, finally, we apply Huffman coding. After the first two steps we retrain the network to fine tune the remaining connections and the quantized centroids. Pruning, reduces the number of connections by 9× to 13×; Quantization then reduces the number of bits that represent each connection from 32 to 5. On the ImageNet dataset, our method reduced the storage required by AlexNet by 35×, from 240MB to 6.9MB, without loss of accuracy. Our method reduced the size of VGG-16 by 49× from 552MB to 11.3MB, again with no loss of accuracy. This allows fitting the model into on-chip SRAM cache rather than off-chip DRAM memory. Our compression method also facilitates the use of complex neural networks in mobile applications where application size and download bandwidth are constrained. Benchmarked on CPU, GPU and mobile GPU, compressed network has 3× to 4× layerwise speedup and 3× to 7× better energy efficiency.}, bibtype = {inproceedings}, author = {Han, Song and Mao, Huizi and Dally, William J.}, booktitle = {4th International Conference on Learning Representations, ICLR 2016 - Conference Track Proceedings} }
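A toy NumPy sketch of the first two stages described above, magnitude pruning and k-means weight sharing, applied to a single weight matrix; Huffman coding and retraining are omitted, the sparsity and cluster count are illustrative assumptions rather than the paper's settings, and n_clusters is assumed small relative to the number of surviving weights.

import numpy as np
from scipy.cluster.vq import kmeans2

def prune_and_quantize(W, sparsity=0.9, n_clusters=32):
    # Magnitude pruning: zero out the smallest-magnitude weights.
    thresh = np.quantile(np.abs(W), sparsity)
    mask = np.abs(W) > thresh
    # Weight sharing: cluster surviving weights and snap them to centroids.
    centroids, labels = kmeans2(W[mask].reshape(-1, 1), n_clusters, minit='++')
    W_shared = np.zeros_like(W)
    W_shared[mask] = centroids[labels].ravel()
    return W_shared, mask, centroids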
@article{ title = {SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size}, type = {article}, year = {2016}, websites = {http://arxiv.org/abs/1602.07360}, month = {2}, day = {23}, id = {8b34d6cf-1f21-3a5b-9fd7-903a07f835bf}, created = {2021-02-09T07:30:01.222Z}, accessed = {2021-02-09}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T07:38:54.717Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, abstract = {Recent research on deep neural networks has focused primarily on improving accuracy. For a given accuracy level, it is typically possible to identify multiple DNN architectures that achieve that accuracy level. With equivalent accuracy, smaller DNN architectures offer at least three advantages: (1) Smaller DNNs require less communication across servers during distributed training. (2) Smaller DNNs require less bandwidth to export a new model from the cloud to an autonomous car. (3) Smaller DNNs are more feasible to deploy on FPGAs and other hardware with limited memory. To provide all of these advantages, we propose a small DNN architecture called SqueezeNet. SqueezeNet achieves AlexNet-level accuracy on ImageNet with 50x fewer parameters. Additionally, with model compression techniques we are able to compress SqueezeNet to less than 0.5MB (510x smaller than AlexNet). The SqueezeNet architecture is available for download here: https://github.com/DeepScale/SqueezeNet}, bibtype = {article}, author = {Iandola, Forrest N. and Han, Song and Moskewicz, Matthew W. and Ashraf, Khalid and Dally, William J. and Keutzer, Kurt} }
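The Fire module at the core of SqueezeNet (a 1x1 "squeeze" convolution followed by parallel 1x1 and 3x3 "expand" convolutions whose outputs are concatenated) is small enough to sketch in PyTorch; the channel sizes in the usage line are illustrative only.

import torch
import torch.nn as nn

class Fire(nn.Module):
    def __init__(self, in_ch, squeeze_ch, expand1_ch, expand3_ch):
        super().__init__()
        self.squeeze = nn.Conv2d(in_ch, squeeze_ch, kernel_size=1)
        self.expand1 = nn.Conv2d(squeeze_ch, expand1_ch, kernel_size=1)
        self.expand3 = nn.Conv2d(squeeze_ch, expand3_ch, kernel_size=3, padding=1)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        x = self.relu(self.squeeze(x))
        # Concatenate the 1x1 and 3x3 expand paths along the channel axis.
        return torch.cat([self.relu(self.expand1(x)),
                          self.relu(self.expand3(x))], dim=1)

# e.g. Fire(96, 16, 64, 64) maps 96 input channels to 64 + 64 = 128 output channels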
@article{ title = {Receding horizon next-best-view planner for 3D exploration}, type = {article}, year = {2016}, pages = {1462-1468}, volume = {2016-June}, publisher = {IEEE}, id = {fb345b9a-85d9-3485-9bdf-6c49eef451e0}, created = {2021-02-09T17:05:46.737Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:02:39.854Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Bircher2016}, folder_uuids = {d9de06c7-f364-422f-ad4b-132381b47b21}, private_publication = {false}, abstract = {This paper presents a novel path planning algorithm for the autonomous exploration of unknown space using aerial robotic platforms. The proposed planner employs a receding horizon next-best-view scheme: In an online computed random tree it finds the best branch, the quality of which is determined by the amount of unmapped space that can be explored. Only the first edge of this branch is executed at every planning step, while repetition of this procedure leads to complete exploration results. The proposed planner is capable of running online, onboard a robot with limited resources. Its high performance is evaluated in detailed simulation studies as well as in a challenging real world experiment using a rotorcraft micro aerial vehicle. Analysis on the computational complexity of the algorithm is provided and its good scaling properties enable the handling of large scale and complex problem setups.}, bibtype = {article}, author = {Bircher, Andreas and Kamel, Mina and Alexis, Kostas and Oleynikova, Helen and Siegwart, Roland}, doi = {10.1109/ICRA.2016.7487281}, journal = {Proceedings - IEEE International Conference on Robotics and Automation} }
@article{ title = {Next-Best-View method based on consecutive evaluation of topological relations}, type = {article}, year = {2016}, keywords = {Growing Neural Gas,LiDAR,Next-Best-View Problem,Object Reconstruction,Point Cloud}, pages = {11-19}, volume = {41}, id = {0cc08348-8e00-372f-8346-64087e72fb3d}, created = {2021-02-09T17:05:46.762Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T17:05:49.994Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {08ef6744-0b10-438e-84da-dc07f1f8ab47,d9de06c7-f364-422f-ad4b-132381b47b21}, private_publication = {false}, abstract = {This work describes an iterative algorithm for estimating optimal viewpoints, so called next-best-views (NBVs). The goal is to incrementally construct a topological network from the scene during the consecutive acquisition of several views. Our approach is a hybrid method between a surface-based and a volumetric approach with a continuous model space. Hence, a new scan taken from an optimal position should either cover as much as possible from the unknown object surface in one single scan, or densify the existing data and close possible gaps. Based on the point density, we recover the essential and structural information of a scene based on the Growing Neural Gas (GNG) algorithm. From the created graph representation of topological relations, the density of the point cloud at each network node is estimated by approximating the volume of Voronoi cells. The NBV Finder selects a network node as NBV, which has the lowest point density. Our NBV method is self-Terminating when all regions reach a predefined minimum point density or the change of the GNG error is zero. For evaluation, we use a Buddha statue with a rather simple surface geometry but still some concave parts and the Stanford Dragon with a more complex object surface containing occluded and concave parts. We demonstrate that our NBV method outperforms a "naive random" approach relying on uniformly distributed sensor positions in terms of efficiency, i.e. our proposed method reaches a desired minimum point density up to 20% faster with less scans.}, bibtype = {article}, author = {Dierenbach, K. O. and Weinmann, M. and Jutzi, B.}, doi = {10.5194/isprsarchives-XLI-B3-11-2016}, journal = {International Archives of the Photogrammetry, Remote Sensing and Spatial Information Sciences - ISPRS Archives}, number = {July} }
@article{ title = {An information gain formulation for active volumetric 3D reconstruction}, type = {article}, year = {2016}, pages = {3477-3484}, volume = {2016-June}, publisher = {IEEE}, id = {3cd29aea-6ddd-3a5c-a210-bed03c76fa89}, created = {2021-02-09T17:05:46.764Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-03T10:14:33.909Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Isler2016}, folder_uuids = {5439d198-93d5-4603-a7ce-201d423f231e,4f36a0a5-b08a-4f70-b020-4daf83cb0507,d9de06c7-f364-422f-ad4b-132381b47b21}, private_publication = {false}, abstract = {We consider the problem of next-best view selection for volumetric reconstruction of an object by a mobile robot equipped with a camera. Based on a probabilistic volumetric map that is built in real time, the robot can quantify the expected information gain from a set of discrete candidate views. We propose and evaluate several formulations to quantify this information gain for the volumetric reconstruction task, including visibility likelihood and the likelihood of seeing new parts of the object. These metrics are combined with the cost of robot movement in utility functions. The next best view is selected by optimizing these functions, aiming to maximize the likelihood of discovering new parts of the object. We evaluate the functions with simulated and real world experiments within a modular software system that is adaptable to other robotic platforms and reconstruction problems. We release our implementation open source.}, bibtype = {article}, author = {Isler, Stefan and Sabzevari, Reza and Delmerico, Jeffrey and Scaramuzza, Davide}, doi = {10.1109/ICRA.2016.7487527}, journal = {Proceedings - IEEE International Conference on Robotics and Automation} }
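A simplified sketch of the kind of utility the next-best-view papers above optimize: sum the entropy of the occupancy probabilities of the voxels a candidate view would observe, then discount by the cost of moving there. The exponential discount and the precomputed visibility set are assumptions of this sketch, not a faithful reproduction of any one formulation.

import numpy as np

def occupancy_entropy(p):
    p = np.clip(p, 1e-6, 1 - 1e-6)
    return -(p * np.log2(p) + (1 - p) * np.log2(1 - p))

def view_utility(occupancy_probs, visible_voxels, travel_cost, lam=0.5):
    # Expected information gain of the view, traded off against travel cost.
    gain = occupancy_entropy(occupancy_probs[visible_voxels]).sum()
    return gain * np.exp(-lam * travel_cost)

# the candidate with the highest utility is selected as the next best view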
@article{ title = {PointNet: Deep Learning on Point Sets for 3D Classification and Segmentation}, type = {article}, year = {2016}, websites = {http://arxiv.org/abs/1612.00593}, id = {1838e3a2-6979-3baf-8d99-ea5bcebcefeb}, created = {2021-02-15T14:14:03.759Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-22T10:55:22.725Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Qi2016}, folder_uuids = {dbd9a6d6-88f6-4a62-9acd-402fb473145a,cd02f564-0123-4236-a320-b339927f085a}, private_publication = {false}, bibtype = {article}, author = {Qi, Charles R. and Su, Hao and Mo, Kaichun and Guibas, Leonidas J.} }
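The central idea of PointNet, a shared per-point MLP followed by a symmetric max-pooling that makes the global feature invariant to point ordering, fits in a few lines of PyTorch; this is a bare-bones classification sketch, without the input/feature transform networks of the full architecture.

import torch
import torch.nn as nn

class TinyPointNet(nn.Module):
    def __init__(self, num_classes=40):
        super().__init__()
        self.shared_mlp = nn.Sequential(      # applied identically to every point
            nn.Linear(3, 64), nn.ReLU(),
            nn.Linear(64, 1024), nn.ReLU())
        self.classifier = nn.Linear(1024, num_classes)

    def forward(self, points):                # points: (batch, n_points, 3)
        per_point = self.shared_mlp(points)
        global_feature = per_point.max(dim=1).values   # order-invariant pooling
        return self.classifier(global_feature)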
@inproceedings{ title = {Deep residual learning for image recognition}, type = {inproceedings}, year = {2016}, pages = {770-778}, volume = {2016-Decem}, websites = {http://image-net.org/challenges/LSVRC/2015/}, month = {12}, publisher = {IEEE Computer Society}, day = {9}, id = {8b52ddc2-f84c-32c2-9c5c-a2d10fd7d329}, created = {2021-05-05T09:35:03.581Z}, accessed = {2021-05-05}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-05-07T06:07:43.302Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers - 8× deeper than VGG nets [40] but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers. The depth of representations is of central importance for many visual recognition tasks. Solely due to our extremely deep representations, we obtain a 28% relative improvement on the COCO object detection dataset. Deep residual nets are foundations of our submissions to ILSVRC & COCO 2015 competitions1, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation.}, bibtype = {inproceedings}, author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, doi = {10.1109/CVPR.2016.90}, booktitle = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
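For reference, the basic residual block the abstract refers to (two 3x3 convolutions whose output is added back onto the block input) looks roughly as follows in PyTorch, for the case where input and output shapes match; projection shortcuts and the full network are omitted.

import torch.nn as nn

class BasicBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        return self.relu(out + x)             # identity shortcut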
@article{ title = {Convolutional neural networks on graphs with fast localized spectral filtering}, type = {article}, year = {2016}, pages = {3844-3852}, id = {56ff1845-0b5b-3077-ac1c-edababd94f45}, created = {2021-05-06T06:36:58.071Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T10:19:37.547Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {368c6572-df92-4840-8400-80e7c9ee2dd7,20ccb950-fef9-4ee1-800c-a60ba9f1df16,70eb910f-9399-46d8-a4d0-ade5435237b7}, private_publication = {false}, abstract = {In this work, we are interested in generalizing convolutional neural networks (CNNs) from low-dimensional regular grids, where image, video and speech are represented, to high-dimensional irregular domains, such as social networks, brain connectomes or words' embedding, represented by graphs. We present a formulation of CNNs in the context of spectral graph theory, which provides the necessary mathematical background and efficient numerical schemes to design fast localized convolutional filters on graphs. Importantly, the proposed technique offers the same linear computational complexity and constant learning complexity as classical CNNs, while being universal to any graph structure. Experiments on MNIST and 20NEWS demonstrate the ability of this novel deep learning system to learn local, stationary, and compositional features on graphs.}, bibtype = {article}, author = {Defferrard, Michaël and Bresson, Xavier and Vandergheynst, Pierre}, journal = {Advances in Neural Information Processing Systems}, number = {Nips} }
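The "fast localized spectral filtering" in the title boils down to a Chebyshev polynomial recurrence on a rescaled graph Laplacian; a minimal NumPy sketch of filtering a single graph signal follows, where L_rescaled and the coefficient vector theta (K >= 2 entries) are assumed inputs rather than learned parameters.

import numpy as np

def chebyshev_filter(L_rescaled, x, theta):
    # T_0(x) = x, T_1(x) = L~ x, T_k(x) = 2 L~ T_{k-1}(x) - T_{k-2}(x)
    t_prev, t_curr = x, L_rescaled @ x
    out = theta[0] * t_prev + theta[1] * t_curr
    for k in range(2, len(theta)):
        t_prev, t_curr = t_curr, 2 * (L_rescaled @ t_curr) - t_prev
        out = out + theta[k] * t_curr
    return out

# L_rescaled is typically 2 L / lambda_max - I, so its spectrum lies in [-1, 1]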
@article{ title = {Local minima in training of neural networks}, type = {article}, year = {2016}, pages = {1-12}, websites = {http://arxiv.org/abs/1611.06310}, id = {53066766-0e43-3be5-ab6f-934e45c439f9}, created = {2021-07-12T14:15:35.822Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:16:53.031Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {There has been a lot of recent interest in trying to characterize the error surface of deep models. This stems from a long standing question. Given that deep networks are highly nonlinear systems optimized by local gradient methods, why do they not seem to be affected by bad local minima? It is widely believed that training of deep models using gradient methods works so well because the error surface either has no local minima, or if they exist they need to be close in value to the global minimum. It is known that such results hold under very strong assumptions which are not satisfied by real models. In this paper we present examples showing that for such theorem to be true additional assumptions on the data, initialization schemes and/or the model classes have to be made. We look at the particular case of finite size datasets. We demonstrate that in this scenario one can construct counter-examples (datasets or initialization schemes) when the network does become susceptible to bad local minima over the weight space.}, bibtype = {article}, author = {Swirszcz, Grzegorz and Czarnecki, Wojciech Marian and Pascanu, Razvan} }
@article{ title = {Ship Rotated Bounding Box Space for Ship Extraction from High-Resolution Optical Satellite Images with Complex Backgrounds}, type = {article}, year = {2016}, keywords = {Ship detection,ship extraction from complex backgrounds,ship rotated bounding box space}, pages = {1074-1078}, volume = {13}, id = {1f3784c3-6d66-34c7-bae3-3c80ec62be38}, created = {2021-07-12T14:15:35.956Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:16:55.484Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {Extracting ships from complex backgrounds is the bottleneck of ship detection in high-resolution optical satellite images. In this letter, we propose a nearly closed-form ship rotated bounding box space used for ship detection and design a method to generate a small number of highly potential candidates based on this space. We first analyze the possibility of accurately covering all ships by labeling rotated bounding boxes. Moreover, to reduce search space, we construct a nearly closed-form ship rotated bounding box space. Then, by scoring for each latent candidate in the space using a two-cascaded linear model followed by binary linear programming, we select a small number of highly potential candidates. Moreover, we also propose a fast version of our method. Experiments on our data set validate the effectiveness of our method and the efficiency of its fast version, which achieves a close detection rate in near real time.}, bibtype = {article}, author = {Liu, Zikun and Wang, Hongzhen and Weng, Lubin and Yang, Yiping}, doi = {10.1109/LGRS.2016.2565705}, journal = {IEEE Geoscience and Remote Sensing Letters}, number = {8} }
@article{ title = {Dense human body correspondences using convolutional networks}, type = {article}, year = {2016}, pages = {1544-1553}, volume = {2016-Decem}, id = {2a988fe6-7921-3501-b140-1c6c2750e666}, created = {2021-08-28T19:32:57.430Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-28T19:33:13.974Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {We propose a deep learning approach for finding dense correspondences between 3D scans of people. Our method requires only partial geometric information in the form of two depth maps or partial reconstructed surfaces, works for humans in arbitrary poses and wearing any clothing, does not require the two people to be scanned from similar view-points, and runs in real time. We use a deep convolutional neural network to train a feature descriptor on depth map pixels, but crucially, rather than training the network to solve the shape correspondence problem directly, we train it to solve a body region classification problem, modified to increase the smoothness of the learned descriptors near region boundaries. This approach ensures that nearby points on the human body are nearby in feature space, and vice versa, rendering the feature descriptor suitable for computing dense correspondences between the scans. We validate our method on real and synthetic data for both clothed and unclothed humans, and show that our correspondences are more robust than is possible with state-of-the-art unsupervised methods, and more accurate than those found using methods that require full watertight 3D geometry.}, bibtype = {article}, author = {Wei, Lingyu and Huang, Qixing and Ceylan, Duygu and Vouga, Etienne and Li, Hao}, doi = {10.1109/CVPR.2016.171}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Recent Trends, Applications, and Perspectives in 3D Shape Similarity Assessment}, type = {article}, year = {2016}, keywords = {3D shape distances,3D shape matching,I.3.5 [Computer Graphics]: Computational Geometry,map-based correspondence}, pages = {87-119}, volume = {35}, id = {ffbaa3bd-a340-3c79-a95c-8220c5026bde}, created = {2021-08-28T19:32:57.474Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:24.355Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {The recent introduction of 3D shape analysis frameworks able to quantify the deformation of a shape into another in terms of the variation of real functions yields a new interpretation of the 3D shape similarity assessment and opens new perspectives. Indeed, while the classical approaches to similarity mainly quantify it as a numerical score, map-based methods also define (dense) shape correspondences. After presenting in detail the theoretical foundations underlying these approaches, we classify them by looking at their most salient features, including the kind of structure and invariance properties they capture, as well as the distances and the output modalities according to which the similarity between shapes is assessed and returned. We also review the usage of these methods in a number of 3D shape application domains, ranging from matching and retrieval to annotation and segmentation. Finally, the most promising directions for future research developments are discussed.}, bibtype = {article}, author = {Biasotti, S. and Cerri, A. and Bronstein, A. and Bronstein, M.}, doi = {10.1111/cgf.12734}, journal = {Computer Graphics Forum}, number = {6} }
@article{ title = {Graph-based compression of dynamic 3D point cloud sequences}, type = {article}, year = {2016}, keywords = {3D sequences,graph-based features,motion compensation,spectral graph wavelets,voxels}, pages = {1765-1778}, volume = {25}, id = {2e948a93-2895-3c5d-8d43-e49865858b47}, created = {2021-08-29T22:27:22.258Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:22.265Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {This paper addresses the problem of compression of 3D point cloud sequences that are characterized by moving 3D positions and color attributes. As temporally successive point cloud frames share some similarities, motion estimation is key to effective compression of these sequences. It, however, remains a challenging problem as the point cloud frames have varying numbers of points without explicit correspondence information. We represent the time-varying geometry of these sequences with a set of graphs, and consider 3D positions and color attributes of the point clouds as signals on the vertices of the graphs. We then cast motion estimation as a feature-matching problem between successive graphs. The motion is estimated on a sparse set of representative vertices using new spectral graph wavelet descriptors. A dense motion field is eventually interpolated by solving a graph-based regularization problem. The estimated motion is finally used for removing the temporal redundancy in the predictive coding of the 3D positions and the color characteristics of the point cloud sequences. Experimental results demonstrate that our method is able to accurately estimate the motion between consecutive frames. Moreover, motion estimation is shown to bring a significant improvement in terms of the overall compression performance of the sequence. To the best of our knowledge, this is the first paper that exploits both the spatial correlation inside each frame (through the graph) and the temporal correlation between the frames (through the motion estimation) to compress the color and the geometry of 3D point cloud sequences in an efficient way.}, bibtype = {article}, author = {Thanou, Dorina and Chou, Philip A. and Frossard, Pascal}, doi = {10.1109/TIP.2016.2529506}, journal = {IEEE Transactions on Image Processing}, number = {4} }
@article{ title = {Efficient and flexible deformation representation for data-driven surface modeling}, type = {article}, year = {2016}, volume = {35}, id = {45924720-3f5b-3aee-898b-416a8281617a}, created = {2021-09-15T06:44:56.130Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.874Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Gao2016}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, abstract = {Effectively characterizing the behavior of deformable objects has wide applicability but remains challenging. We present a new rotation-invariant deformation representation and a novel reconstruction algorithm to accurately reconstruct the positions and local rotations simultaneously. Meshes can be very efficiently reconstructed from our representation by matrix predecomposition, while, at the same time, hard or soft constraints can be flexibly specified with only positions of handles needed. Our approach is thus particularly suitable for constrained deformations guided by examples, providing significant benefits over state-of-The-Art methods. Based on this, we further propose novel data-driven approaches to mesh deformation and non-rigid registration of deformable objects. Both problems are formulated consistently as finding an optimized model in the shape space that satisfies boundary constraints, either specified by the user, or according to the scan. By effectively exploiting the knowledge in the shape space, our method produces realistic deformation results in real-Time and produces high quality registrations from a template model to a single noisy scan captured using a low-quality depth camera, outperforming state-of-The-Art methods. Categories and Subject Descriptors: I.3.7 [Computer Graphics]: Three-Dimensional Graphics and Realism-Animation; I.3.5 [Computer Graphics]: Computational Geometry and Object Modeling-Surface representation.}, bibtype = {article}, author = {Gao, Lin and Lai, Yu Kun and Liang, Dun and Chen, Shu Yu and Xia, Shihong}, doi = {10.1145/2908736}, journal = {ACM Transactions on Graphics}, number = {5} }
@article{ title = {Generative and Discriminative Voxel Modeling with Convolutional Neural Networks}, type = {article}, year = {2016}, websites = {http://arxiv.org/abs/1608.04236}, id = {4e904471-a10a-345a-944c-dd3243d1cd3c}, created = {2021-10-19T09:02:47.407Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:11.535Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Brock2016}, folder_uuids = {a6db5ca6-7f95-48a4-bc40-9e41eea78434,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {When working with three-dimensional data, choice of representation is key. We explore voxel-based models, and present evidence for the viability of voxellated representations in applications including shape modeling and object classification. Our key contributions are methods for training voxel-based variational autoencoders, a user interface for exploring the latent space learned by the autoencoder, and a deep convolutional neural network architecture for object classification. We address challenges unique to voxel-based representations, and empirically evaluate our models on the ModelNet benchmark, where we demonstrate a 51.5% relative improvement in the state of the art for object classification.}, bibtype = {article}, author = {Brock, Andrew and Lim, Theodore and Ritchie, J. M. and Weston, Nick} }
@article{ title = {Group equivariant convolutional networks}, type = {article}, year = {2016}, keywords = {()}, pages = {4375-4386}, volume = {6}, id = {ed843f1c-e446-329c-944b-dc525bf99194}, created = {2021-10-26T08:17:02.750Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.855Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Cohen2016}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,cd02f564-0123-4236-a320-b339927f085a}, private_publication = {false}, abstract = {We introduce Group equivariant Convolutional Neural Networks (G-CNNs), a natural general-ization of convolutional neural networks that reduces sample complexity by exploiting symmetries. G-CNNs use G-convolutions, a new type of layer that enjoys a substantially higher degree of weight sharing than regular convolution layers. G-convolutions increase the expressive capacity of the network without increasing the number of parameters. Group convolution layers are easy to use and can be implemented with negligible computational overhead for discrete groups gen-erated by translations, reflections and rotations. G-CNNs achieve state of the art results on CI- FAR10 and rotated MNIST.}, bibtype = {article}, author = {Cohen, Taco S. and Welling, Max}, journal = {33rd International Conference on Machine Learning, ICML 2016} }
@article{ title = {Multi-scale context aggregation by dilated convolutions}, type = {article}, year = {2016}, id = {a6d94979-ac3c-3ef6-a79f-9a6d5602cf15}, created = {2021-11-01T10:14:38.777Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.218Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Yu2016}, folder_uuids = {cd02f564-0123-4236-a320-b339927f085a}, private_publication = {false}, abstract = {State-of-the-art models for semantic segmentation are based on adaptations of convolutional networks that had originally been designed for image classification. However, dense prediction problems such as semantic segmentation are structurally different from image classification. In this work, we develop a new convolutional network module that is specifically designed for dense prediction. The presented module uses dilated convolutions to systematically aggregate multi-scale contextual information without losing resolution. The architecture is based on the fact that dilated convolutions support exponential expansion of the receptive field without loss of resolution or coverage. We show that the presented context module increases the accuracy of state-of-the-art semantic segmentation systems. In addition, we examine the adaptation of image classification networks to dense prediction and show that simplifying the adapted network can increase accuracy.}, bibtype = {article}, author = {Yu, Fisher and Koltun, Vladlen}, journal = {4th International Conference on Learning Representations, ICLR 2016 - Conference Track Proceedings} }
@article{ title = {A Structured Variational Auto-encoder for Learning Deep Hierarchies of Sparse Features}, type = {article}, year = {2016}, pages = {1-3}, websites = {http://arxiv.org/abs/1602.08734}, id = {c2606a6d-dbdc-3cbc-b2cf-60d531cd6166}, created = {2021-11-26T10:09:16.159Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.709Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Salimans2016}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, abstract = {In this note we present a generative model of natural images consisting of a deep hierarchy of layers of latent random variables, each of which follows a new type of distribution that we call rectified Gaussian. These rectified Gaussian units allow spike-and-slab type sparsity, while retaining the differentiability necessary for efficient stochastic gradient variational inference. To learn the parameters of the new model, we approximate the posterior of the latent variables with a variational auto-encoder. Rather than making the usual mean-field assumption however, the encoder parameterizes a new type of structured variational approximation that retains the prior dependencies of the generative model. Using this structured posterior approximation, we are able to perform joint training of deep models with many layers of latent random variables, without having to resort to stacking or other layerwise training procedures.}, bibtype = {article}, author = {Salimans, Tim} }
@article{ title = {Improved variational inference with inverse autoregressive flow}, type = {article}, year = {2016}, pages = {4743-4751}, id = {bc61de87-95a2-34cf-907f-dc6a7f4dd8b6}, created = {2021-11-26T10:09:16.277Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.503Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kingma2016}, folder_uuids = {a6db5ca6-7f95-48a4-bc40-9e41eea78434}, private_publication = {false}, abstract = {The framework of normalizing flows provides a general strategy for flexible variational inference of posteriors over latent variables. We propose a new type of normalizing flow, inverse autoregressive flow (IAF), that, in contrast to earlier published flows, scales well to high-dimensional latent spaces. The proposed flow consists of a chain of invertible transformations, where each transformation is based on an autoregressive neural network. In experiments, we show that IAF significantly improves upon diagonal Gaussian approximate posteriors. In addition, we demonstrate that a novel type of variational autoencoder, coupled with IAF, is competitive with neural autoregressive models in terms of attained log-likelihood on natural images, while allowing significantly faster synthesis.}, bibtype = {article}, author = {Kingma, Diederik P. and Salimans, Tim and Jozefowicz, Rafal and Chen, Xi and Sutskever, Ilya and Welling, Max}, journal = {Advances in Neural Information Processing Systems}, number = {Nips} }
@article{ title = {Deep Learning 3D Shape Surfaces Using Geometry Images}, type = {article}, year = {2016}, keywords = {deep learning}, pages = {VII-IX}, volume = {9906 LNCS}, id = {28ca6f85-183d-3cd5-b018-27c7dd57000b}, created = {2022-01-17T06:19:31.135Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:10.294Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {AyanSinhaJingBai2016}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a}, private_publication = {false}, bibtype = {article}, author = {Sinha, Ayan and Bai, Jing and Ramani, Karthik}, doi = {10.1007/978-3-319-46466-4}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {Deep learning 3D shape surfaces using geometry images - supplementary material}, type = {article}, year = {2016}, keywords = {3D shape,CNN,Deep learning,Geometry images,Surfaces}, pages = {223-240}, volume = {9910 LNCS}, id = {3cb4bc36-1017-3124-a77e-de0ebf282c45}, created = {2022-01-19T09:08:51.291Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.530Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Sinha2016}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a}, private_publication = {false}, abstract = {Surfaces serve as a natural parametrization to 3D shapes. Learning surfaces using convolutional neural networks (CNNs) is a challenging task. Current paradigms to tackle this challenge are to either adapt the convolutional filters to operate on surfaces, learn spectral descriptors defined by the Laplace-Beltrami operator, or to drop surfaces altogether in lieu of voxelized inputs. Here we adopt an approach of converting the 3D shape into a ‘geometry image’ so that standard CNNs can directly be used to learn 3D shapes. We qualitatively and quantitatively validate that creating geometry images using authalic parametrization on a spherical domain is suitable for robust learning of 3D shape surfaces. This spherically parameterized shape is then projected and cut to convert the original 3D shape into a flat and regular geometry image. We propose a way to implicitly learn the topology and structure of 3D shapes using geometry images encoded with suitable features. We show the efficacy of our approach to learn 3D shape surfaces for classification and retrieval tasks on non-rigid and rigid shape datasets.}, bibtype = {article}, author = {Sinha, Ayan and Bai, Jing and Ramani, Karthik}, doi = {10.1007/978-3-319-46466-4_14}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@article{ title = {Unsupervised representation learning with deep convolutional generative adversarial networks}, type = {article}, year = {2016}, pages = {1-16}, id = {2c609d84-c2c1-328d-9df4-4d7b9b2146da}, created = {2022-01-25T08:11:53.491Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.659Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Radford2016}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a}, private_publication = {false}, abstract = {In recent years, supervised learning with convolutional networks (CNNs) has seen huge adoption in computer vision applications. Comparatively, unsupervised learning with CNNs has received less attention. In this work we hope to help bridge the gap between the success of CNNs for supervised learning and unsupervised learning. We introduce a class of CNNs called deep convolutional generative adversarial networks (DCGANs), that have certain architectural constraints, and demonstrate that they are a strong candidate for unsupervised learning. Training on various image datasets, we show convincing evidence that our deep convolutional adversarial pair learns a hierarchy of representations from object parts to scenes in both the generator and discriminator. Additionally, we use the learned features for novel tasks - demonstrating their applicability as general image representations.}, bibtype = {article}, author = {Radford, Alec and Metz, Luke and Chintala, Soumith}, journal = {4th International Conference on Learning Representations, ICLR 2016 - Conference Track Proceedings} }
@article{ title = {Learning a probabilistic latent space of object shapes via 3D generative-adversarial modeling}, type = {article}, year = {2016}, pages = {82-90}, id = {88705a5a-78a1-37e1-a854-8d914409efe9}, created = {2022-02-08T07:56:57.258Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T17:25:38.256Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Wu2016}, folder_uuids = {b6d75013-efe2-4ddc-b3db-65496bd4db9f,10e04504-7e21-4b84-9037-5a4431df1a8a,1853f94b-7af1-40fa-b068-4758e9a02bc4,a3bb779f-cc19-49e7-8c16-ed7f763ec870,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {We study the problem of 3D object generation. We propose a novel framework, namely 3D Generative Adversarial Network (3D-GAN), which generates 3D objects from a probabilistic space by leveraging recent advances in volumetric convo-lutional networks and generative adversarial nets. The benefits of our model are three-fold: first, the use of an adversarial criterion, instead of traditional heuristic criteria, enables the generator to capture object structure implicitly and to synthesize high-quality 3D objects; second, the generator establishes a mapping from a low-dimensional probabilistic space to the space of 3D objects, so that we can sample objects without a reference image or CAD models, and explore the 3D object manifold; third, the adversarial discriminator provides a powerful 3D shape descriptor which, learned without supervision, has wide applications in 3D object recognition. Experiments demonstrate that our method generates high-quality 3D objects, and our unsupervisedly learned features achieve impressive performance on 3D object recognition, comparable with those of supervised learning methods.}, bibtype = {article}, author = {Wu, Jiajun and Zhang, Chengkai and Xue, Tianfan and Freeman, William T. and Tenenbaum, Joshua B.}, journal = {Advances in Neural Information Processing Systems}, number = {Nips} }
@article{ title = {3DMatch: Learning Local Geometric Descriptors from RGB-D Reconstructions}, type = {article}, year = {2016}, pages = {199-208}, volume = {2017-Janua}, websites = {https://arxiv.org/abs/1603.08182v3}, month = {3}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, day = {27}, id = {a74d5abc-69e1-3576-b4f1-447fa2b148ca}, created = {2022-02-15T12:30:04.925Z}, accessed = {2022-02-15}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T06:51:19.210Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {Matching local geometric features on real-world depth images is a challenging task due to the noisy, low-resolution, and incomplete nature of 3D scan data. These difficulties limit the performance of current state-of-art methods, which are typically based on histograms over geometric properties. In this paper, we present 3DMatch, a data-driven model that learns a local volumetric patch descriptor for establishing correspondences between partial 3D data. To amass training data for our model, we propose a self-supervised feature learning method that leverages the millions of correspondence labels found in existing RGB-D reconstructions. Experiments show that our descriptor is not only able to match local geometry in new scenes for reconstruction, but also generalize to different tasks and spatial scales (e.g. instance-level object model alignment for the Amazon Picking Challenge, and mesh surface correspondence). Results show that 3DMatch consistently outperforms other state-of-the-art approaches by a significant margin. Code, data, benchmarks, and pre-trained models are available online at http://3dmatch.cs.princeton.edu}, bibtype = {article}, author = {Zeng, Andy and Song, Shuran and Nießner, Matthias and Fisher, Matthew and Xiao, Jianxiong and Funkhouser, Thomas}, doi = {10.1109/CVPR.2017.29}, journal = {Proceedings - 30th IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017} }
@article{ title = {Fast Semantic Segmentation of 3D Point Clouds With Strongly Varying Density}, type = {article}, year = {2016}, keywords = {features,lidar,multiscale,point clouds,scene understanding,semantic classification}, pages = {177-184}, volume = {III-3}, id = {08392be4-10c8-3c51-96f9-8537c7ebfc6e}, created = {2022-02-24T07:44:06.229Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-28T10:56:57.328Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {1e7b477c-c241-48c3-a542-ad06e3d39dd5}, private_publication = {false}, abstract = {We describe an effective and efficient method for point-wise semantic classification of 3D point clouds. The method can handle unstructured and inhomogeneous point clouds such as those derived from static terrestrial LiDAR or photogammetric reconstruction; and it is computationally efficient, making it possible to process point clouds with many millions of points in a matter of minutes. The key issue, both to cope with strong variations in point density and to bring down computation time, turns out to be careful handling of neighborhood relations. By choosing appropriate definitions of a point’s (multi-scale) neighborhood, we obtain a feature set that is both expressive and fast to compute. We evaluate our classification method both on benchmark data from a mobile mapping platform and on a variety of large, terrestrial laser scans with greatly varying point density. The proposed feature set outperforms the state of the art with respect to per-point classification accuracy, while at the same time being much faster to compute.}, bibtype = {article}, author = {Hackel, Timo and Wegner, Jan D. and Schindler, Konrad}, doi = {10.5194/isprsannals-iii-3-177-2016}, journal = {ISPRS Annals of Photogrammetry, Remote Sensing and Spatial Information Sciences} }
@article{ title = {Data Science with Graphs: A Signal Processing Perspective}, type = {article}, year = {2016}, keywords = {0544:Electrical engineering,0984:Computer science,Applied sciences,Computer science,Data mining,Electrical engineering,Machine learning,Network science,Signal processing}, pages = {274}, websites = {https://manchester.idm.oclc.org/login?url=https://search.proquest.com/docview/1864668231?accountid=12253%0Ahttp://man-fe.hosted.exlibrisgroup.com/openurl/44MAN/44MAN_services_page?genre=dissertations+%26+theses&atitle=&author=Chen%2C+Siheng&volume=&issue=}, id = {dc63b210-6f9e-301a-92c0-341d1094e142}, created = {2022-03-02T07:02:50.311Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-02T07:03:00.387Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {84eaadea-8864-4baf-9a7a-b5a2f5b96449}, private_publication = {false}, abstract = {A massive amount of data is being generated at an unprecedented level from a diversity of sources, including social media, internet services, biological studies, physical infrastructure monitoring and many others. The necessity of analyzing such complex data has led to the birth of an emerging framework, graph signal processing. This framework offers an unified and mathematically rigorous paradigm for the analysis of high-dimensional data with complex and irregular structure. It extends fundamental signal processing concepts such as signals, Fourier transform, frequency response and filtering, from signals residing on regular lattices, which have been studied by the classical signal processing theory, to data residing on general graphs, which are called graph signals. In this thesis, we consider five fundamental tasks on graphs from the perspective of graph signal processing: representation, sampling, recovery, detection and localization. Representation, aiming to concisely model shapes of graph signals, is at the heart of the proposed techniques. Sampling followed by recovery, aiming to reconstruct an original graph signal from a few selected samples, is applicable in semi-supervised learning and user profiling in online social networks. Detection followed by localization, aiming to identify and localize targeted patterns in noisy graph signals, is related to many real-world applications, such as localizing virus attacks in cyber-physical systems, localizing stimuli in brain connectivity networks, and mining traffic events in city street networks, to name just a few. We illustrate the power of the proposed tools on two real-world problems: fast resampling of 3D point clouds and mining of urban traffic data.}, bibtype = {article}, author = {Chen, Siheng}, journal = {ProQuest Dissertations and Theses} }
@inproceedings{ title = {Face Alignment Across Large Poses: A 3D Solution}, type = {inproceedings}, year = {2016}, pages = {146-155}, websites = {https://openaccess.thecvf.com/content_cvpr_2016/html/Zhu_Face_Alignment_Across_CVPR_2016_paper.html}, id = {6f573dd7-0665-3bed-9107-390acb7e0e52}, created = {2022-03-28T09:45:00.926Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:01:31.189Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {zhuFaceAlignmentLarge2016}, source_type = {inproceedings}, short_title = {Face Alignment Across Large Poses}, private_publication = {false}, bibtype = {inproceedings}, author = {Zhu, Xiangyu and Lei, Zhen and Liu, Xiaoming and Shi, Hailin and Li, Stan Z} }
@article{ title = {Adversarial Autoencoders}, type = {article}, year = {2016}, keywords = {Computer Science - Machine Learning}, websites = {http://arxiv.org/abs/1511.05644}, month = {5}, id = {0b58934c-3777-3d5e-b23c-1ca022b259a9}, created = {2022-03-28T09:45:01.935Z}, accessed = {2021-10-01}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:03:08.121Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {makhzaniAdversarialAutoencoders2016}, source_type = {article}, notes = {arXiv: 1511.05644}, private_publication = {false}, abstract = {In this paper, we propose the "adversarial autoencoder" (AAE), which is a probabilistic autoencoder that uses the recently proposed generative adversarial networks (GAN) to perform variational inference by matching the aggregated posterior of the hidden code vector of the autoencoder with an arbitrary prior distribution. Matching the aggregated posterior to the prior ensures that generating from any part of prior space results in meaningful samples. As a result, the decoder of the adversarial autoencoder learns a deep generative model that maps the imposed prior to the data distribution. We show how the adversarial autoencoder can be used in applications such as semi-supervised classification, disentangling style and content of images, unsupervised clustering, dimensionality reduction and data visualization. We performed experiments on MNIST, Street View House Numbers and Toronto Face datasets and show that adversarial autoencoders achieve competitive results in generative modeling and semi-supervised classification tasks.}, bibtype = {article}, author = {Makhzani, Alireza and Shlens, Jonathon and Jaitly, Navdeep and Goodfellow, Ian and Frey, Brendan}, journal = {arXiv:1511.05644 [cs]} }
@inproceedings{ title = {Deep Residual Learning for Image Recognition}, type = {inproceedings}, year = {2016}, pages = {770-778}, websites = {https://openaccess.thecvf.com/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html}, id = {533eab48-7c62-340b-b39e-6ad404e1ce3d}, created = {2022-03-28T09:45:01.960Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:02:32.485Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {heDeepResidualLearning2016}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian} }
@inproceedings{ title = {Unsupervised Deep Embedding for Clustering Analysis}, type = {inproceedings}, year = {2016}, pages = {478-487}, websites = {https://proceedings.mlr.press/v48/xieb16.html}, month = {6}, publisher = {PMLR}, id = {86aee160-b3b4-300a-a442-d386d5768488}, created = {2022-03-28T09:45:02.232Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:03:30.293Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {xieUnsupervisedDeepEmbedding2016}, source_type = {inproceedings}, notes = {ISSN: 1938-7228}, private_publication = {false}, abstract = {Clustering is central to many data-driven application domains and has been studied extensively in terms of distance functions and grouping algorithms. Relatively little work has focused on learning representations for clustering. In this paper, we propose Deep Embedded Clustering (DEC), a method that simultaneously learns feature representations and cluster assignments using deep neural networks. DEC learns a mapping from the data space to a lower-dimensional feature space in which it iteratively optimizes a clustering objective. Our experimental evaluations on image and text corpora show significant improvement over state-of-the-art methods.}, bibtype = {inproceedings}, author = {Xie, Junyuan and Girshick, Ross and Farhadi, Ali}, booktitle = {Proceedings of The 33rd International Conference on Machine Learning} }
@article{ title = {Theano: A Python framework for fast computation of mathematical expressions}, type = {article}, year = {2016}, keywords = {Computer Science - Machine Learning,Computer Science - Mathematical Software,Computer Science - Symbolic Computation}, websites = {http://arxiv.org/abs/1605.02688}, month = {5}, id = {d3f1b862-4105-3f35-8b2c-d31400661f53}, created = {2022-03-28T09:45:02.292Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:03:01.594Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {thetheanodevelopmentteamTheanoPythonFramework2016}, source_type = {article}, short_title = {Theano}, notes = {arXiv: 1605.02688}, private_publication = {false}, abstract = {Theano is a Python library that allows to define, optimize, and evaluate mathematical expressions involving multi-dimensional arrays efficiently. Since its introduction, it has been one of the most used CPU and GPU mathematical compilers - especially in the machine learning community - and has shown steady performance improvements. Theano is being actively and continuously developed since 2008, multiple frameworks have been built on top of it and it has been used to produce many state-of-the-art machine learning models. The present article is structured as follows. Section I provides an overview of the Theano software and its community. Section II presents the principal features of Theano and how to use them, and compares them with other similar projects. Section III focuses on recently-introduced functionalities and improvements. Section IV compares the performance of Theano against Torch7 and TensorFlow on several machine learning models. 
Section V discusses current limitations of Theano and potential ways of improving it.}, bibtype = {article}, author = {Team, The Theano Development and Al-Rfou, Rami and Alain, Guillaume and Almahairi, Amjad and Angermueller, Christof and Bahdanau, Dzmitry and Ballas, Nicolas and Bastien, Frédéric and Bayer, Justin and Belikov, Anatoly and Belopolsky, Alexander and Bengio, Yoshua and Bergeron, Arnaud and Bergstra, James and Bisson, Valentin and Snyder, Josh Bleecher and Bouchard, Nicolas and Boulanger-Lewandowski, Nicolas and Bouthillier, Xavier and de Brébisson, Alexandre and Breuleux, Olivier and Carrier, Pierre-Luc and Cho, Kyunghyun and Chorowski, Jan and Christiano, Paul and Cooijmans, Tim and Côté, Marc-Alexandre and Côté, Myriam and Courville, Aaron and Dauphin, Yann N and Delalleau, Olivier and Demouth, Julien and Desjardins, Guillaume and Dieleman, Sander and Dinh, Laurent and Ducoffe, Mélanie and Dumoulin, Vincent and Kahou, Samira Ebrahimi and Erhan, Dumitru and Fan, Ziye and Firat, Orhan and Germain, Mathieu and Glorot, Xavier and Goodfellow, Ian and Graham, Matt and Gulcehre, Caglar and Hamel, Philippe and Harlouchet, Iban and Heng, Jean-Philippe and Hidasi, Balázs and Honari, Sina and Jain, Arjun and Jean, Sébastien and Jia, Kai and Korobov, Mikhail and Kulkarni, Vivek and Lamb, Alex and Lamblin, Pascal and Larsen, Eric and Laurent, César and Lee, Sean and Lefrancois, Simon and Lemieux, Simon and Léonard, Nicholas and Lin, Zhouhan and Livezey, Jesse A and Lorenz, Cory and Lowin, Jeremiah and Ma, Qianli and Manzagol, Pierre-Antoine and Mastropietro, Olivier and McGibbon, Robert T and Memisevic, Roland and van Merriënboer, Bart and Michalski, Vincent and Mirza, Mehdi and Orlandi, Alberto and Pal, Christopher and Pascanu, Razvan and Pezeshki, Mohammad and Raffel, Colin and Renshaw, Daniel and Rocklin, Matthew and Romero, Adriana and Roth, Markus and Sadowski, Peter and Salvatier, John and Savard, François and Schlüter, Jan and Schulman, John and Schwartz, Gabriel and Serban, Iulian Vlad and Serdyuk, Dmitriy and Shabanian, Samira and Simon, Étienne and Spieckermann, Sigurd and Subramanyam, S Ramana and Sygnowski, Jakub and Tanguay, Jérémie and van Tulder, Gijs and Turian, Joseph and Urban, Sebastian and Vincent, Pascal and Visin, Francesco and de Vries, Harm and Warde-Farley, David and Webb, Dustin J and Willson, Matthew and Xu, Kelvin and Xue, Lijun and Yao, Li and Zhang, Saizheng and Zhang, Ying}, journal = {arXiv:1605.02688 [cs]} }
@article{ title = {Generating Sentences from a Continuous Space}, type = {article}, year = {2016}, keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning}, websites = {http://arxiv.org/abs/1511.06349}, month = {5}, id = {8ce71856-eb90-3d63-8ec5-a2be7ac26b70}, created = {2022-03-28T09:45:02.566Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:03:45.399Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {bowmanGeneratingSentencesContinuous2016}, source_type = {article}, notes = {arXiv: 1511.06349}, private_publication = {false}, abstract = {The standard recurrent neural network language model (RNNLM) generates sentences one word at a time and does not work from an explicit global sentence representation. In this work, we introduce and study an RNN-based variational autoencoder generative model that incorporates distributed latent representations of entire sentences. This factorization allows it to explicitly model holistic properties of sentences such as style, topic, and high-level syntactic features. Samples from the prior over these sentence representations remarkably produce diverse and well-formed sentences through simple deterministic decoding. By examining paths through this latent space, we are able to generate coherent novel sentences that interpolate between known sentences. We present techniques for solving the difficult learning problem presented by this model, demonstrate its effectiveness in imputing missing words, explore many interesting properties of the model's latent sentence space, and present negative results on the use of the model in language modeling.}, bibtype = {article}, author = {Bowman, Samuel R and Vilnis, Luke and Vinyals, Oriol and Dai, Andrew M and Jozefowicz, Rafal and Bengio, Samy}, journal = {arXiv:1511.06349 [cs]} }
@inproceedings{ title = {Keep It SMPL: Automatic Estimation of 3D Human Pose and Shape from a Single Image}, type = {inproceedings}, year = {2016}, keywords = {2D to 3D,3D body shape,CNN,Human pose}, pages = {561-578}, publisher = {Springer International Publishing}, city = {Cham}, series = {Lecture Notes in Computer Science}, id = {5fcc0d3a-9dc1-32d5-bd4d-503d4cb9cde7}, created = {2022-03-28T09:45:03.110Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:04:56.513Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {bogoKeepItSMPL2016}, source_type = {inproceedings}, short_title = {Keep It SMPL}, private_publication = {false}, abstract = {We describe the first method to automatically estimate the 3D pose of the human body as well as its 3D shape from a single unconstrained image. We estimate a full 3D mesh and show that 2D joints alone carry a surprising amount of information about body shape. The problem is challenging because of the complexity of the human body, articulation, occlusion, clothing, lighting, and the inherent ambiguity in inferring 3D from 2D. To solve this, we first use a recently published CNN-based method, DeepCut, to predict (bottom-up) the 2D body joint locations. We then fit (top-down) a recently published statistical body shape model, called SMPL, to the 2D joints. We do so by minimizing an objective function that penalizes the error between the projected 3D model joints and detected 2D joints. Because SMPL captures correlations in human shape across the population, we are able to robustly fit it to very little data. We further leverage the 3D model to prevent solutions that cause interpenetration. We evaluate our method, SMPLify, on the Leeds Sports, HumanEva, and Human3.6M datasets, showing superior pose accuracy with respect to the state of the art.}, bibtype = {inproceedings}, author = {Bogo, Federica and Kanazawa, Angjoo and Lassner, Christoph and Gehler, Peter and Romero, Javier and Black, Michael J}, editor = {Leibe, Bastian and Matas, Jiri and Sebe, Nicu and Welling, Max}, doi = {10.1007/978-3-319-46454-1_34}, booktitle = {Computer Vision – ECCV 2016} }
@article{ title = {An Uncertain Future: Forecasting from Static Images using Variational Autoencoders}, type = {article}, year = {2016}, keywords = {Computer Science - Computer Vision and Pattern Rec}, websites = {http://arxiv.org/abs/1606.07873}, month = {6}, id = {19ccd642-6545-3b43-88cc-5dd2e04ef185}, created = {2022-03-28T09:45:03.146Z}, accessed = {2021-09-28}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:04:54.446Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {walkerUncertainFutureForecasting2016}, source_type = {article}, short_title = {An Uncertain Future}, notes = {arXiv: 1606.07873}, private_publication = {false}, abstract = {In a given scene, humans can often easily predict a set of immediate future events that might happen. However, generalized pixel-level anticipation in computer vision systems is difficult because machine learning struggles with the ambiguity inherent in predicting the future. In this paper, we focus on predicting the dense trajectory of pixels in a scene, specifically what will move in the scene, where it will travel, and how it will deform over the course of one second. We propose a conditional variational autoencoder as a solution to this problem. In this framework, direct inference from the image shapes the distribution of possible trajectories, while latent variables encode any necessary information that is not available in the image. We show that our method is able to successfully predict events in a wide variety of scenes and can produce multiple different predictions when the future is ambiguous. Our algorithm is trained on thousands of diverse, realistic videos and requires absolutely no human labeling. In addition to non-semantic action prediction, we find that our method learns a representation that is applicable to semantic vision tasks.}, bibtype = {article}, author = {Walker, Jacob and Doersch, Carl and Gupta, Abhinav and Hebert, Martial}, journal = {arXiv:1606.07873 [cs]} }
@inproceedings{ title = {Ladder Variational Autoencoders}, type = {inproceedings}, year = {2016}, volume = {29}, websites = {https://proceedings.neurips.cc/paper/2016/hash/6ae07dcb33ec3b7c814df797cbda0f87-Abstract.html}, publisher = {Curran Associates, Inc.}, id = {2f459aef-f3ab-3424-924d-cfec4b393980}, created = {2022-03-28T09:45:03.357Z}, accessed = {2022-03-16}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:04:57.719Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {sonderbyLadderVariationalAutoencoders2016a}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Sønderby, Casper Kaae and Raiko, Tapani and Maaløe, Lars and Sønderby, Søren Kaae and Winther, Ole}, booktitle = {Advances in Neural Information Processing Systems} }
@article{ title = {Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)}, type = {article}, year = {2016}, keywords = {Computer Science - Machine Learning}, websites = {http://arxiv.org/abs/1511.07289}, month = {2}, id = {02f23eb4-c2ba-318d-bf30-2b41637b9685}, created = {2022-03-28T09:45:03.426Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:05:02.864Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {clevertFastAccurateDeep2016}, source_type = {article}, notes = {arXiv: 1511.07289}, private_publication = {false}, abstract = {We introduce the "exponential linear unit" (ELU) which speeds up learning in deep neural networks and leads to higher classification accuracies. Like rectified linear units (ReLUs), leaky ReLUs (LReLUs) and parametrized ReLUs (PReLUs), ELUs alleviate the vanishing gradient problem via the identity for positive values. However, ELUs have improved learning characteristics compared to the units with other activation functions. In contrast to ReLUs, ELUs have negative values which allows them to push mean unit activations closer to zero like batch normalization but with lower computational complexity. Mean shifts toward zero speed up learning by bringing the normal gradient closer to the unit natural gradient because of a reduced bias shift effect. While LReLUs and PReLUs have negative values, too, they do not ensure a noise-robust deactivation state. ELUs saturate to a negative value with smaller inputs and thereby decrease the forward propagated variation and information. Therefore, ELUs code the degree of presence of particular phenomena in the input, while they do not quantitatively model the degree of their absence. In experiments, ELUs lead not only to faster learning, but also to significantly better generalization performance than ReLUs and LReLUs on networks with more than 5 layers. On CIFAR-100 ELUs networks significantly outperform ReLU networks with batch normalization while batch normalization does not improve ELU networks. ELU networks are among the top 10 reported CIFAR-10 results and yield the best published result on CIFAR-100, without resorting to multi-view evaluation or model averaging. On ImageNet, ELU networks considerably speed up learning compared to a ReLU network with the same architecture, obtaining less than 10\% classification error for a single crop, single model network.}, bibtype = {article}, author = {Clevert, Djork-Arné and Unterthiner, Thomas and Hochreiter, Sepp}, journal = {arXiv:1511.07289 [cs]} }
@book{ title = {Deep Learning}, type = {book}, year = {2016}, keywords = {Computers / Artificial Intelligence / General,Computers / Computer Science}, month = {11}, publisher = {MIT Press}, id = {d79d6e16-0b77-33a5-b269-a603cc0c5e06}, created = {2022-03-28T09:45:03.721Z}, file_attached = {false}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:03.721Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {goodfellowDeepLearning2016}, source_type = {book}, notes = {Google-Books-ID: omivDQAAQBAJ}, private_publication = {false}, abstract = {An introduction to a broad range of topics in deep learning, covering mathematical and conceptual background, deep learning techniques used in industry, and research perspectives.“Written by three experts in the field, Deep Learning is the only comprehensive book on the subject.”—Elon Musk, cochair of OpenAI; cofounder and CEO of Tesla and SpaceXDeep learning is a form of machine learning that enables computers to learn from experience and understand the world in terms of a hierarchy of concepts. Because the computer gathers knowledge from experience, there is no need for a human computer operator to formally specify all the knowledge that the computer needs. The hierarchy of concepts allows the computer to learn complicated concepts by building them out of simpler ones; a graph of these hierarchies would be many layers deep. This book introduces a broad range of topics in deep learning. The text offers mathematical and conceptual background, covering relevant concepts in linear algebra, probability theory and information theory, numerical computation, and machine learning. It describes deep learning techniques used by practitioners in industry, including deep feedforward networks, regularization, optimization algorithms, convolutional networks, sequence modeling, and practical methodology; and it surveys such applications as natural language processing, speech recognition, computer vision, online recommendation systems, bioinformatics, and videogames. Finally, the book offers research perspectives, covering such theoretical topics as linear factor models, autoencoders, representation learning, structured probabilistic models, Monte Carlo methods, the partition function, approximate inference, and deep generative models. Deep Learning can be used by undergraduate or graduate students planning careers in either industry or research, and by software engineers who want to begin using deep learning in their products or platforms. A website offers supplementary material for both readers and instructors.}, bibtype = {book}, author = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron} }
@article{ title = {3D Hand Pose Tracking and Estimation Using Stereo Matching}, type = {article}, year = {2016}, keywords = {Computer Science - Computer Vision and Pattern Rec}, websites = {http://arxiv.org/abs/1610.07214}, month = {10}, id = {94becb36-2643-3cc8-bc61-499601b04c78}, created = {2022-03-28T09:45:03.827Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:06:02.360Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {zhang3DHandPose2016}, source_type = {article}, notes = {arXiv: 1610.07214}, private_publication = {false}, abstract = {3D hand pose tracking/estimation will be very important in the next generation of human-computer interaction. Most of the currently available algorithms rely on low-cost active depth sensors. However, these sensors can be easily interfered by other active sources and require relatively high power consumption. As a result, they are currently not suitable for outdoor environments and mobile devices. This paper aims at tracking/estimating hand poses using passive stereo which avoids these limitations. A benchmark with 18,000 stereo image pairs and 18,000 depth images captured from different scenarios and the ground-truth 3D positions of palm and finger joints (obtained from the manual label) is thus proposed. This paper demonstrates that the performance of the state-of-the art tracking/estimation algorithms can be maintained with most stereo matching algorithms on the proposed benchmark, as long as the hand segmentation is correct. As a result, a novel stereo-based hand segmentation algorithm specially designed for hand tracking/estimation is proposed. The quantitative evaluation demonstrates that the proposed algorithm is suitable for the state-of-the-art hand pose tracking/estimation algorithms and the tracking quality is comparable to the use of active depth sensors under different challenging scenarios.}, bibtype = {article}, author = {Zhang, Jiawei and Jiao, Jianbo and Chen, Mingliang and Qu, Liangqiong and Xu, Xiaobin and Yang, Qingxiong}, journal = {arXiv:1610.07214 [cs]} }
@article{ title = {PixelVAE: A Latent Variable Model for Natural Images}, type = {article}, year = {2016}, keywords = {Computer Science - Machine Learning}, websites = {http://arxiv.org/abs/1611.05013}, month = {11}, id = {0cc9a3bf-0f05-340c-8ff8-85f1a4c43af9}, created = {2022-03-28T09:45:04.043Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:06:23.225Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {gulrajaniPixelVAELatentVariable2016}, source_type = {article}, short_title = {PixelVAE}, notes = {arXiv: 1611.05013}, private_publication = {false}, abstract = {Natural image modeling is a landmark challenge of unsupervised learning. Variational Autoencoders (VAEs) learn a useful latent representation and model global structure well but have difficulty capturing small details. PixelCNN models details very well, but lacks a latent code and is difficult to scale for capturing large structures. We present PixelVAE, a VAE model with an autoregressive decoder based on PixelCNN. Our model requires very few expensive autoregressive layers compared to PixelCNN and learns latent codes that are more compressed than a standard VAE while still capturing most non-trivial structure. Finally, we extend our model to a hierarchy of latent variables at different scales. Our model achieves state-of-the-art performance on binarized MNIST, competitive performance on 64x64 ImageNet, and high-quality samples on the LSUN bedrooms dataset.}, bibtype = {article}, author = {Gulrajani, Ishaan and Kumar, Kundan and Ahmed, Faruk and Taiga, Adrien Ali and Visin, Francesco and Vazquez, David and Courville, Aaron}, journal = {arXiv:1611.05013 [cs]} }
@inproceedings{ title = {3D Semantic Parsing of Large-Scale Indoor Spaces}, type = {inproceedings}, year = {2016}, pages = {1534-1543}, websites = {https://openaccess.thecvf.com/content_cvpr_2016/html/Armeni_3D_Semantic_Parsing_CVPR_2016_paper.html}, id = {0a48e954-813a-366c-9c79-340c47027741}, created = {2022-03-28T09:45:04.173Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:06:44.167Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {armeni3DSemanticParsing2016}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Armeni, Iro and Sener, Ozan and Zamir, Amir R and Jiang, Helen and Brilakis, Ioannis and Fischer, Martin and Savarese, Silvio} }
@inproceedings{ title = {NTU RGB+D: A Large Scale Dataset for 3D Human Activity Analysis}, type = {inproceedings}, year = {2016}, pages = {1010-1019}, websites = {https://openaccess.thecvf.com/content_cvpr_2016/html/Shahroudy_NTU_RGBD_A_CVPR_2016_paper.html}, id = {6245c469-1862-3201-84c2-08a7a2979dc0}, created = {2022-03-28T09:45:04.314Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:07:32.820Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {shahroudyNTURGBLarge2016}, source_type = {inproceedings}, short_title = {NTU RGB+D}, private_publication = {false}, bibtype = {inproceedings}, author = {Shahroudy, Amir and Liu, Jun and Ng, Tian-Tsong and Wang, Gang} }
@inproceedings{ title = {Deep Learning 3D Shape Surfaces Using Geometry Images}, type = {inproceedings}, year = {2016}, keywords = {3D Shape,CNN,Deep learning,Geometry images,Surfaces}, pages = {223-240}, publisher = {Springer International Publishing}, city = {Cham}, series = {Lecture Notes in Computer Science}, id = {4ec2b795-877c-3dab-8f88-59afd7c4d24b}, created = {2022-03-28T09:45:05.145Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:22:00.204Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {sinhaDeepLearning3D2016}, source_type = {inproceedings}, private_publication = {false}, abstract = {Surfaces serve as a natural parametrization to 3D shapes. Learning surfaces using convolutional neural networks (CNNs) is a challenging task. Current paradigms to tackle this challenge are to either adapt the convolutional filters to operate on surfaces, learn spectral descriptors defined by the Laplace-Beltrami operator, or to drop surfaces altogether in lieu of voxelized inputs. Here we adopt an approach of converting the 3D shape into a ‘geometry image’ so that standard CNNs can directly be used to learn 3D shapes. We qualitatively and quantitatively validate that creating geometry images using authalic parametrization on a spherical domain is suitable for robust learning of 3D shape surfaces. This spherically parameterized shape is then projected and cut to convert the original 3D shape into a flat and regular geometry image. We propose a way to implicitly learn the topology and structure of 3D shapes using geometry images encoded with suitable features. We show the efficacy of our approach to learn 3D shape surfaces for classification and retrieval tasks on non-rigid and rigid shape datasets.}, bibtype = {inproceedings}, author = {Sinha, Ayan and Bai, Jing and Ramani, Karthik}, editor = {Leibe, Bastian and Matas, Jiri and Sebe, Nicu and Welling, Max}, doi = {10.1007/978-3-319-46466-4_14}, booktitle = {Computer Vision – ECCV 2016} }
@inproceedings{ title = {Autoencoding beyond pixels using a learned similarity metric}, type = {inproceedings}, year = {2016}, pages = {1558-1566}, websites = {https://proceedings.mlr.press/v48/larsen16.html}, month = {6}, publisher = {PMLR}, id = {832ca8f7-22e1-3066-a37a-8dc7f7a09b99}, created = {2022-03-28T09:45:05.728Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:22:52.165Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {larsenAutoencodingPixelsUsing2016}, source_type = {inproceedings}, notes = {ISSN: 1938-7228}, private_publication = {false}, abstract = {We present an autoencoder that leverages learned representations to better measure similarities in data space. By combining a variational autoencoder (VAE) with a generative adversarial network (GAN) we can use learned feature representations in the GAN discriminator as basis for the VAE reconstruction objective. Thereby, we replace element-wise errors with feature-wise errors to better capture the data distribution while offering invariance towards e.g. translation. We apply our method to images of faces and show that it outperforms VAEs with element-wise similarity measures in terms of visual fidelity. Moreover, we show that the method learns an embedding in which high-level abstract visual features (e.g. wearing glasses) can be modified using simple arithmetic.}, bibtype = {inproceedings}, author = {Larsen, Anders Boesen Lindbo and Sønderby, Søren Kaae and Larochelle, Hugo and Winther, Ole}, booktitle = {Proceedings of The 33rd International Conference on Machine Learning} }
@inproceedings{ title = {f-GAN: Training Generative Neural Samplers using Variational Divergence Minimization}, type = {inproceedings}, year = {2016}, volume = {29}, websites = {https://proceedings.neurips.cc/paper/2016/hash/cedebb6e872f539bef8c3f919874e9d7-Abstract.html}, publisher = {Curran Associates, Inc.}, id = {372f8b8b-b6ba-33b6-9019-cee50acb5d54}, created = {2022-03-28T09:45:06.358Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-01T09:16:38.122Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {nowozinFGANTrainingGenerative2016}, source_type = {inproceedings}, short_title = {f-GAN}, private_publication = {false}, bibtype = {inproceedings}, author = {Nowozin, Sebastian and Cseke, Botond and Tomioka, Ryota}, booktitle = {Advances in Neural Information Processing Systems} }
@inproceedings{ title = {Attend, Infer, Repeat: Fast Scene Understanding with Generative Models}, type = {inproceedings}, year = {2016}, volume = {29}, websites = {https://proceedings.neurips.cc/paper/2016/hash/52947e0ade57a09e4a1386d08f17b656-Abstract.html}, publisher = {Curran Associates, Inc.}, id = {3e6231b1-16da-3e76-8c14-68406c3d0fdf}, created = {2022-03-28T09:45:06.425Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T07:59:10.645Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {eslamiAttendInferRepeat2016}, source_type = {inproceedings}, short_title = {Attend, Infer, Repeat}, private_publication = {false}, bibtype = {inproceedings}, author = {Eslami, S M Ali and Heess, Nicolas and Weber, Theophane and Tassa, Yuval and Szepesvari, David and Kavukcuoglu, Koray and Hinton, Geoffrey E}, booktitle = {Advances in Neural Information Processing Systems} }
@article{ title = {beta-VAE: Learning Basic Visual Concepts with a Constrained Variational Framework}, type = {article}, year = {2016}, websites = {https://openreview.net/forum?id=Sy2fzU9gl}, month = {11}, id = {7aff4723-2a2e-34ad-8911-0b3e35949ee6}, created = {2022-03-28T09:45:06.567Z}, accessed = {2021-12-15}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T07:59:20.970Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {higginsBetaVAELearningBasic2016}, source_type = {article}, short_title = {beta-VAE}, private_publication = {false}, abstract = {We introduce beta-VAE, a new state-of-the-art framework for automated discovery of interpretable factorised latent representations from raw image data in a completely unsupervised manner.}, bibtype = {article}, author = {Higgins, Irina and Matthey, Loic and Pal, Arka and Burgess, Christopher and Glorot, Xavier and Botvinick, Matthew and Mohamed, Shakir and Lerchner, Alexander} }
@article{ title = {Importance Weighted Autoencoders}, type = {article}, year = {2016}, keywords = {Computer Science - Machine Learning,Statistics - Machine Learning}, websites = {http://arxiv.org/abs/1509.00519}, month = {11}, id = {e923d7a3-6a29-3392-8024-7974bad4e437}, created = {2022-03-28T09:45:06.625Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:00:42.554Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {burdaImportanceWeightedAutoencoders2016}, source_type = {article}, notes = {arXiv: 1509.00519}, private_publication = {false}, abstract = {The variational autoencoder (VAE; Kingma, Welling (2014)) is a recently proposed generative model pairing a top-down generative network with a bottom-up recognition network which approximates posterior inference. It typically makes strong assumptions about posterior inference, for instance that the posterior distribution is approximately factorial, and that its parameters can be approximated with nonlinear regression from the observations. As we show empirically, the VAE objective can lead to overly simplified representations which fail to use the network's entire modeling capacity. We present the importance weighted autoencoder (IWAE), a generative model with the same architecture as the VAE, but which uses a strictly tighter log-likelihood lower bound derived from importance weighting. In the IWAE, the recognition network uses multiple samples to approximate the posterior, giving it increased flexibility to model complex posteriors which do not fit the VAE modeling assumptions. We show empirically that IWAEs learn richer latent space representations than VAEs, leading to improved test log-likelihood on density estimation benchmarks.}, bibtype = {article}, author = {Burda, Yuri and Grosse, Roger and Salakhutdinov, Ruslan}, journal = {arXiv:1509.00519 [cs, stat]} }
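As a rough illustration of the importance-weighted bound summarized in the Burda et al. abstract above, the following Python sketch (variable names and sample values are hypothetical, not from the paper) computes the K-sample bound from log p(x, z_k) and log q(z_k | x) using a numerically stable log-mean-exp:

import numpy as np

def iwae_bound(log_p_xz, log_q_zx):
    # log importance weights: log w_k = log p(x, z_k) - log q(z_k | x)
    log_w = log_p_xz - log_q_zx
    # log (1/K) * sum_k exp(log_w_k), computed with the log-sum-exp trick
    m = log_w.max()
    return m + np.log(np.mean(np.exp(log_w - m)))

# With K = 1 this reduces to a single-sample VAE lower-bound estimate.
print(iwae_bound(np.array([-3.0, -2.5, -4.1]), np.array([-2.0, -2.2, -3.0])))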
@article{ title = {Bayesian network modeling of early growth stages explains yam interplant yield variability and allows for agronomic improvements in West Africa}, type = {article}, year = {2016}, keywords = {Additive Bayesian network modeling,Cataphyll,Early growth,Vegetatively propagated crops,Yam (Dioscorea spp.),Yield variability}, pages = {80-88}, volume = {75}, websites = {http://dx.doi.org/10.1016/j.eja.2016.01.009}, publisher = {Elsevier B.V.}, id = {d655b9ab-5d1b-34d2-9468-c2c9cc40d99a}, created = {2022-04-05T05:35:07.923Z}, file_attached = {false}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-05T05:35:07.923Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Yams (Dioscorea spp.) are important species, especially for resource-poor farmers of West Africa, where crop yields are affected by early plant size hierarchy linked with uneven emergence. Although the causes of this phenomenon are not fully known, yams, like other vegetatively propagated crops, have heavy planting material that is liable to induce such interplant variability. In addition, planting practices may mitigate this phenomenon via the selection of the seed-tuber size or state. To gain further insight into yam interplant variability, this study identified and quantified, for the first time, the direct and indirect dependency between planting practices, early growth variables and yield components of Dioscorea rotundata and Dioscorea alata, the two main food yam species. The experimental dataset came from six field trials carried out in Benin at two locations between 2007 and 2009. Additive Bayesian network modeling was used for structure discovery-its directed acyclic graph offers an ideal background for discussing complex systems when theoretical knowledge is lacking, e.g., for yams. Here we found that the emergence date was the only direct cause of plant yield variability common to both species. For D. rotundata, we observed a direct contribution of the cataphyll number to the plant tuber weight. These combined results suggest the existence of some uncontrolled latent variables (i.e., seed-tuber physiological age and reserves). For D. alata, the model did not reveal any effect of seed-tuber size, despite a strong effect noted for D. rotundata. We suggest that the transposition of traditional native D. rotundata planting practices may have led to oversized D. alata seed-tubers, resulting in wastage of planting material. This study demonstrated that traditional West African cropping systems have a serious drawback concerning the uncontrolled wide range of physiological ages and reserves in seed-tuber lots, which affect the plant size hierarchy and ultimately the marketable yield.}, bibtype = {article}, author = {Cornet, Denis and Sierra, Jorge and Tournebize, Régis and Gabrielle, Benoît and Lewis, Fraser I.}, doi = {10.1016/j.eja.2016.01.009}, journal = {European Journal of Agronomy} }
@article{ title = {Determining the shape of agricultural materials using spherical harmonics}, type = {article}, year = {2016}, keywords = {Agricultural grains,Complex-shaped particles,Spherical harmonics}, pages = {160-171}, volume = {128}, id = {a01a60cf-c10d-3b5c-8a4f-a57c26fbbde9}, created = {2023-04-24T07:38:01.277Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-04-24T15:41:55.138Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Radvilaite2016}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143,f4d8f1ef-fdcb-4a5b-a626-6e2fea47fb6d}, private_publication = {false}, abstract = {Determining the shape of agricultural products is important in many areas of the food industry, such as for classification or quality inspection. The characterization methods used to describe the behaviour of particulate systems during handling may also benefit from an accurate description of the particle shapes. Several methods for particle shape representation have been proposed, including those using super-quadric equations, polygon formulations, or composite particles. However, it has been proved that these methods are not accurate enough to represent complex-shaped particles. The use of spherical harmonics has recently received increasing attention for this purpose. In this paper, spherical harmonics are used to obtain the shapes of three agricultural grains, namely bean, chickpea, and maize, which differ in complexity. Once it has been proved that spherical harmonics can accurately describe the shapes of agricultural grains, the advantages and disadvantages of this technique are discussed. Furthermore, the relationship between spherical harmonics and the discrete element method for the simulation of particle systems is also discussed.}, bibtype = {article}, author = {Radvilaitė, Urtė and Ramírez-Gómez, Álvaro and Kačianauskas, Rimantas}, doi = {10.1016/j.compag.2016.09.003}, journal = {Computers and Electronics in Agriculture} }
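To make the particle-shape description in the Radvilaitė et al. abstract above concrete, here is a minimal Python sketch that evaluates a star-shaped radius function r(θ, φ) from spherical-harmonic coefficients; the coefficients and truncation degree are made-up illustrations, not values from the paper:

import numpy as np
from scipy.special import sph_harm

def radius(theta, phi, coeffs, n_max):
    # theta: azimuth in [0, 2*pi), phi: polar angle in [0, pi]
    r = np.zeros_like(theta, dtype=complex)
    for n in range(n_max + 1):
        for m in range(-n, n + 1):
            r += coeffs.get((n, m), 0.0) * sph_harm(m, n, theta, phi)
    return r.real

theta, phi = np.meshgrid(np.linspace(0, 2 * np.pi, 64), np.linspace(0, np.pi, 32))
coeffs = {(0, 0): 2.0, (2, 0): 0.3}  # hypothetical coefficients: a slightly flattened sphere
print(radius(theta, phi, coeffs, n_max=2).shape)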
@article{ title = {Camera rotation estimation using 3D mesh surfaces representation of spherical images}, type = {article}, year = {2016}, pages = {2514-2520}, volume = {2016-Novem}, publisher = {IEEE}, id = {6a0d8a27-b549-3a82-a4eb-4f5be69b0102}, created = {2023-05-03T13:16:39.873Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-15T08:10:14.322Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Benseddik2016a}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {This paper presents a novel rotation estimation approach based on 3D mesh representation of spherical images. Indeed, unit sphere is becoming a natural space for projecting images captured from the central cameras (conventional and non-conventional cameras, as omnidirectional camera) and obtaining the spherical images. The proposed method relies on representing the spherical images into a 3D space based on image intensities. Spherical harmonic coefficients are then calculated for the 3D mesh surfaces and used to estimate an initial rotation between the underlying spherical images in spectral domain. The optimal rotation is then refined through the ICP algorithm. Experimental results, using synthetic and real image dataset, demonstrate the effectiveness of the proposed approach for rotation estimation, as well as its robustness against real conditions and images occlusion. A comparison between the proposed method and competitive ones, is performed.}, bibtype = {article}, author = {Benseddik, Houssem Eddine and Hadj-Abdelkader, Hicham and Cherki, Brahim and Bouchafa, Samia}, doi = {10.1109/IROS.2016.7759391}, journal = {IEEE International Conference on Intelligent Robots and Systems} }
@article{ title = {3D mesh-based representation of spherical images for dense rotation estimation}, type = {article}, year = {2016}, keywords = {3D mesh,Rotation estimation,geometric representation,spherical harmonics decomposition,spherical topology}, pages = {13-15}, volume = {2016}, publisher = {IEEE}, id = {7f156920-d289-3166-8e91-995162f9cd77}, created = {2023-05-03T13:16:39.978Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-15T08:10:14.352Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Benseddik2016}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {In this paper, we propose a dense approach for 3D rotation estimation between spherical images, which is simultaneously able to recover the large rotations, robust under clutter and small translations. The key idea is to represent the spherical images by 3D shapes of the triangular mesh surfaces based on image intensity signal. This allows to apply the spherical harmonics representation as 3D shape descriptor. The optimum rotation computation is recovered through the SVD decomposition of the cross covariance matrix, which is obtained from the two 3D shapes spherical harmonics coefficients. The performances of the proposed approach are examined using both synthetic and real image datasets. Experimental results show the effectiveness of our approach for rotation estimation, as well as its robustness against real conditions, image occlusions and small translations. The efficiency of the proposed approach is compared with that of competitive methods.}, bibtype = {article}, author = {Benseddik, Houssem Eddine and Hadj-Abdelkader, Hicham and Cherki, Brahim and Bouchafa, Samia}, doi = {10.1109/ICARCV.2016.7838756}, journal = {2016 14th International Conference on Control, Automation, Robotics and Vision, ICARCV 2016}, number = {November} }
@article{ title = {VoxNet: A 3D Convolutional Neural Network for Real-Time Object Recognition}, type = {article}, year = {2015}, pages = {922-928}, id = {2a9ae61b-a81e-3450-a78a-1b3df85504a8}, created = {2020-10-01T13:48:35.417Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-28T08:25:30.932Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Maturana2015}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {Robust object recognition is a crucial skill for robots operating autonomously in real world environments. Range sensors such as LiDAR and RGBD cameras are increasingly found in modern robotic systems, providing a rich source of 3D information that can aid in this task. However, many current systems do not fully utilize this information and have trouble efficiently dealing with large amounts of point cloud data. In this paper, we propose VoxNet, an architecture to tackle this problem by integrating a volumetric Occupancy Grid representation with a supervised 3D Convolutional Neural Network (3D CNN). We evaluate our approach on publicly available benchmarks using LiDAR, RGBD, and CAD data. VoxNet achieves accuracy beyond the state of the art while labeling hundreds of instances per second.}, bibtype = {article}, author = {Maturana, Daniel and Scherer, Sebastian}, doi = {10.1109/IROS.2015.7353481}, journal = {IEEE International Conference on Intelligent Robots and Systems} }
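For readers unfamiliar with the occupancy-grid input that VoxNet-style models consume, this is a small illustrative Python sketch of binning a point cloud into a fixed 32³ binary voxel grid; the grid size and bounds are assumptions for the example, not the paper's settings:

import numpy as np

def voxelize(points, grid=32, bounds=(-1.0, 1.0)):
    lo, hi = bounds
    # bin each point into a voxel index along every axis
    idx = np.floor((points - lo) / (hi - lo) * grid).astype(int)
    idx = np.clip(idx, 0, grid - 1)
    occ = np.zeros((grid, grid, grid), dtype=np.float32)
    occ[idx[:, 0], idx[:, 1], idx[:, 2]] = 1.0  # mark occupied voxels
    return occ

cloud = np.random.uniform(-1, 1, size=(2048, 3))
print(voxelize(cloud).sum())  # number of occupied voxels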
@inproceedings{ title = {Efficient algorithms for Next Best View evaluation}, type = {inproceedings}, year = {2015}, id = {043f17a6-e707-3e49-bc82-12b14d286d4b}, created = {2020-10-05T10:26:00.758Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-26T12:19:40.003Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Bissmarck2015}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f,990ab628-0917-4e89-b071-24bf1f44fad6,07e07de9-bcac-4934-a82b-d0aff540e56d,4f36a0a5-b08a-4f70-b020-4daf83cb0507}, private_publication = {false}, abstract = {A Next Best View estimate may guide processes of 3D reconstruction and exploration to completeness within reasonable time. For the evaluation to be useful, the Next Best View computation itself must be effective in terms of time and accuracy. It needs to be model-free to hold for any geometry of the 3D scene. In this work, we compare the effectiveness of different approaches to Next Best View evaluation. A 3D occupancy grid map, allowing for fast lookup and ray casting, serves as a foundation for our evaluation. We tested naive, state-of-the-art and novel algorithms on data acquired from both indoor and outdoor environments. We demonstrate that the most effective volumetric algorithm is a novel one that exploits spatial hierarchy, utilizes frontiers, and avoids redundant ray casting.}, bibtype = {inproceedings}, author = {Bissmarck, Fredrik and Svensson, Martin and Tolt, Gustav}, doi = {10.1109/IROS.2015.7354212}, booktitle = {IEEE International Conference on Intelligent Robots and Systems} }
@article{ title = {A novel way to organize 3D LiDAR point cloud as 2D depth map height map and surface normal map}, type = {article}, year = {2015}, pages = {1383-1388}, publisher = {IEEE}, id = {8bec5742-43b2-3906-944f-eb393e194611}, created = {2020-11-13T11:34:36.721Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-13T11:36:09.813Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {In this paper we focus on what meaningful 2D perceptual information we can get from 3D LiDAR point cloud. Current work [1] [2] [3] have demonstrated that the depth, height and local surface normal value of a 3D data are useful features for improving Deep Neural Networks (DNNs) based object detection. We thus propose to organise LiDAR point as three different maps: dense depth map, height map and surface normal map. Specifically, given a pair of RGB image and sparse depth map projected from LiDAR point cloud, we propose a parameter self-adaptive method to upgrade sparse depth map to dense depth map, which is then passed to a convex optimisation framework to gain global enhancement. Height map is obtained by reprojecting each pixel in dense depth map into 3D coordinate, which enables us to record its height value, surface normal map is obtained by a trilateral filter constructed from depth map and RGB image. Finally, we validate our framework on both KITTI tracking dataset and Middlebury dataset1. To the best of our knowledge, we are the first to interpret 3D LiDAR point cloud as various 2D features and hope it will motivate more research on object detection by combing RGB image and 3D LiDAR point cloud.}, bibtype = {article}, author = {He, Yuhang and Chen, Long and Chen, Jianda and Li, Ming}, doi = {10.1109/ROBIO.2015.7418964}, journal = {2015 IEEE International Conference on Robotics and Biomimetics, IEEE-ROBIO 2015} }
@article{ title = {Estimating Surface Normals with Depth Image Gradients for Fast and Accurate Registration}, type = {article}, year = {2015}, keywords = {Accuracy,Cameras,Computational efficiency,Estimation,Surface treatment,Three-dimensional displays,Yttrium}, pages = {640-647}, publisher = {IEEE}, id = {d5d62d48-e4bd-3bb8-a0c1-823f1d629dfc}, created = {2020-11-16T10:05:24.738Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-16T10:05:30.990Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {6533efe4-7189-42a2-b4b6-a9f175595b19}, private_publication = {false}, abstract = {We present a fast registration framework with estimating surface normals from depth images. The key component in the framework is to utilize adjacent pixels and compute the normal at each pixel on a depth image by following three steps. First, image gradients on a depth image are computed with a 2D differential filtering. Next, two 3D gradient vectors are computed from horizontal and vertical depth image gradients. Finally, the normal vector is obtained from the cross product of the 3D gradient vectors. Since horizontal and vertical adjacent pixels at each pixel are considered composing a local 3D plane, the 3D gradient vectors are equivalent to tangent vectors of the plane. Compared with existing normal estimation based on fitting a plane to a point cloud, our depth image gradients based normal estimation is extremely faster because it needs only a few mathematical operations. We apply it to normal space sampling based 3D registration and validate the effectiveness of our registration framework by evaluating its accuracy and computational cost with a public dataset.}, bibtype = {article}, author = {Nakagawa, Yosuke and Uchiyama, Hideaki and Nagahara, Hajime and Taniguchi, Rin Ichiro}, doi = {10.1109/3DV.2015.80}, journal = {Proceedings - 2015 International Conference on 3D Vision, 3DV 2015} }
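The three-step normal estimation described in the Nakagawa et al. abstract above can be sketched in a few lines of Python; the pinhole intrinsics and the per-pixel metric approximation are assumptions of this illustration, not the authors' implementation:

import numpy as np

def normals_from_depth(depth, fx=525.0, fy=525.0):
    # image-space depth gradients via finite differences
    dz_dv, dz_du = np.gradient(depth)
    # tangent vectors along the horizontal and vertical pixel directions,
    # approximating the metric step per pixel by depth / focal length
    du = np.stack([depth / fx, np.zeros_like(depth), dz_du], axis=-1)
    dv = np.stack([np.zeros_like(depth), depth / fy, dz_dv], axis=-1)
    # normal = cross product of the two tangents, normalized per pixel
    n = np.cross(du, dv)
    return n / (np.linalg.norm(n, axis=-1, keepdims=True) + 1e-12)

print(normals_from_depth(np.random.rand(48, 64)).shape)  # (48, 64, 3)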
@article{ title = {Designing deep networks for surface normal estimation}, type = {article}, year = {2015}, pages = {539-547}, volume = {07-12-June}, id = {7a79a229-f858-353f-bdfe-a450e370ac14}, created = {2020-11-18T12:40:59.442Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-18T12:41:02.796Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {In the past few years, convolutional neural nets (CNN) have shown incredible promise for learning visual representations. In this paper, we use CNNs for the task of predicting surface normals from a single image. But what is the right architecture? We propose to build upon the decades of hard work in 3D scene understanding to design a new CNN architecture for the task of surface normal estimation. We show that incorporating several constraints (man-made, Manhattan world) and meaningful intermediate representations (room layout, edge labels) in the architecture leads to state of the art performance on surface normal estimation. We also show that our network is quite robust and show state of the art results on other datasets as well without any fine-tuning.}, bibtype = {article}, author = {Wang, Xiaolong and Fouhey, David F. and Gupta, Abhinav}, doi = {10.1109/CVPR.2015.7298652}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Multi-view convolutional neural networks for 3D shape recognition}, type = {article}, year = {2015}, pages = {945-953}, volume = {2015 Inter}, id = {0a7a65da-51ed-3993-b272-2cfb587116d0}, created = {2021-01-25T08:45:25.102Z}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-25T08:45:42.413Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {A longstanding question in computer vision concerns the representation of 3D shapes for recognition: should 3D shapes be represented with descriptors operating on their native 3D formats, such as voxel grid or polygon mesh, or can they be effectively represented with view-based descriptors? We address this question in the context of learning to recognize 3D shapes from a collection of their rendered views on 2D images. We first present a standard CNN architecture trained to recognize the shapes' rendered views independently of each other, and show that a 3D shape can be recognized even from a single view at an accuracy far higher than using state-of-the-art 3D shape descriptors. Recognition rates further increase when multiple views of the shapes are provided. In addition, we present a novel CNN architecture that combines information from multiple views of a 3D shape into a single and compact shape descriptor offering even better recognition performance. The same architecture can be applied to accurately recognize human hand-drawn sketches of shapes. We conclude that a collection of 2D views can be highly informative for 3D shape recognition and is amenable to emerging CNN architectures and their derivatives.}, bibtype = {article}, author = {Su, Hang and Maji, Subhransu and Kalogerakis, Evangelos and Learned-Miller, Erik}, doi = {10.1109/ICCV.2015.114}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {3D ShapeNets: A deep representation for volumetric shapes}, type = {article}, year = {2015}, pages = {1912-1920}, volume = {07-12-June}, id = {29845779-4f62-3a2f-8282-7473c8bcb4d4}, created = {2021-01-27T10:04:16.692Z}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.188Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Wu2015}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c,1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, abstract = {3D shape is a crucial but heavily underutilized cue in today's computer vision systems, mostly due to the lack of a good generic shape representation. With the recent availability of inexpensive 2.5D depth sensors (e.g. Microsoft Kinect), it is becoming increasingly important to have a powerful 3D shape representation in the loop. Apart from category recognition, recovering full 3D shapes from view-based 2.5D depth maps is also a critical part of visual understanding. To this end, we propose to represent a geometric 3D shape as a probability distribution of binary variables on a 3D voxel grid, using a Convolutional Deep Belief Network. Our model, 3D ShapeNets, learns the distribution of complex 3D shapes across different object categories and arbitrary poses from raw CAD data, and discovers hierarchical compositional part representation automatically. It naturally supports joint object recognition and shape completion from 2.5D depth maps, and it enables active object recognition through view planning. To train our 3D deep learning model, we construct ModelNet - a large-scale 3D CAD model dataset. Extensive experiments show that our 3D deep representation enables significant performance improvement over the-state-of-the-arts in a variety of tasks.}, bibtype = {article}, author = {Wu, Zhirong and Song, Shuran and Khosla, Aditya and Yu, Fisher and Zhang, Linguang and Tang, Xiaoou and Xiao, Jianxiong}, doi = {10.1109/CVPR.2015.7298801}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Learning both Weights and Connections for Efficient Neural Networks}, type = {article}, year = {2015}, pages = {1135-1143}, volume = {2015-January}, websites = {http://arxiv.org/abs/1506.02626}, month = {6}, publisher = {Neural information processing systems foundation}, day = {8}, id = {1935585d-d60b-3fe5-a5b6-ff2157b5b57f}, created = {2021-02-09T07:25:26.608Z}, accessed = {2021-02-09}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T07:38:55.995Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, abstract = {Neural networks are both computationally intensive and memory intensive, making them difficult to deploy on embedded systems. Also, conventional networks fix the architecture before training starts; as a result, training cannot improve the architecture. To address these limitations, we describe a method to reduce the storage and computation required by neural networks by an order of magnitude without affecting their accuracy by learning only the important connections. Our method prunes redundant connections using a three-step method. First, we train the network to learn which connections are important. Next, we prune the unimportant connections. Finally, we retrain the network to fine tune the weights of the remaining connections. On the ImageNet dataset, our method reduced the number of parameters of AlexNet by a factor of 9x, from 61 million to 6.7 million, without incurring accuracy loss. Similar experiments with VGG-16 found that the number of parameters can be reduced by 13x, from 138 million to 10.3 million, again with no loss of accuracy.}, bibtype = {article}, author = {Han, Song and Pool, Jeff and Tran, John and Dally, William J.}, journal = {Advances in Neural Information Processing Systems} }
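A toy Python sketch of the magnitude-based pruning step in the train / prune / retrain loop described in the Han et al. abstract above; the sparsity level and masking scheme are illustrative choices, not the paper's exact procedure:

import numpy as np

def prune_by_magnitude(weights, sparsity=0.9):
    # threshold chosen so that roughly `sparsity` of the connections are removed
    threshold = np.quantile(np.abs(weights), sparsity)
    mask = (np.abs(weights) > threshold).astype(weights.dtype)
    return weights * mask, mask

rng = np.random.default_rng(0)
w = rng.normal(size=(256, 256))
w_pruned, mask = prune_by_magnitude(w, sparsity=0.9)
print(1.0 - mask.mean())  # ~0.9: fraction of connections removed
# During retraining, gradients would be multiplied by `mask` so that
# pruned connections stay at zero.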
@article{ title = {Fixed Point Quantization of Deep Convolutional Networks}, type = {article}, year = {2015}, pages = {4166-4175}, volume = {6}, websites = {http://arxiv.org/abs/1511.06393}, month = {11}, publisher = {International Machine Learning Society (IMLS)}, day = {19}, id = {21538c2d-a8ae-3179-8b82-28ed9392b63a}, created = {2021-02-09T07:26:31.900Z}, accessed = {2021-02-09}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T07:38:55.132Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, abstract = {In recent years increasingly complex architectures for deep convolution networks (DCNs) have been proposed to boost the performance on image recognition tasks. However, the gains in performance have come at a cost of substantial increase in computation and model storage resources. Fixed point implementation of DCNs has the potential to alleviate some of these complexities and facilitate potential deployment on embedded hardware. In this paper, we propose a quantizer design for fixed point implementation of DCNs. We formulate and solve an optimization problem to identify optimal fixed point bit-width allocation across DCN layers. Our experiments show that in comparison to equal bit-width settings, the fixed point DCNs with optimized bit width allocation offer >20% reduction in the model size without any loss in accuracy on CIFAR-10 benchmark. We also demonstrate that fine-tuning can further enhance the accuracy of fixed point DCNs beyond that of the original floating point model. In doing so, we report a new state-of-the-art fixed point performance of 6.78% error-rate on CIFAR-10 benchmark.}, bibtype = {article}, author = {Lin, Darryl D. and Talathi, Sachin S. and Annapureddy, V. Sreekanth}, journal = {33rd International Conference on Machine Learning, ICML 2016} }
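As a hedged illustration of fixed-point weight quantization in the spirit of the Lin et al. abstract above (the paper's per-layer bit-width optimization is not reproduced here), a uniform quantizer can be sketched as:

import numpy as np

def quantize_fixed_point(x, total_bits=8, frac_bits=6):
    # fixed-point grid with step 2^-frac_bits, clipped to the representable range
    step = 2.0 ** -frac_bits
    qmin = -(2 ** (total_bits - 1)) * step
    qmax = (2 ** (total_bits - 1) - 1) * step
    return np.clip(np.round(x / step) * step, qmin, qmax)

w = np.random.randn(4, 4).astype(np.float32)
print(np.max(np.abs(w - quantize_fixed_point(w))))  # error <= step/2 inside the range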
@article{ title = {Efficient next-best-scan planning for autonomous 3D surface reconstruction of unknown objects}, type = {article}, year = {2015}, keywords = {3D modeling,Active vision,Laser scanning,Next-best-view planning}, pages = {611-631}, volume = {10}, id = {11c69588-77c0-3164-aacc-5a7c880b22cc}, created = {2021-02-09T17:05:46.769Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-03T10:14:32.619Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kriegel2015}, folder_uuids = {990ab628-0917-4e89-b071-24bf1f44fad6,5439d198-93d5-4603-a7ce-201d423f231e,4f36a0a5-b08a-4f70-b020-4daf83cb0507}, private_publication = {false}, abstract = {This work focuses on autonomous surface reconstruction of small-scale objects with a robot and a 3D sensor. The aim is a high-quality surface model allowing for robotic applications such as grasping and manipulation. Our approach comprises the generation of next-best-scan (NBS) candidates and selection criteria, error minimization between scan patches and termination criteria. NBS candidates are iteratively determined by a boundary detection and surface trend estimation of the acquired model. To account for both a fast and high-quality model acquisition, that candidate is selected as NBS, which maximizes a utility function that integrates an exploration and a mesh-quality component. The modeling and scan planning methods are evaluated on an industrial robot with a high-precision laser striper system. While performing the new laser scan, data are integrated on-the-fly into both, a triangle mesh and a probabilistic voxel space. The efficiency of the system in fast acquisition of high-quality 3D surface models is proven with different cultural heritage, household and industrial objects.}, bibtype = {article}, author = {Kriegel, Simon and Rink, Christian and Bodenmüller, Tim and Suppa, Michael}, doi = {10.1007/s11554-013-0386-6}, journal = {Journal of Real-Time Image Processing}, number = {4} }
@article{ title = {US009117281B2}, type = {article}, year = {2015}, volume = {2}, id = {eed3392f-c09c-3e36-9b9e-866e6ff7940b}, created = {2021-04-08T11:26:53.391Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-15T08:24:38.056Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {ffa7aa64-dc15-4667-8778-6ff9b9800bbb}, private_publication = {false}, bibtype = {article}, author = {Wight, Steve and Yee, Judy and Minhas, Micky}, number = {12} }
@article{ title = {US009043186B2}, type = {article}, year = {2015}, volume = {2}, id = {97b090c4-aa15-308a-9d74-9dc1726d6435}, created = {2021-04-15T08:24:37.788Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-16T05:21:11.083Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {ffa7aa64-dc15-4667-8778-6ff9b9800bbb}, private_publication = {false}, bibtype = {article}, author = {Yee, Judy}, number = {12} }
@article{ title = {Explaining and harnessing adversarial examples}, type = {article}, year = {2015}, pages = {1-11}, id = {804cfb14-cf8a-31fa-80bc-a46104f17537}, created = {2021-07-12T14:15:35.034Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:16:41.559Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {Several machine learning models, including neural networks, consistently misclassify adversarial examples—inputs formed by applying small but intentionally worst-case perturbations to examples from the dataset, such that the perturbed input results in the model outputting an incorrect answer with high confidence. Early attempts at explaining this phenomenon focused on nonlinearity and overfitting. We argue instead that the primary cause of neural networks’ vulnerability to adversarial perturbation is their linear nature. This explanation is supported by new quantitative results while giving the first explanation of the most intriguing fact about them: their generalization across architectures and training sets. Moreover, this view yields a simple and fast method of generating adversarial examples. Using this approach to provide examples for adversarial training, we reduce the test set error of a maxout network on the MNIST dataset.}, bibtype = {article}, author = {Goodfellow, Ian J. and Shlens, Jonathon and Szegedy, Christian}, journal = {3rd International Conference on Learning Representations, ICLR 2015 - Conference Track Proceedings} }
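The "simple and fast method" for generating adversarial examples mentioned in the Goodfellow et al. abstract above is the fast gradient sign method; a minimal PyTorch sketch (the tiny linear model, loss, and epsilon value are placeholders for illustration) looks like this:

import torch

def fgsm(model, loss_fn, x, y, eps=0.03):
    x_adv = x.clone().detach().requires_grad_(True)
    loss = loss_fn(model(x_adv), y)
    loss.backward()
    # move each input in the direction that increases the loss
    return (x_adv + eps * x_adv.grad.sign()).detach()

model = torch.nn.Linear(4, 2)
loss_fn = torch.nn.CrossEntropyLoss()
x, y = torch.randn(8, 4), torch.randint(0, 2, (8,))
print(fgsm(model, loss_fn, x, y).shape)  # same shape as x, perturbed by +/- eps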
@article{ title = {Batch normalization: Accelerating deep network training by reducing internal covariate shift}, type = {article}, year = {2015}, pages = {448-456}, volume = {1}, id = {2655bb4c-acbe-3fb5-9386-9eb8b152ad29}, created = {2021-07-12T14:15:35.047Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:17:01.811Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {Training Deep Neural Networks is complicated by the fact that the distribution of each layer's inputs changes during training, as the parameters of the previous layers change. This slows down the training by requiring lower learning rates and careful parameter initialization, and makes it notoriously hard to train models with saturating nonlinearities. We refer to this phenomenon as internal covariate shift, and address the problem by normalizing layer inputs. Our method draws its strength from making normalization a part of the model architecture and performing the normalization for each training mini-batch. Batch Normalization allows us to use much higher learning rates and be less careful about initialization, and in some cases eliminates the need for Dropout. Applied to a state-of-the-art image classification model, Batch Normalization achieves the same accuracy with 14 times fewer training steps, and beats the original model by a significant margin. Using an ensemble of batch-normalized networks, we improve upon the best published result on ImageNet classification: reaching 4.82% top-5 test error, exceeding the accuracy of human raters.}, bibtype = {article}, author = {Ioffe, Sergey and Szegedy, Christian}, journal = {32nd International Conference on Machine Learning, ICML 2015} }
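The per-feature normalization described in the Ioffe and Szegedy abstract above amounts to the following training-time transform; this sketch omits the running averages used at test time and the convolutional variant:

import numpy as np

def batch_norm(x, gamma, beta, eps=1e-5):
    mean = x.mean(axis=0)
    var = x.var(axis=0)
    x_hat = (x - mean) / np.sqrt(var + eps)  # normalize each feature over the mini-batch
    return gamma * x_hat + beta              # learned scale and shift

x = np.random.randn(32, 8)
print(batch_norm(x, gamma=np.ones(8), beta=np.zeros(8)).std(axis=0))  # ~1 per feature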
@article{ title = {Unitary Evolution Recurrent Neural Networks}, type = {article}, year = {2015}, volume = {48}, websites = {http://arxiv.org/abs/1511.06464}, id = {f3991993-6eb0-3a7e-ba6d-400bf7ada534}, created = {2021-07-12T14:15:35.398Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:17:12.537Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {Recurrent neural networks (RNNs) are notoriously difficult to train. When the eigenvalues of the hidden to hidden weight matrix deviate from absolute value 1, optimization becomes difficult due to the well studied issue of vanishing and exploding gradients, especially when trying to learn long-term dependencies. To circumvent this problem, we propose a new architecture that learns a unitary weight matrix, with eigenvalues of absolute value exactly 1. The challenge we address is that of parametrizing unitary matrices in a way that does not require expensive computations (such as eigendecomposition) after each weight update. We construct an expressive unitary weight matrix by composing several structured matrices that act as building blocks with parameters to be learned. Optimization with this parameterization becomes feasible only when considering hidden states in the complex domain. We demonstrate the potential of this architecture by achieving state of the art results in several hard tasks involving very long-term dependencies.}, bibtype = {article}, author = {Arjovsky, Martin and Shah, Amar and Bengio, Yoshua} }
@article{ title = {Qualitatively characterizing neural network optimization problems}, type = {article}, year = {2015}, id = {91b75099-9bf1-38f8-ad66-fd746c887211}, created = {2021-07-12T14:15:35.863Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:16:50.808Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {Training neural networks involves solving large-scale non-convex optimization problems. This task has long been believed to be extremely difficult, with fear of local minima and other obstacles motivating a variety of schemes to improve optimization, such as unsupervised pretraining. However, modern neural networks are able to achieve negligible training error on complex tasks, using only direct training with stochastic gradient descent. We introduce a simple analysis technique to look for evidence that such networks are overcoming local optima. We find that, in fact, on a straight path from initialization to solution, a variety of state of the art neural networks never encounter any significant obstacles.}, bibtype = {article}, author = {Goodfellow, Ian J. and Vinyals, Oriol and Saxe, Andrew M.}, journal = {3rd International Conference on Learning Representations, ICLR 2015 - Conference Track Proceedings}, number = {November} }
@article{ title = {ShapeNet: An Information-Rich 3D Model Repository}, type = {article}, year = {2015}, websites = {http://arxiv.org/abs/1512.03012}, id = {2b5dd6fe-3e62-3502-a9b5-f7238be08539}, created = {2021-07-29T15:21:59.046Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-29T15:22:01.737Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Chang2015}, folder_uuids = {4f36a0a5-b08a-4f70-b020-4daf83cb0507}, private_publication = {false}, abstract = {We present ShapeNet: a richly-annotated, large-scale repository of shapes represented by 3D CAD models of objects. ShapeNet contains 3D models from a multitude of semantic categories and organizes them under the WordNet taxonomy. It is a collection of datasets providing many semantic annotations for each 3D model such as consistent rigid alignments, parts and bilateral symmetry planes, physical sizes, keywords, as well as other planned annotations. Annotations are made available through a public web-based interface to enable data visualization of object attributes, promote data-driven geometric analysis, and provide a large-scale quantitative benchmark for research in computer graphics and vision. At the time of this technical report, ShapeNet has indexed more than 3,000,000 models, 220,000 models out of which are classified into 3,135 categories (WordNet synsets). In this report we describe the ShapeNet effort as a whole, provide details for all currently available datasets, and summarize future plans.}, bibtype = {article}, author = {Chang, Angel X. and Funkhouser, Thomas and Guibas, Leonidas and Hanrahan, Pat and Huang, Qixing and Li, Zimo and Savarese, Silvio and Savva, Manolis and Song, Shuran and Su, Hao and Xiao, Jianxiong and Yi, Li and Yu, Fisher} }
@article{ title = {Geodesic Convolutional Neural Networks on Riemannian Manifolds}, type = {article}, year = {2015}, keywords = {Eigenvalues and eigenfunctions,Geometry,Heating,Kernel,Manifolds,Neural networks,Shape}, pages = {832-840}, volume = {2015-Febru}, id = {a3ad92a6-e217-3684-a224-d32ff7254e6c}, created = {2021-08-28T19:32:57.440Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-29T21:49:17.157Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Feature descriptors play a crucial role in a wide range of geometry analysis and processing applications, including shape correspondence, retrieval, and segmentation. In this paper, we introduce Geodesic Convolutional Neural Networks (GCNN), a generalization of the convolutional neural networks (CNN) paradigm to non-Euclidean manifolds. Our construction is based on a local geodesic system of polar coordinates to extract "patches", which are then passed through a cascade of filters and linear and non-linear operators. The coefficients of the filters and linear combination weights are optimization variables that are learned to minimize a task-specific cost function. We use ShapeNet to learn invariant shape features, allowing to achieve state-of-The-Art performance in problems such as shape description, retrieval, and correspondence.}, bibtype = {article}, author = {Masci, Jonathan and Boscaini, Davide and Bronstein, Michael M. and Vandergheynst, Pierre}, doi = {10.1109/ICCVW.2015.112}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {Variational Inference with Normalizing Flows}, type = {article}, year = {2015}, volume = {37}, id = {e1a1b51f-bb3c-308a-8e0f-d2808ea5ed32}, created = {2021-09-20T11:01:52.560Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.012Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Rezende2015}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, bibtype = {article}, author = {Rezende, Danilo Jimenez and Mohamed, Shakir} }
@article{ title = {Markov Chain Monte Carlo and variational inference: Bridging the gap}, type = {article}, year = {2015}, pages = {1218-1226}, volume = {2}, id = {f366b19f-953d-3aa2-9093-c215861fbf09}, created = {2021-09-29T06:15:29.620Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.500Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Salimans2015}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, abstract = {Recent advances in stochastic gradient variational inference have made it possible to perform variational Bayesian inference with posterior approximations containing auxiliary random variables. This enables us to explore a new synthesis of variational inference and Monte Carlo methods where we incorporate one or more steps of MCMC into our variational approximation. By doing so we obtain a rich class of inference algorithms bridging the gap between variational methods and MCMC, and offering the best of both worlds: fast posterior approximation through the maximization of an explicit objective, with the option of trading off additional computation for additional accuracy. We describe the theoretical foundations that make this possible and show some promising first results.}, bibtype = {article}, author = {Salimans, Tim and Kingma, Diederik P. and Welling, Max}, journal = {32nd International Conference on Machine Learning, ICML 2015}, number = {Mcmc} }
@article{ title = {Adversarial Autoencoders}, type = {article}, year = {2015}, websites = {http://arxiv.org/abs/1511.05644}, id = {ddca915a-2df3-317f-9c40-ebbddcefef55}, created = {2021-10-01T11:39:50.424Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.684Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Makhzani2015}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, abstract = {In this paper, we propose the "adversarial autoencoder" (AAE), which is a probabilistic autoencoder that uses the recently proposed generative adversarial networks (GAN) to perform variational inference by matching the aggregated posterior of the hidden code vector of the autoencoder with an arbitrary prior distribution. Matching the aggregated posterior to the prior ensures that generating from any part of prior space results in meaningful samples. As a result, the decoder of the adversarial autoencoder learns a deep generative model that maps the imposed prior to the data distribution. We show how the adversarial autoencoder can be used in applications such as semi-supervised classification, disentangling style and content of images, unsupervised clustering, dimensionality reduction and data visualization. We performed experiments on MNIST, Street View House Numbers and Toronto Face datasets and show that adversarial autoencoders achieve competitive results in generative modeling and semi-supervised classification tasks.}, bibtype = {article}, author = {Makhzani, Alireza and Shlens, Jonathon and Jaitly, Navdeep and Goodfellow, Ian and Frey, Brendan} }
@article{ title = {Very deep convolutional networks for large-scale image recognition}, type = {article}, year = {2015}, pages = {1-14}, id = {66023775-a7b1-3c01-9142-bdf478ea9c1d}, created = {2021-11-01T10:14:38.920Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-07-21T08:32:29.142Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Simonyan2015}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, abstract = {In this work we investigate the effect of the convolutional network depth on its accuracy in the large-scale image recognition setting. Our main contribution is a thorough evaluation of networks of increasing depth using an architecture with very small (3 × 3) convolution filters, which shows that a significant improvement on the prior-art configurations can be achieved by pushing the depth to 16–19 weight layers. These findings were the basis of our ImageNet Challenge 2014 submission, where our team secured the first and the second places in the localisation and classification tracks respectively. We also show that our representations generalise well to other datasets, where they achieve state-of-the-art results. We have made our two best-performing ConvNet models publicly available to facilitate further research on the use of deep visual representations in computer vision.}, bibtype = {article}, author = {Simonyan, Karen and Zisserman, Andrew}, journal = {3rd International Conference on Learning Representations, ICLR 2015 - Conference Track Proceedings} }
@article{ title = {Spatial transformer networks}, type = {article}, year = {2015}, pages = {2017-2025}, volume = {2015-January}, id = {1836ca8a-16a9-3cdb-85c8-4255c414124e}, created = {2022-01-13T07:21:01.610Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-13T07:21:08.480Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {d54ba66b-a8cf-41de-8e2d-c3256f322e07}, private_publication = {false}, abstract = {Convolutional Neural Networks define an exceptionally powerful class of models, but are still limited by the lack of ability to be spatially invariant to the input data in a computationally and parameter efficient manner. In this work we introduce a new learnable module, the Spatial Transformer, which explicitly allows the spatial manipulation of data within the network. This differentiable module can be inserted into existing convolutional architectures, giving neural networks the ability to actively spatially transform feature maps, conditional on the feature map itself, without any extra training supervision or modification to the optimisation process. We show that the use of spatial transformers results in models which learn invariance to translation, scale, rotation and more generic warping, resulting in state-of-the-art performance on several benchmarks, and for a number of classes of transformations.}, bibtype = {article}, author = {Jaderberg, Max and Simonyan, Karen and Zisserman, Andrew and Kavukcuoglu, Koray}, journal = {Advances in Neural Information Processing Systems} }
@article{ title = {NICE: Non-linear Independent Components Estimation}, type = {article}, year = {2015}, keywords = {Computer Science - Machine Learning}, websites = {http://arxiv.org/abs/1410.8516}, month = {4}, id = {17df940f-4e1e-3eea-b297-760d89e1dd8a}, created = {2022-03-28T09:45:00.797Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:00:53.184Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {dinhNICENonlinearIndependent2015}, source_type = {article}, short_title = {NICE}, notes = {arXiv: 1410.8516}, private_publication = {false}, abstract = {We propose a deep learning framework for modeling complex high-dimensional densities called Non-linear Independent Component Estimation (NICE). It is based on the idea that a good representation is one in which the data has a distribution that is easy to model. For this purpose, a non-linear deterministic transformation of the data is learned that maps it to a latent space so as to make the transformed data conform to a factorized distribution, i.e., resulting in independent latent variables. We parametrize this transformation so that computing the Jacobian determinant and inverse transform is trivial, yet we maintain the ability to learn complex non-linear transformations, via a composition of simple building blocks, each based on a deep neural network. The training criterion is simply the exact log-likelihood, which is tractable. Unbiased ancestral sampling is also easy. We show that this approach yields good generative models on four image datasets and can be used for inpainting.}, bibtype = {article}, author = {Dinh, Laurent and Krueger, David and Bengio, Yoshua}, journal = {arXiv:1410.8516 [cs]} }
@inproceedings{ title = {Unsupervised Learning of Video Representations using LSTMs}, type = {inproceedings}, year = {2015}, pages = {843-852}, websites = {https://proceedings.mlr.press/v37/srivastava15.html}, month = {6}, publisher = {PMLR}, id = {4177d771-9ea3-3d53-ba05-799479f4ad2e}, created = {2022-03-28T09:45:00.807Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:01:07.748Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {srivastavaUnsupervisedLearningVideo2015}, source_type = {inproceedings}, notes = {ISSN: 1938-7228}, private_publication = {false}, abstract = {We use Long Short Term Memory (LSTM) networks to learn representations of video sequences. Our model uses an encoder LSTM to map an input sequence into a fixed length representation. This representation is decoded using single or multiple decoder LSTMs to perform different tasks, such as reconstructing the input sequence, or predicting the future sequence. We experiment with two kinds of input sequences – patches of image pixels and high-level representations (“percepts") of video frames extracted using a pretrained convolutional net. We explore different design choices such as whether the decoder LSTMs should condition on the generated output. We analyze the outputs of the model qualitatively to see how well the model can extrapolate the learned video representation into the future and into the past. We further evaluate the representations by finetuning them for a supervised learning problem – human action recognition on the UCF-101 and HMDB-51 datasets. We show that the representations help improve classification accuracy, especially when there are only few training examples. Even models pretrained on unrelated datasets (300 hours of YouTube videos) can help action recognition performance.}, bibtype = {inproceedings}, author = {Srivastava, Nitish and Mansimov, Elman and Salakhudinov, Ruslan}, booktitle = {Proceedings of the 32nd International Conference on Machine Learning} }
@inproceedings{ title = {DRAW: A Recurrent Neural Network For Image Generation}, type = {inproceedings}, year = {2015}, pages = {1462-1471}, websites = {https://proceedings.mlr.press/v37/gregor15.html}, month = {6}, publisher = {PMLR}, id = {3f92f8d4-7878-33fa-bf1e-cf8aeaf9229a}, created = {2022-03-28T09:45:00.921Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:01:20.353Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {gregorDRAWRecurrentNeural2015}, source_type = {inproceedings}, short_title = {DRAW}, notes = {ISSN: 1938-7228}, private_publication = {false}, abstract = {This paper introduces the Deep Recurrent Attentive Writer (DRAW) architecture for image generation with neural networks. DRAW networks combine a novel spatial attention mechanism that mimics the foveation of the human eye, with a sequential variational auto-encoding framework that allows for the iterative construction of complex images. The system substantially improves on the state of the art for generative models on MNIST, and, when trained on the Street View House Numbers dataset, it is able to generate images that are indistinguishable from real data with the naked eye.}, bibtype = {inproceedings}, author = {Gregor, Karol and Danihelka, Ivo and Graves, Alex and Rezende, Danilo and Wierstra, Daan}, booktitle = {Proceedings of the 32nd International Conference on Machine Learning} }
@inproceedings{ title = {Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift}, type = {inproceedings}, year = {2015}, pages = {448-456}, websites = {https://proceedings.mlr.press/v37/ioffe15.html}, month = {6}, publisher = {PMLR}, id = {6fd1468a-478b-3446-ad50-1ca9a2f176c5}, created = {2022-03-28T09:45:01.043Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:01:48.127Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {ioffeBatchNormalizationAccelerating2015}, source_type = {inproceedings}, short_title = {Batch Normalization}, notes = {ISSN: 1938-7228}, private_publication = {false}, abstract = {Training Deep Neural Networks is complicated by the fact that the distribution of each layer’s inputs changes during training, as the parameters of the previous layers change. This slows down the training by requiring lower learning rates and careful parameter initialization, and makes it notoriously hard to train models with saturating nonlinearities. We refer to this phenomenon as internal covariate shift, and address the problem by normalizing layer inputs. Our method draws its strength from making normalization a part of the model architecture and performing the normalization for each training mini-batch. Batch Normalization allows us to use much higher learning rates and be less careful about initialization, and in some cases eliminates the need for Dropout. Applied to a state-of-the-art image classification model, Batch Normalization achieves the same accuracy with 14 times fewer training steps, and beats the original model by a significant margin. Using an ensemble of batch-normalized networks, we improve upon the best published result on ImageNet classification: reaching 4.82\% top-5 test error, exceeding the accuracy of human raters.}, bibtype = {inproceedings}, author = {Ioffe, Sergey and Szegedy, Christian}, booktitle = {Proceedings of the 32nd International Conference on Machine Learning} }
@inproceedings{ title = {Unsupervised Generation of a Viewpoint Annotated Car Dataset From Videos}, type = {inproceedings}, year = {2015}, pages = {1314-1322}, websites = {https://www.cv-foundation.org/openaccess/content_iccv_2015/html/Sedaghat_Unsupervised_Generation_of_ICCV_2015_paper.html}, id = {9d7e53b6-cce3-31d4-9eec-5db23b117c30}, created = {2022-03-28T09:45:01.257Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:01:17.048Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {sedaghatUnsupervisedGenerationViewpoint2015}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Sedaghat, Nima and Brox, Thomas}, booktitle = {Proceedings of the IEEE International Conference on Computer Vision (ICCV)} }
@article{ title = {A bi-ventricular cardiac atlas built from 1000+ high resolution MR images of healthy subjects and an analysis of shape and motion}, type = {article}, year = {2015}, keywords = {Cardiac atlas,Statistical motion model,Statistical parametric mapping,Statistical shape model}, pages = {133-145}, volume = {26}, websites = {https://www.sciencedirect.com/science/article/pii/S1361841515001346}, month = {12}, id = {1fff3a80-02d2-39f7-b288-2c59276de12d}, created = {2022-03-28T09:45:02.730Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:03:42.779Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {baiBiventricularCardiacAtlas2015}, source_type = {article}, private_publication = {false}, abstract = {Atlases encode valuable anatomical and functional information from a population. In this work, a bi-ventricular cardiac atlas was built from a unique data set, which consists of high resolution cardiac MR images of 1000+ normal subjects. Based on the atlas, statistical methods were used to study the variation of cardiac shapes and the distribution of cardiac motion across the spatio-temporal domain. We have shown how statistical parametric mapping (SPM) can be combined with a general linear model to study the impact of gender and age on regional myocardial wall thickness. Finally, we have also investigated the influence of the population size on atlas construction and atlas-based analysis. The high resolution atlas, the statistical models and the SPM method will benefit more studies on cardiac anatomy and function analysis in the future.}, bibtype = {article}, author = {Bai, Wenjia and Shi, Wenzhe and de Marvao, Antonio and Dawes, Timothy J W and O’Regan, Declan P and Cook, Stuart A and Rueckert, Daniel}, doi = {10.1016/j.media.2015.08.009}, journal = {Medical Image Analysis}, number = {1} }
@inproceedings{ title = {Variational Inference with Normalizing Flows}, type = {inproceedings}, year = {2015}, pages = {1530-1538}, websites = {https://proceedings.mlr.press/v37/rezende15.html}, month = {6}, publisher = {PMLR}, id = {6ddbc125-4ba3-3db5-85c9-70c8d27e37be}, created = {2022-03-28T09:45:04.425Z}, accessed = {2021-09-20}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:07:19.443Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {rezendeVariationalInferenceNormalizing2015}, source_type = {inproceedings}, notes = {ISSN: 1938-7228}, private_publication = {false}, bibtype = {inproceedings}, author = {Rezende, Danilo and Mohamed, Shakir}, booktitle = {International Conference on Machine Learning} }
@inproceedings{ title = {Deep Convolutional Inverse Graphics Network}, type = {inproceedings}, year = {2015}, volume = {28}, websites = {https://proceedings.neurips.cc/paper/2015/hash/ced556cd9f9c0c8315cfbe0744a3baf0-Abstract.html}, publisher = {Curran Associates, Inc.}, id = {d9fab1d5-7d1e-347f-a3b4-c4017291a792}, created = {2022-03-28T09:45:04.650Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:07:37.138Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {kulkarniDeepConvolutionalInverse2015}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Kulkarni, Tejas D and Whitney, William F and Kohli, Pushmeet and Tenenbaum, Josh}, booktitle = {Advances in Neural Information Processing Systems} }
@article{ title = {Deep learning}, type = {article}, year = {2015}, keywords = {Computer science,Mathematics and computing}, pages = {436-444}, volume = {521}, websites = {https://www.nature.com/articles/nature14539}, month = {5}, id = {cb259b2e-743d-337b-bd20-bb15b96a8c42}, created = {2022-03-28T09:45:04.699Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:08:13.849Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {lecunDeepLearning2015}, source_type = {article}, notes = {Number: 7553; Publisher: Nature Publishing Group}, private_publication = {false}, abstract = {Deep learning allows computational models that are composed of multiple processing layers to learn representations of data with multiple levels of abstraction. These methods have dramatically improved the state-of-the-art in speech recognition, visual object recognition, object detection and many other domains such as drug discovery and genomics. Deep learning discovers intricate structure in large data sets by using the backpropagation algorithm to indicate how a machine should change its internal parameters that are used to compute the representation in each layer from the representation in the previous layer. Deep convolutional nets have brought about breakthroughs in processing images, video, speech and audio, whereas recurrent nets have shone light on sequential data such as text and speech.}, bibtype = {article}, author = {LeCun, Yann and Bengio, Yoshua and Hinton, Geoffrey}, doi = {10.1038/nature14539}, journal = {Nature}, number = {7553} }
@inproceedings{ title = {B-SHOT: A binary feature descriptor for fast and efficient keypoint matching on 3D point clouds}, type = {inproceedings}, year = {2015}, keywords = {Bismuth,Detectors,Electronic mail,Histograms,Memory management,Silicon,Three-dimensional displays}, pages = {1929-1934}, month = {9}, id = {440bb114-773e-31cc-819b-bfb8859cfb56}, created = {2022-03-28T09:45:05.362Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:22:29.198Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {prakhyaBSHOTBinaryFeature2015a}, source_type = {inproceedings}, short_title = {B-SHOT}, private_publication = {false}, abstract = {In this paper, we introduce the very first `binary' 3D feature descriptor, B-SHOT, for fast and efficient keypoint matching on 3D point clouds. We propose a binary quantization method that converts a real valued vector to a binary vector. We apply this method on a state-of-the-art 3D feature descriptor, SHOT [1], and create a new binary 3D feature descriptor. B-SHOT requires 32 times lesser memory for its representation while being 6 times faster in feature descriptor matching, when compared to the SHOT feature descriptor. Experimental evaluation shows that B-SHOT offers comparable keypoint matching performance to that of the state-of-the-art 3D feature descriptors on a standard benchmark dataset.}, bibtype = {inproceedings}, author = {Prakhya, Sai Manoj and Liu, Bingbing and Lin, Weisi}, doi = {10.1109/IROS.2015.7353630}, booktitle = {2015 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)} }
@inproceedings{ title = {Learning Structured Output Representation using Deep Conditional Generative Models}, type = {inproceedings}, year = {2015}, volume = {28}, websites = {https://proceedings.neurips.cc/paper/2015/hash/8d55a249e6baa5c06772297520da2051-Abstract.html}, publisher = {Curran Associates, Inc.}, id = {47386201-858b-3e26-bd3a-6cf327587cb5}, created = {2022-03-28T09:45:06.109Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-01T09:16:19.445Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {sohnLearningStructuredOutput2015}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Sohn, Kihyuk and Lee, Honglak and Yan, Xinchen}, booktitle = {Advances in Neural Information Processing Systems} }
@article{ title = {SMPL: a skinned multi-person linear model}, type = {article}, year = {2015}, keywords = {blendshapes,body shape,skinning,soft-tissue}, pages = {248:1--248:16}, volume = {34}, websites = {https://doi.org/10.1145/2816795.2818013}, month = {10}, id = {c2927697-f1b2-3a18-ba02-48844900e67f}, created = {2022-03-28T09:45:06.650Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:00:39.215Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {loperSMPLSkinnedMultiperson2015}, source_type = {article}, short_title = {SMPL}, private_publication = {false}, abstract = {We present a learned model of human body shape and pose-dependent shape variation that is more accurate than previous models and is compatible with existing graphics pipelines. Our Skinned Multi-Person Linear model (SMPL) is a skinned vertex-based model that accurately represents a wide variety of body shapes in natural human poses. The parameters of the model are learned from data including the rest pose template, blend weights, pose-dependent blend shapes, identity-dependent blend shapes, and a regressor from vertices to joint locations. Unlike previous models, the pose-dependent blend shapes are a linear function of the elements of the pose rotation matrices. This simple formulation enables training the entire model from a relatively large number of aligned 3D meshes of different people in different poses. We quantitatively evaluate variants of SMPL using linear or dual-quaternion blend skinning and show that both are more accurate than a Blend-SCAPE model trained on the same data. We also extend SMPL to realistically model dynamic soft-tissue deformations. Because it is based on blend skinning, SMPL is compatible with existing rendering engines and we make it available for research purposes.}, bibtype = {article}, author = {Loper, Matthew and Mahmood, Naureen and Romero, Javier and Pons-Moll, Gerard and Black, Michael J}, doi = {10.1145/2816795.2818013}, journal = {ACM Transactions on Graphics}, number = {6} }
@article{ title = {Analysis and synthesis of 3D shape families via deep-learned generative models of surfaces}, type = {article}, year = {2015}, keywords = {Categories and Subject Descriptors (according to A,I.3.5 Computer Graphics: Computational Geometry a}, pages = {25-38}, volume = {34}, websites = {https://onlinelibrary.wiley.com/doi/abs/10.1111/cgf.12694}, id = {c56eccc3-53ba-3d0d-9ac0-8ebd60c3a3c6}, created = {2022-03-28T09:45:06.769Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T07:59:56.997Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {huangAnalysisSynthesis3D2015}, source_type = {article}, notes = {\_eprint: https://onlinelibrary.wiley.com/doi/pdf/10.1111/cgf.12694}, private_publication = {false}, abstract = {We present a method for joint analysis and synthesis of geometrically diverse 3D shape families. Our method first learns part-based templates such that an optimal set of fuzzy point and part correspondences is computed between the shapes of an input collection based on a probabilistic deformation model. In contrast to previous template-based approaches, the geometry and deformation parameters of our part-based templates are learned from scratch. Based on the estimated shape correspondence, our method also learns a probabilistic generative model that hierarchically captures statistical relationships of corresponding surface point positions and parts as well as their existence in the input shapes. A deep learning procedure is used to capture these hierarchical relationships. The resulting generative model is used to produce control point arrangements that drive shape synthesis by combining and deforming parts from the input collection. The generative model also yields compact shape descriptors that are used to perform fine-grained classification. Finally, it can be also coupled with the probabilistic deformation model to further improve shape correspondence. We provide qualitative and quantitative evaluations of our method for shape correspondence, segmentation, fine-grained classification and synthesis. Our experiments demonstrate superior correspondence and segmentation results than previous state-of-the-art approaches.}, bibtype = {article}, author = {Huang, Haibin and Kalogerakis, Evangelos and Marlin, Benjamin}, doi = {10.1111/cgf.12694}, journal = {Computer Graphics Forum}, number = {5} }
@article{ title = {Dyna: a model of dynamic human shape in motion}, type = {article}, year = {2015}, keywords = {human animation,human shape,motion capture,soft-tissue motion}, pages = {120:1--120:14}, volume = {34}, websites = {https://doi.org/10.1145/2766993}, month = {7}, id = {4f3d3480-46bf-3ba1-aa12-e0cb3d5d4d76}, created = {2022-03-28T09:45:06.770Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:00:33.665Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {pons-mollDynaModelDynamic2015}, source_type = {article}, short_title = {Dyna}, private_publication = {false}, abstract = {To look human, digital full-body avatars need to have soft-tissue deformations like those of real people. We learn a model of soft-tissue deformations from examples using a high-resolution 4D capture system and a method that accurately registers a template mesh to sequences of 3D scans. Using over 40,000 scans of ten subjects, we learn how soft-tissue motion causes mesh triangles to deform relative to a base 3D body model. Our Dyna model uses a low-dimensional linear subspace to approximate soft-tissue deformation and relates the subspace coefficients to the changing pose of the body. Dyna uses a second-order auto-regressive model that predicts soft-tissue deformations based on previous deformations, the velocity and acceleration of the body, and the angular velocities and accelerations of the limbs. Dyna also models how deformations vary with a person's body mass index (BMI), producing different deformations for people with different shapes. Dyna realistically represents the dynamics of soft tissue for previously unseen subjects and motions. We provide tools for animators to modify the deformations and apply them to new stylized characters.}, bibtype = {article}, author = {Pons-Moll, Gerard and Romero, Javier and Mahmood, Naureen and Black, Michael J}, doi = {10.1145/2766993}, journal = {ACM Transactions on Graphics}, number = {4} }
@article{ title = {ORB-SLAM: A Versatile and Accurate Monocular SLAM System}, type = {article}, year = {2015}, keywords = {Lifelong mapping,Simultaneous localization and mapping (SLAM),localization,monocular vision,recognition}, pages = {1147-1163}, volume = {31}, id = {da98398f-1333-34cf-b222-cb129c0f22dd}, created = {2022-09-13T08:14:28.116Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-13T08:14:33.029Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {This paper presents ORB-SLAM, a feature-based monocular simultaneous localization and mapping (SLAM) system that operates in real time, in small and large indoor and outdoor environments. The system is robust to severe motion clutter, allows wide baseline loop closing and relocalization, and includes full automatic initialization. Building on excellent algorithms of recent years, we designed from scratch a novel system that uses the same features for all SLAM tasks: tracking, mapping, relocalization, and loop closing. A survival of the fittest strategy that selects the points and keyframes of the reconstruction leads to excellent robustness and generates a compact and trackable map that only grows if the scene content changes, allowing lifelong operation. We present an exhaustive evaluation in 27 sequences from the most popular datasets. ORB-SLAM achieves unprecedented performance with respect to other state-of-the-art monocular SLAM approaches. For the benefit of the community, we make the source code public.}, bibtype = {article}, author = {Mur-Artal, Raul and Montiel, J. M.M. and Tardos, Juan D.}, doi = {10.1109/TRO.2015.2463671}, journal = {IEEE Transactions on Robotics}, number = {5} }
@article{ title = {Dynamic 3D avatar creation from hand-held video input}, type = {article}, year = {2015}, keywords = {3D avatar creation,Blendshapes,Face animation,Rigging}, pages = {1-14}, volume = {34}, id = {e1d1b1c6-1ec5-30d0-ac47-635ccf04eca1}, created = {2023-05-03T13:16:40.622Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:25.616Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Ichim2015}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {We present a complete pipeline for creating fully rigged, personalized 3D facial avatars from hand-held video. Our system faithfully recovers facial expression dynamics of the user by adapting a blendshape template to an image sequence of recorded expressions using an optimization that integrates feature tracking, optical flow, and shape from shading. Fine-scale details such as wrinkles are captured separately in normal maps and ambient occlusion maps. From this user- and expression-specific data, we learn a regressor for on-the-fly detail synthesis during animation to enhance the perceptual realism of the avatars. Our system demonstrates that the use of appropriate reconstruction priors yields compelling face rigs even with a minimalistic acquisition system and limited user assistance. This facilitates a range of new applications in computer animation and consumer-level online communication based on personalized avatars. We present realtime application demos to validate our method.}, bibtype = {article}, author = {Ichim, Alexandru Eugen and Bouaziz, Sofien and Pauly, Mark}, doi = {10.1145/2766974}, journal = {ACM Transactions on Graphics}, number = {4} }
@article{ title = {Edge boxes: Locating object proposals from edges}, type = {article}, year = {2014}, keywords = {edge detection,object detection,object proposals}, pages = {391-405}, volume = {8693 LNCS}, id = {4eb6ed24-eed8-3f8c-a6f8-edeb8b03bbe2}, created = {2020-09-14T08:14:53.642Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-10-27T07:13:12.588Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {16688d52-1757-4ef4-badb-f53b700252a9,591145a4-49d3-4baf-a2cc-a1f3832f0e3e}, private_publication = {false}, abstract = {The use of object proposals is an effective recent approach for increasing the computational efficiency of object detection. We propose a novel method for generating object bounding box proposals using edges. Edges provide a sparse yet informative representation of an image. Our main observation is that the number of contours that are wholly contained in a bounding box is indicative of the likelihood of the box containing an object. We propose a simple box objectness score that measures the number of edges that exist in the box minus those that are members of contours that overlap the box's boundary. Using efficient data structures, millions of candidate boxes can be evaluated in a fraction of a second, returning a ranked set of a few thousand top-scoring proposals. Using standard metrics, we show results that are significantly more accurate than the current state-of-the-art while being faster to compute. In particular, given just 1000 proposals we achieve over 96% object recall at overlap threshold of 0.5 and over 75% recall at the more challenging overlap of 0.7. Our approach runs in 0.25 seconds and we additionally demonstrate a near real-time variant with only minor loss in accuracy. © 2014 Springer International Publishing.}, bibtype = {article}, author = {Zitnick, C. Lawrence and Dollár, Piotr}, doi = {10.1007/978-3-319-10602-1_26}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, number = {PART 5} }
@article{ title = {Recognition of 3D package shapes for single camera metrology}, type = {article}, year = {2014}, pages = {99-106}, id = {cf333158-04ca-3712-8ccf-4fa2414a86e2}, created = {2020-09-14T08:14:53.720Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-25T14:53:35.837Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f,a89f4866-a7e8-4ea9-aa98-e3f470892f7c,16688d52-1757-4ef4-badb-f53b700252a9}, private_publication = {false}, abstract = {Many applications of 3D object measurement have become commercially viable due to the recent availability of low-cost range cameras such as the Microsoft Kinect. We address the application of measuring an object's dimensions for the purpose of billing in shipping transactions, where high accuracy is required for certification. In particular, we address cases where an object's pose reduces the accuracy with which we can estimate dimensions from a single camera. Because the class of object shapes is limited in the shipping domain, we perform a closed-world recognition in order to determine a shape model which can account for missing parts, and/or to induce the user to reposition the object for higher accuracy. Our experiments demonstrate that the addition of this recognition step significantly improves system accuracy. © 2014 IEEE.}, bibtype = {article}, author = {Lloyd, Ryan and McCloskey, Scott}, doi = {10.1109/WACV.2014.6836113}, journal = {2014 IEEE Winter Conference on Applications of Computer Vision, WACV 2014} }
@article{ title = {SRA: Fast removal of general multipath for ToF sensors}, type = {article}, year = {2014}, pages = {234-249}, volume = {8689 LNCS}, id = {d94ebe35-e357-380e-869f-b80b40239be5}, created = {2020-10-22T06:36:09.074Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-10-23T05:18:21.255Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47}, private_publication = {false}, abstract = {A major issue with Time of Flight sensors is the presence of multipath interference. We present Sparse Reflections Analysis (SRA), an algorithm for removing this interference which has two main advantages. First, it allows for very general forms of multipath, including interference with three or more paths, diffuse multipath resulting from Lambertian surfaces, and combinations thereof. SRA removes this general multipath with robust techniques based on L1 optimization. Second, due to a novel dimension reduction, we are able to produce a very fast version of SRA, which is able to run at frame rate. Experimental results on both synthetic data with ground truth, as well as real images of challenging scenes, validate the approach. © 2014 Springer International Publishing.}, bibtype = {article}, author = {Freedman, Daniel and Smolin, Yoni and Krupka, Eyal and Leichter, Ido and Schmidt, Mirko}, doi = {10.1007/978-3-319-10590-1_16}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, number = {PART 1} }
@article{ title = {A quantitative evaluation of surface normal estimation in point clouds}, type = {article}, year = {2014}, pages = {4220-4226}, publisher = {IEEE}, id = {f0292532-eef3-3943-a1b1-1c22ddc5a066}, created = {2020-11-16T10:05:26.825Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-24T11:29:16.340Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {6533efe4-7189-42a2-b4b6-a9f175595b19,71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {We revisit a well-studied problem in the analysis of range data: surface normal estimation for a set of unorganized points. Surface normal estimation has been well-studied initially due to its theoretical appeal and more recently due to its many practical applications. The latter cover several aspects of range data analysis from plane or surface fitting to segmentation, object detection and scene analysis. Following the vast majority of the literature, we also focus our attention on techniques that operate in small neighborhoods around the point whose normal is to be estimated. We pay close attention to aspects of the implementation, such as the use of weights and normalization, that have not been studied in detail in the past. We perform quantitative evaluation on a diverse set of point clouds derived from 3D meshes, which allows us to obtain accurate ground truth.}, bibtype = {article}, author = {Jordan, Krzysztof and Mordohai, Philippos}, doi = {10.1109/IROS.2014.6943157}, journal = {IEEE International Conference on Intelligent Robots and Systems}, number = {Iros} }
@article{ title = {High quality photometric reconstruction using a depth camera}, type = {article}, year = {2014}, keywords = {3D reconstruction,Kinect,depth camera,fusion methods,photometric stereo}, pages = {2283-2290}, publisher = {IEEE}, id = {690de9bf-7674-39d2-a64c-9af0e0e4dc5f}, created = {2020-11-16T10:06:03.651Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-16T10:06:12.008Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {6533efe4-7189-42a2-b4b6-a9f175595b19}, private_publication = {false}, abstract = {In this paper we present a depth-guided photometric 3D reconstruction method that works solely with a depth camera like the Kinect. Existing methods that fuse depth with normal estimates use an external RGB camera to obtain photometric information and treat the depth camera as a black box that provides a low quality depth estimate. Our contribution to such methods are two fold. Firstly, instead of using an extra RGB camera, we use the infra-red (IR) camera of the depth camera system itself to directly obtain high resolution photometric information. We believe that ours is the first method to use an IR depth camera system in this manner. Secondly, photometric methods applied to complex objects result in numerous holes in the reconstructed surface due to shadows and self-occlusions. To mitigate this problem, we develop a simple and effective multiview reconstruction approach that fuses depth and normal information from multiple viewpoints to build a complete, consistent and accurate 3D surface representation. We demonstrate the efficacy of our method to generate high quality 3D surface reconstructions for some complex 3D figurines.}, bibtype = {article}, author = {Haque, Sk Mohammadul and Chatterjee, Avishek and Govindu, Venu Madhav}, doi = {10.1109/CVPR.2014.292}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {A quantitative evaluation of surface normal estimation in point clouds}, type = {article}, year = {2014}, pages = {4220-4226}, id = {de469dd0-c4a8-331e-abf2-3d22d8377ba4}, created = {2021-01-26T07:00:24.995Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-24T14:09:30.723Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {We revisit a well-studied problem in the analysis of range data: surface normal estimation for a set of unorganized points. Surface normal estimation has been well-studied initially due to its theoretical appeal and more recently due to its many practical applications. The latter cover several aspects of range data analysis from plane or surface fitting to segmentation, object detection and scene analysis. Following the vast majority of the literature, we also focus our attention on techniques that operate in small neighborhoods around the point whose normal is to be estimated. We pay close attention to aspects of the implementation, such as the use of weights and normalization, that have not been studied in detail in the past. We perform quantitative evaluation on a diverse set of point clouds derived from 3D meshes, which allows us to obtain accurate ground truth.}, bibtype = {article}, author = {Jordan, Krzysztof and Mordohai, Philippos}, doi = {10.1109/IROS.2014.6943157}, journal = {IEEE International Conference on Intelligent Robots and Systems} }
@article{ title = {Modeling and correction of multipath interference in time of flight cameras}, type = {article}, year = {2014}, keywords = {Iterative method,Multipath Interference (MpI),Time of Flight (ToF)}, pages = {1-13}, volume = {32}, websites = {http://dx.doi.org/10.1016/j.imavis.2013.10.008}, publisher = {Elsevier B.V.}, id = {75d70986-89ac-3bbe-a53c-7b610b1d8641}, created = {2021-01-26T07:28:57.432Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-10T07:17:52.586Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47,019ca26f-d15a-40ee-aa8d-7a0fbf949316}, private_publication = {false}, abstract = {Multipath interference of light is the cause of important errors in Time of Flight (ToF) depth estimation. This paper proposes an algorithm that removes multipath distortion from a single depth map obtained by a ToF camera. Our approach does not require information about the scene, apart from ToF measurements. The method is based on fitting ToF measurements with a radiometric model. Model inputs are depth values free from multipath interference whereas model outputs consist of synthesized ToF measurements. We propose an iterative optimization algorithm that obtains model parameters that best reproduce ToF measurements, recovering the depth of the scene without distortion. We show results with both synthetic and real scenes captured by commercial ToF sensors. In all cases, our algorithm accurately corrects the multipath distortion, obtaining depth maps that are very close to ground truth data. © 2013 Elsevier B.V.}, bibtype = {article}, author = {Jiménez, David and Pizarro, Daniel and Mazo, Manuel and Palazuelos, Sira}, doi = {10.1016/j.imavis.2013.10.008}, journal = {Image and Vision Computing}, number = {1} }
@article{ title = {A probabilistic framework for next best view estimation in a cluttered environment}, type = {article}, year = {2014}, keywords = {3-D perception,Cluttered environments,Missing points,Next best view estimation,Robot exploration,Sensor placement,Sensor planning,View planning}, pages = {148-164}, volume = {25}, websites = {http://dx.doi.org/10.1016/j.jvcir.2013.07.006}, publisher = {Elsevier Inc.}, id = {992d5ae7-b0c7-3578-86be-adce5b4445fc}, created = {2021-02-09T17:05:46.759Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-03T10:14:31.527Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {86ced166-c5ab-4f59-a018-08923425b037}, private_publication = {false}, abstract = {In this article, we present an information gain-based variant of the next best view problem for occluded environment. Our proposed method utilizes a belief model of the unobserved space to estimate the expected information gain of each possible viewpoint. More precise, this belief model allows a more precise estimation of the visibility of occluded space and with that a more accurate prediction of the potential information gain of new viewing positions. We present experimental evaluation on a robotic platform for active data acquisition, however due to the generality of our approach it also applies to a wide variety of 3D reconstruction problems. With the evaluation done in simulation and on a real robotic platform, exploring and acquiring data from different environments we demonstrate the generality and usefulness of our approach for next best view estimation and autonomous data acquisition. © 2013 Elsevier Inc. All rights reserved.}, bibtype = {article}, author = {Potthast, Christian and Sukhatme, Gaurav S.}, doi = {10.1016/j.jvcir.2013.07.006}, journal = {Journal of Visual Communication and Image Representation}, number = {1} }
@article{ title = {Quality-driven Poisson-guided Autoscanning}, type = {article}, year = {2014}, keywords = {3d acquisition,autonomous scanning,next-best-view,poisson surface reconstruction}, volume = {33}, id = {fa84f665-bb2f-3ac5-8ee0-8a1fa69a9dbe}, created = {2021-02-09T17:05:46.866Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T17:05:51.600Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {86ced166-c5ab-4f59-a018-08923425b037}, private_publication = {false}, bibtype = {article}, author = {Wu, Shihao and Cohen-Or, Daniel and Deussen, Oliver and Chen, Baoquan}, journal = {ACM Transactions on Graphics} }
@article{ title = {Volumetric next-best-view planning for 3D object reconstruction with positioning error}, type = {article}, year = {2014}, keywords = {Next best view,Object reconstruction,Sensor planning,View planning}, volume = {11}, id = {e8a2a385-2ea2-3b0b-acea-1c113b6a3b6b}, created = {2021-02-09T17:05:46.873Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-03T10:14:34.596Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Vasquez-Gomez2014}, folder_uuids = {86ced166-c5ab-4f59-a018-08923425b037,5439d198-93d5-4603-a7ce-201d423f231e,4f36a0a5-b08a-4f70-b020-4daf83cb0507}, private_publication = {false}, abstract = {Three-dimensional (3D) object reconstruction is the process of building a 3D model of a real object. This task is performed by taking several scans of an object from different locations (views). Due to the limited field of view of the sensor and the object's self-occlusions, it is a difficult problem to solve. In addition, sensor positioning by robots is not perfect, making the actual view different from the expected one. We propose a next best view (NBV) algorithm that determines each view to reconstruct an arbitrary object. Furthermore, we propose a method to deal with the uncertainty in sensor positioning. The algorithm fulfills all the constraints of a reconstruction process, such as new information, positioning constraints, sensing constraints and registration constraints. Moreover, it improves the scan's quality and reduces the navigation distance. The algorithm is based on a search-based paradigm where a set of candidate views is generated and then each candidate view is evaluated to determine which one is the best. To deal with positioning uncertainty, we propose a second stage which re-evaluates the views according to their neighbours, such that the best view is that which is within a region of the good views. The results of simulation and comparisons with previous approaches are presented.}, bibtype = {article}, author = {Vasquez-Gomez, J. Irving and Sucar, L. Enrique and Murrieta-Cid, Rafael and Lopez-Damian, Efrain}, doi = {10.5772/58759}, journal = {International Journal of Advanced Robotic Systems} }
@article{ title = {Discriminatively trained dense surface normal estimation}, type = {article}, year = {2014}, pages = {468-484}, volume = {8693 LNCS}, id = {2c3dbefb-a056-3f91-9620-8a9468c484c5}, created = {2021-03-08T09:39:18.000Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-30T18:48:41.270Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Ladicky2014}, folder_uuids = {8d18e62e-6e66-4acb-ae6a-b470435041d8,71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {In this work we propose the method for a rather unexplored problem of computer vision - discriminatively trained dense surface normal estimation from a single image. Our method combines contextual and segment-based cues and builds a regressor in a boosting framework by transforming the problem into the regression of coefficients of a local coding. We apply our method to two challenging data sets containing images of man-made environments, the indoor NYU2 data set and the outdoor KITTI data set. Our surface normal predictor achieves results better than initially expected, significantly outperforming state-of-the-art. © 2014 Springer International Publishing.}, bibtype = {article}, author = {Ladický, L'ubor and Zeisl, Bernhard and Pollefeys, Marc}, doi = {10.1007/978-3-319-10602-1_31}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, number = {PART 5} }
@article{ title = {US Patent 8,665,267 B2}, type = {article}, year = {2014}, volume = {2}, id = {33317607-4687-3c2b-a253-78935c44106a}, created = {2021-04-08T11:26:53.374Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-15T08:24:38.074Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {ffa7aa64-dc15-4667-8778-6ff9b9800bbb}, private_publication = {false}, bibtype = {article}, author = {Abbasinejad, Fatemeh}, number = {12} }
@article{ title = {Depth enhancement using RGB-D guided filtering}, type = {article}, year = {2014}, pages = {3832-3836}, id = {43d0f9dd-23d0-38bd-9e2f-8f952b32f411}, created = {2021-04-15T07:14:48.300Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-16T05:21:11.202Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, bibtype = {article}, author = {Hui, Tak-Wai and Ngan, King Ngi}, journal = {IEEE International Conference on Image Processing (ICIP)}, number = {2} }
@article{ title = {Volumetric next-best-view planning for 3D object reconstruction with positioning error}, type = {article}, year = {2014}, keywords = {Next best view,Object reconstruction,Sensor planning,View planning}, volume = {11}, id = {5fe1d2f5-bf76-3469-ad0b-048399fe32ea}, created = {2021-04-22T14:53:29.927Z}, file_attached = {false}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-22T14:53:30.648Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {07e07de9-bcac-4934-a82b-d0aff540e56d}, private_publication = {false}, abstract = {Three-dimensional (3D) object reconstruction is the process of building a 3D model of a real object. This task is performed by taking several scans of an object from different locations (views). Due to the limited field of view of the sensor and the object's self-occlusions, it is a difficult problem to solve. In addition, sensor positioning by robots is not perfect, making the actual view different from the expected one. We propose a next best view (NBV) algorithm that determines each view to reconstruct an arbitrary object. Furthermore, we propose a method to deal with the uncertainty in sensor positioning. The algorithm fulfills all the constraints of a reconstruction process, such as new information, positioning constraints, sensing constraints and registration constraints. Moreover, it improves the scan's quality and reduces the navigation distance. The algorithm is based on a search-based paradigm where a set of candidate views is generated and then each candidate view is evaluated to determine which one is the best. To deal with positioning uncertainty, we propose a second stage which re-evaluates the views according to their neighbours, such that the best view is that which is within a region of the good views. The results of simulation and comparisons with previous approaches are presented.}, bibtype = {article}, author = {Vasquez-Gomez, J. Irving and Sucar, L. Enrique and Murrieta-Cid, Rafael and Lopez-Damian, Efrain}, doi = {10.5772/58759}, journal = {International Journal of Advanced Robotic Systems} }
@article{ title = {On the number of linear regions of deep neural networks}, type = {article}, year = {2014}, keywords = {Deep learning,Input space partition,Maxout,Neural network,Rectifier}, pages = {2924-2932}, volume = {4}, id = {6772942b-0c94-32a4-b5e1-414e52642b08}, created = {2021-07-12T14:15:34.894Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:16:31.920Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {We study the complexity of functions computable by deep feedforward neural networks with piecewise linear activations in terms of the symmetries and the number of linear regions that they have. Deep networks are able to sequentially map portions of each layer's input-space to the same output. In this way, deep models compute functions that react equally to complicated patterns of different inputs. The compositional structure of these functions enables them to re-use pieces of computation exponentially often in terms of the network's depth. This paper investigates the complexity of such compositional maps and contributes new theoretical results regarding the advantage of depth for neural networks with piecewise linear activation functions. In particular, our analysis is not specific to a single family of models, and as an example, we employ it for rectifier and maxout networks. We improve complexity bounds from pre-existing work and investigate the behavior of units in higher layers.}, bibtype = {article}, author = {Montúfar, Guido and Pascanu, Razvan and Cho, Kyunghyun and Bengio, Yoshua}, journal = {Advances in Neural Information Processing Systems}, number = {January} }
@article{ title = {Visualizing and understanding convolutional networks}, type = {article}, year = {2014}, pages = {818-833}, volume = {8689 LNCS}, id = {a09c3b67-8ee8-3e53-b3b9-350adaf60942}, created = {2021-07-12T14:15:34.898Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:16:40.321Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {Large Convolutional Network models have recently demonstrated impressive classification performance on the ImageNet benchmark Krizhevsky et al. [18]. However there is no clear understanding of why they perform so well, or how they might be improved. In this paper we explore both issues. We introduce a novel visualization technique that gives insight into the function of intermediate feature layers and the operation of the classifier. Used in a diagnostic role, these visualizations allow us to find model architectures that outperform Krizhevsky et al on the ImageNet classification benchmark. We also perform an ablation study to discover the performance contribution from different model layers. We show our ImageNet model generalizes well to other datasets: when the softmax classifier is retrained, it convincingly beats the current state-of-the-art results on Caltech-101 and Caltech-256 datasets. © 2014 Springer International Publishing.}, bibtype = {article}, author = {Zeiler, Matthew D. and Fergus, Rob}, doi = {10.1007/978-3-319-10590-1_53}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, number = {PART 1} }
@article{ title = {Deep Directed Generative Autoencoders}, type = {article}, year = {2014}, pages = {1-10}, websites = {http://arxiv.org/abs/1410.0630}, id = {3b7094de-db7f-315b-a048-e34e37d4af03}, created = {2021-07-12T14:15:35.043Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:16:40.842Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {For discrete data, the likelihood $P(x)$ can be rewritten exactly and parametrized into $P(X = x) = P(X = x | H = f(x)) P(H = f(x))$ if $P(X | H)$ has enough capacity to put no probability mass on any $x'$ for which $f(x')\neq f(x)$, where $f(\cdot)$ is a deterministic discrete function. The log of the first factor gives rise to the log-likelihood reconstruction error of an autoencoder with $f(\cdot)$ as the encoder and $P(X|H)$ as the (probabilistic) decoder. The log of the second term can be seen as a regularizer on the encoded activations $h=f(x)$, e.g., as in sparse autoencoders. Both encoder and decoder can be represented by a deep neural network and trained to maximize the average of the optimal log-likelihood $\log p(x)$. The objective is to learn an encoder $f(\cdot)$ that maps $X$ to $f(X)$ that has a much simpler distribution than $X$ itself, estimated by $P(H)$. This "flattens the manifold" or concentrates probability mass in a smaller number of (relevant) dimensions over which the distribution factorizes. Generating samples from the model is straightforward using ancestral sampling. One challenge is that regular back-propagation cannot be used to obtain the gradient on the parameters of the encoder, but we find that using the straight-through estimator works well here. We also find that although optimizing a single level of such architecture may be difficult, much better results can be obtained by pre-training and stacking them, gradually transforming the data distribution into one that is more easily captured by a simple parametric model.}, bibtype = {article}, author = {Ozair, Sherjil and Bengio, Yoshua} }
@article{ title = {How to construct deep recurrent neural networks}, type = {article}, year = {2014}, pages = {1-13}, id = {d426ca1c-5870-3137-b5d9-b177fe5bfbf3}, created = {2021-07-12T14:15:35.237Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:17:09.989Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {In this paper, we explore different ways to extend a recurrent neural network (RNN) to a deep RNN. We start by arguing that the concept of depth in an RNN is not as clear as it is in feedforward neural networks. By carefully analyzing and understanding the architecture of an RNN, however, we find three points of an RNN which may be made deeper; (1) input-to-hidden function, (2) hidden-to-hidden transition and (3) hidden-to-output function. Based on this observation, we propose two novel architectures of a deep RNN which are orthogonal to an earlier attempt of stacking multiple recurrent layers to build a deep RNN (Schmidhuber, 1992; El Hihi and Bengio, 1996). We provide an alternative interpretation of these deep RNNs using a novel framework based on neural operators. The proposed deep RNNs are empirically evaluated on the tasks of polyphonic music prediction and language modeling. The experimental result supports our claim that the proposed deep RNNs benefit from the depth and outperform the conventional, shallow RNNs.}, bibtype = {article}, author = {Pascanu, Razvan and Gulcehre, Caglar and Cho, Kyunghyun and Bengio, Yoshua}, journal = {2nd International Conference on Learning Representations, ICLR 2014 - Conference Track Proceedings} }
@article{ title = {Identifying and attacking the saddle point problem in high-dimensional non-convex optimization}, type = {article}, year = {2014}, pages = {2933-2941}, volume = {4}, id = {b4d9ac5d-56da-36fa-aec6-d073af59e173}, created = {2021-07-12T14:15:35.509Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:16:44.904Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {A central challenge to many fields of science and engineering involves minimizing non-convex error functions over continuous, high dimensional spaces. Gradient descent or quasi-Newton methods are almost ubiquitously used to perform such minimizations, and it is often thought that a main source of difficulty for these local methods to find the global minimum is the proliferation of local minima with much higher error than the global minimum. Here we argue, based on results from statistical physics, random matrix theory, neural network theory, and empirical evidence, that a deeper and more profound difficulty originates from the proliferation of saddle points, not local minima, especially in high dimensional problems of practical interest. Such saddle points are surrounded by high error plateaus that can dramatically slow down learning, and give the illusory impression of the existence of a local minimum. Motivated by these arguments, we propose a new approach to second-order optimization, the saddle-free Newton method, that can rapidly escape high dimensional saddle points, unlike gradient descent and quasi-Newton methods. We apply this algorithm to deep or recurrent neural network training, and provide numerical evidence for its superior optimization performance.}, bibtype = {article}, author = {Dauphin, Yann N. and Pascanu, Razvan and Gulcehre, Caglar and Cho, Kyunghyun and Ganguli, Surya and Bengio, Yoshua}, journal = {Advances in Neural Information Processing Systems}, number = {January} }
@article{ title = {Exact solutions to the nonlinear dynamics of learning in deep linear neural networks}, type = {article}, year = {2014}, pages = {1-22}, id = {81f33aff-0ef2-3965-bec8-d087d60d5a32}, created = {2021-07-12T14:15:35.679Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:16:43.580Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {Despite the widespread practical success of deep learning methods, our theoretical understanding of the dynamics of learning in deep neural networks remains quite sparse. We attempt to bridge the gap between the theory and practice of deep learning by systematically analyzing learning dynamics for the restricted case of deep linear neural networks. Despite the linearity of their input-output map, such networks have nonlinear gradient descent dynamics on weights that change with the addition of each new hidden layer. We show that deep linear networks exhibit nonlinear learning phenomena similar to those seen in simulations of nonlinear networks, including long plateaus followed by rapid transitions to lower error solutions, and faster convergence from greedy unsupervised pretraining initial conditions than from random initial conditions. We provide an analytical description of these phenomena by finding new exact solutions to the nonlinear dynamics of deep learning. Our theoretical analysis also reveals the surprising finding that as the depth of a network approaches infinity, learning speed can nevertheless remain finite: for a special class of initial conditions on the weights, very deep networks incur only a finite, depth independent, delay in learning speed relative to shallow networks. We show that, under certain conditions on the training data, unsupervised pretraining can find this special class of initial conditions, while scaled random Gaussian initializations cannot. We further exhibit a new class of random orthogonal initial conditions on weights that, like unsupervised pre-training, enjoys depth independent learning times. We further show that these initial conditions also lead to faithful propagation of gradients even in deep nonlinear networks, as long as they operate in a special regime known as the edge of chaos.}, bibtype = {article}, author = {Saxe, Andrew M. and McClelland, James L. and Ganguli, Surya}, journal = {2nd International Conference on Learning Representations, ICLR 2014 - Conference Track Proceedings} }
@article{ title = {Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification}, type = {article}, year = {2015}, pages = {1026-1034}, id = {e29ebada-517e-376c-ad4f-1c622ddbcedc}, created = {2021-07-12T14:15:35.878Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:16:56.676Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {Rectified activation units (rectifiers) are essential for state-of-the-art neural networks. In this work, we study rectifier neural networks for image classification from two aspects. First, we propose a Parametric Rectified Linear Unit (PReLU) that generalizes the traditional rectified unit. PReLU improves model fitting with nearly zero extra computational cost and little overfitting risk. Second, we derive a robust initialization method that particularly considers the rectifier nonlinearities. This method enables us to train extremely deep rectified models directly from scratch and to investigate deeper or wider network architectures. Based on our PReLU networks (PReLU-nets), we achieve 4.94% top-5 test error on the ImageNet 2012 classification dataset. This is a 26% relative improvement over the ILSVRC 2014 winner (GoogLeNet, 6.66% [29]). To our knowledge, our result is the first to surpass human-level performance (5.1%, [22]) on this visual recognition challenge.}, bibtype = {article}, author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {Spectral networks and deep locally connected networks on graphs}, type = {article}, year = {2014}, pages = {1-14}, id = {41b36570-8ca5-37eb-952a-c7cdbba52443}, created = {2021-08-04T09:51:19.993Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-21T13:25:20.827Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Bruna2014}, private_publication = {false}, abstract = {Convolutional Neural Networks are extremely efficient architectures in image and audio recognition tasks, thanks to their ability to exploit the local translational invariance of signal classes over their domain. In this paper we consider possible generalizations of CNNs to signals defined on more general domains without the action of a translation group. In particular, we propose two constructions, one based upon a hierarchical clustering of the domain, and another based on the spectrum of the graph Laplacian. We show through experiments that for low-dimensional graphs it is possible to learn convolutional layers with a number of parameters independent of the input size, resulting in efficient deep architectures.}, bibtype = {article}, author = {Bruna, Joan and Zaremba, Wojciech and Szlam, Arthur and LeCun, Yann}, journal = {2nd International Conference on Learning Representations, ICLR 2014 - Conference Track Proceedings} }
@article{ title = {Hyperspherical Variational Auto-Encoders}, type = {article}, year = {2018}, id = {030e1411-9e2a-323d-a46c-920e182bc42c}, created = {2021-08-30T18:48:39.129Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.168Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Auto-encoders2014}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, bibtype = {article}, author = {Davidson, Tim R. and Falorsi, Luca and De Cao, Nicola and Kipf, Thomas and Tomczak, Jakub M.} }
@article{ title = {Stochastic backpropagation and approximate inference in deep generative models}, type = {article}, year = {2014}, pages = {3057-3070}, volume = {4}, id = {dc6ac099-8b9f-3f71-a5fd-6ceb0f5b34f5}, created = {2021-09-09T14:35:21.128Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.539Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Rezende2014}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, abstract = {We marry ideas from deep neural networks and approximate Bayesian inference to derive a generalised class of deep, directed generative models, endowed with a new algorithm for scalable inference and learning. Our algorithm introduces a recognition model to represent an approximate posterior distribution and uses this for optimisation of a variational lower bound. We develop stochastic backpropagation - rules for gradient backpropagation through stochastic variables - and derive an algorithm that allows for joint optimisation of the parameters of both the generative and recognition models. We demonstrate on several real-world data sets that by using stochastic backpropagation and variational inference, we obtain models that are able to generate realistic samples of data, allow for accurate imputations of missing data, and provide a useful tool for high-dimensional data visualisation.}, bibtype = {article}, author = {Rezende, Danilo Jimenez and Mohamed, Shakir and Wierstra, Daan}, journal = {31st International Conference on Machine Learning, ICML 2014} }
@article{ title = {Semi-supervised learning with deep generative models}, type = {article}, year = {2014}, pages = {3581-3589}, volume = {4}, id = {7863cc34-980a-36aa-8ed2-f13a21e537d3}, created = {2021-09-09T14:35:21.228Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.375Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kingma2014}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, abstract = {The ever-increasing size of modern data sets combined with the difficulty of obtaining label information has made semi-supervised learning one of the problems of significant practical importance in modern data analysis. We revisit the approach to semi-supervised learning with generative models and develop new models that allow for effective generalisation from small labelled data sets to large unlabelled ones. Generative approaches have thus far been either inflexible, inefficient or non-scalable. We show that deep generative models and approximate Bayesian inference exploiting recent advances in variational methods can be used to provide significant improvements, making generative approaches highly competitive for semi-supervised learning.}, bibtype = {article}, author = {Kingma, Diederik P. and Rezende, Danilo J. and Mohamed, Shakir and Welling, Max}, journal = {Advances in Neural Information Processing Systems}, number = {January} }
@article{ title = {Rényi Divergence and Kullback–Leibler Divergence}, type = {article}, year = {2014}, pages = {3797-3820}, volume = {60}, publisher = {IEEE}, id = {3ef66108-49db-36f8-a4e5-cbe7d9438bdd}, created = {2021-09-21T07:33:07.673Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.107Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Erven2014}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, bibtype = {article}, author = {van Erven, Tim and Harremoës, Peter}, journal = {IEEE Transactions on Information Theory}, number = {7} }
@article{ title = {Spatial pyramid pooling in deep convolutional networks for visual recognition}, type = {article}, year = {2014}, pages = {346-361}, volume = {8691 LNCS}, id = {97047425-7e0a-3473-a2a7-99d3ff6a77f2}, created = {2021-11-01T10:14:38.921Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.334Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {He2014}, folder_uuids = {cd02f564-0123-4236-a320-b339927f085a}, private_publication = {false}, abstract = {Existing deep convolutional neural networks (CNNs) require a fixed-size (e.g. 224×224) input image. This requirement is "artificial" and may hurt the recognition accuracy for the images or sub-images of an arbitrary size/scale. In this work, we equip the networks with a more principled pooling strategy, "spatial pyramid pooling", to eliminate the above requirement. The new network structure, called SPP-net, can generate a fixed-length representation regardless of image size/scale. By removing the fixed-size limitation, we can improve all CNN-based image classification methods in general. Our SPP-net achieves state-of-the-art accuracy on the datasets of ImageNet 2012, Pascal VOC 2007, and Caltech101. The power of SPP-net is more significant in object detection. Using SPP-net, we compute the feature maps from the entire image only once, and then pool features in arbitrary regions (sub-images) to generate fixed-length representations for training the detectors. This method avoids repeatedly computing the convolutional features. In processing test images, our method computes convolutional features 30-170× faster than the recent leading method R-CNN (and 24-64× faster overall), while achieving better or comparable accuracy on Pascal VOC 2007. © 2014 Springer International Publishing.}, bibtype = {article}, author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, doi = {10.1007/978-3-319-10578-9_23}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, number = {PART 3} }
@article{ title = {SHOT: Unique signatures of histograms for surface and texture description}, type = {article}, year = {2014}, keywords = {3D descriptors,3D reconstruction,Object recognition,Surface matching}, pages = {251-264}, volume = {125}, publisher = {Academic Press Inc.}, id = {ae6759df-e2b0-3718-a147-3ce92b18b41a}, created = {2022-02-15T12:27:36.555Z}, accessed = {2022-02-15}, file_attached = {false}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.184Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Salti2014}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {This paper presents a local 3D descriptor for surface matching dubbed SHOT. Our proposal stems from a taxonomy of existing methods which highlights two major approaches, referred to as Signatures and Histograms, inherently emphasizing descriptiveness and robustness respectively. We formulate a comprehensive proposal which encompasses a repeatable local reference frame as well as a 3D descriptor, the latter featuring an hybrid structure between Signatures and Histograms so as to aim at a more favorable balance between descriptive power and robustness. A quite peculiar trait of our method concerns seamless integration of multiple cues within the descriptor to improve distinctiveness, which is particularly relevant nowadays due to the increasing availability of affordable RGB-D sensors which can gather both depth and color information. A thorough experimental evaluation based on datasets acquired with different types of sensors, including a novel RGB-D dataset, vouches that SHOT outperforms state-of-the-art local descriptors in experiments addressing descriptor matching for object recognition, 3D reconstruction and shape retrieval. © 2014 Elsevier Inc. All rights reserved.}, bibtype = {article}, author = {Salti, Samuele and Tombari, Federico and Di Stefano, Luigi}, doi = {10.1016/J.CVIU.2014.04.011}, journal = {Computer Vision and Image Understanding} }
@article{ title = {Object detection and classification from large-scale cluttered indoor scans}, type = {article}, year = {2014}, pages = {11-21}, volume = {33}, id = {f83ea980-711b-3e40-8e94-def5a3a46dee}, created = {2022-03-11T09:36:29.878Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-18T10:02:57.557Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {1e7b477c-c241-48c3-a542-ad06e3d39dd5}, private_publication = {false}, abstract = {We present a method to automatically segment indoor scenes by detecting repeated objects. Our algorithm scales to datasets with 198 million points and does not require any training data. We propose a trivially parallelizable preprocessing step, which compresses a point cloud into a collection of nearly-planar patches related by geometric transformations. This representation enables us to robustly filter out noise and greatly reduces the computational cost and memory requirements of our method, enabling execution at interactive rates. We propose a patch similarity measure based on shape descriptors and spatial configurations of neighboring patches. The patches are clustered in a Euclidean embedding space based on the similarity matrix to yield the segmentation of the input point cloud. The generated segmentation can be used to compress the raw point cloud, create an object database, and increase the clarity of the point cloud visualization. © 2014 The Author(s) Computer Graphics Forum © 2014 The Eurographics Association and John Wiley & Sons Ltd. Published by John Wiley & Sons Ltd.}, bibtype = {article}, author = {Mattausch, Oliver and Panozzo, Daniele and Mura, Claudio and Sorkine-Hornung, Olga and Pajarola, Renato}, doi = {10.1111/cgf.12286}, journal = {Computer Graphics Forum}, number = {2} }
@inproceedings{ title = {DeepWalk: online learning of social representations}, type = {inproceedings}, year = {2014}, keywords = {deep learning,latent representations,learning with partial labels,network classification,online learning,social networks}, pages = {701-710}, websites = {https://doi.org/10.1145/2623330.2623732}, month = {8}, publisher = {Association for Computing Machinery}, city = {New York, NY, USA}, series = {KDD '14}, id = {a1b11214-4b27-359e-b7dc-0d26786e45bd}, created = {2022-03-28T09:45:00.821Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:01:21.751Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {perozziDeepWalkOnlineLearning2014}, source_type = {inproceedings}, short_title = {DeepWalk}, private_publication = {false}, abstract = {We present DeepWalk, a novel approach for learning latent representations of vertices in a network. These latent representations encode social relations in a continuous vector space, which is easily exploited by statistical models. DeepWalk generalizes recent advancements in language modeling and unsupervised feature learning (or deep learning) from sequences of words to graphs. DeepWalk uses local information obtained from truncated random walks to learn latent representations by treating walks as the equivalent of sentences. We demonstrate DeepWalk's latent representations on several multi-label network classification tasks for social networks such as BlogCatalog, Flickr, and YouTube. Our results show that DeepWalk outperforms challenging baselines which are allowed a global view of the network, especially in the presence of missing information. DeepWalk's representations can provide F1 scores up to 10\% higher than competing methods when labeled data is sparse. In some experiments, DeepWalk's representations are able to outperform all baseline methods while using 60\% less training data. DeepWalk is also scalable. It is an online learning algorithm which builds useful incremental results, and is trivially parallelizable. These qualities make it suitable for a broad class of real world applications such as network classification, and anomaly detection.}, bibtype = {inproceedings}, author = {Perozzi, Bryan and Al-Rfou, Rami and Skiena, Steven}, doi = {10.1145/2623330.2623732}, booktitle = {Proceedings of the 20th ACM SIGKDD international conference on Knowledge discovery and data mining} }
@inproceedings{ title = {Generative Adversarial Nets}, type = {inproceedings}, year = {2014}, volume = {27}, websites = {https://proceedings.neurips.cc/paper/2014/hash/5ca3e9b122f61f8f06494c97b1afccf3-Abstract.html}, publisher = {Curran Associates, Inc.}, id = {c29d6ca4-fbe1-3a79-a5d8-ad6f42e190f3}, created = {2022-03-28T09:45:01.550Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:02:23.132Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {goodfellowGenerativeAdversarialNets2014}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua}, booktitle = {Advances in Neural Information Processing Systems} }
@article{ title = {Representation Learning: A Review and New Perspectives}, type = {article}, year = {2014}, keywords = {Computer Science - Machine Learning}, websites = {http://arxiv.org/abs/1206.5538}, month = {4}, id = {d71022e1-aa8f-3b2a-bd1b-e7e63fe0b3b6}, created = {2022-03-28T09:45:01.609Z}, accessed = {2022-03-22}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:02:13.653Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {bengioRepresentationLearningReview2014}, source_type = {article}, short_title = {Representation Learning}, notes = {arXiv: 1206.5538}, private_publication = {false}, abstract = {The success of machine learning algorithms generally depends on data representation, and we hypothesize that this is because different representations can entangle and hide more or less the different explanatory factors of variation behind the data. Although specific domain knowledge can be used to help design representations, learning with generic priors can also be used, and the quest for AI is motivating the design of more powerful representation-learning algorithms implementing such priors. This paper reviews recent work in the area of unsupervised feature learning and deep learning, covering advances in probabilistic models, auto-encoders, manifold learning, and deep networks. This motivates longer-term unanswered questions about the appropriate objectives for learning good representations, for computing representations (i.e., inference), and the geometrical connections between representation learning, density estimation and manifold learning.}, bibtype = {article}, author = {Bengio, Yoshua and Courville, Aaron and Vincent, Pascal}, journal = {arXiv:1206.5538 [cs]} }
@article{ title = {Human3.6M: Large Scale Datasets and Predictive Methods for 3D Human Sensing in Natural Environments}, type = {article}, year = {2014}, keywords = {3D human pose estimation,Cameras,Estimation,Fourier kernel approximations,Joints,Modeling and recovery of physical attributes,Motion,Sensors,Solid modeling,Three-dimensional displays,Training,articulated body modeling,human motion capture data,large-scale learning,optimization,structured prediction}, pages = {1325-1339}, volume = {36}, month = {7}, id = {5c22c9ee-ff8a-3f65-8f2e-b0e226821844}, created = {2022-03-28T09:45:02.757Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:04:24.355Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {ionescuHuman36MLarge2014}, source_type = {article}, short_title = {Human3.6M}, notes = {Conference Name: IEEE Transactions on Pattern Analysis and Machine Intelligence}, private_publication = {false}, abstract = {We introduce a new dataset, Human3.6M, of 3.6 Million accurate 3D Human poses, acquired by recording the performance of 5 female and 6 male subjects, under 4 different viewpoints, for training realistic human sensing systems and for evaluating the next generation of human pose estimation models and algorithms. Besides increasing the size of the datasets in the current state-of-the-art by several orders of magnitude, we also aim to complement such datasets with a diverse set of motions and poses encountered as part of typical human activities (taking photos, talking on the phone, posing, greeting, eating, etc.), with additional synchronized image, human motion capture, and time of flight (depth) data, and with accurate 3D body scans of all the subject actors involved. We also provide controlled mixed reality evaluation scenarios where 3D human models are animated using motion capture and inserted using correct 3D geometry, in complex real environments, viewed with moving cameras, and under occlusion. Finally, we provide a set of large-scale statistical models and detailed evaluation baselines for the dataset illustrating its diversity and the scope for improvement by future work in the research community. Our experiments show that our best large-scale model can leverage our full training set to obtain a 20\% improvement in performance compared to a training set of the scale of the largest existing public dataset for this problem. Yet the potential for improvement by leveraging higher capacity, more complex models with our large dataset, is substantially vaster and should stimulate future research. The dataset together with code for the associated large-scale learning models, features, visualization tools, as well as the evaluation server, is available online at http://vision.imar.ro/human3.6m.}, bibtype = {article}, author = {Ionescu, Catalin and Papava, Dragos and Olaru, Vlad and Sminchisescu, Cristian}, doi = {10.1109/TPAMI.2013.248}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, number = {7} }
@inproceedings{ title = {Beyond PASCAL: A benchmark for 3D object detection in the wild}, type = {inproceedings}, year = {2014}, keywords = {Abstracts,Bicycles,Boats,Design automation,Motorcycles,Solid modeling}, pages = {75-82}, month = {3}, id = {ed20bea1-5f0e-373a-a1fe-382359438794}, created = {2022-03-28T09:45:03.175Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:04:33.879Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {xiangPASCALBenchmark3D2014}, source_type = {inproceedings}, short_title = {Beyond PASCAL}, notes = {ISSN: 1550-5790}, private_publication = {false}, abstract = {3D object detection and pose estimation methods have become popular in recent years since they can handle ambiguities in 2D images and also provide a richer description for objects compared to 2D object detectors. However, most of the datasets for 3D recognition are limited to a small amount of images per category or are captured in controlled environments. In this paper, we contribute PASCAL3D+ dataset, which is a novel and challenging dataset for 3D object detection and pose estimation. PASCAL3D+ augments 12 rigid categories of the PASCAL VOC 2012 [4] with 3D annotations. Furthermore, more images are added for each category from ImageNet [3]. PASCAL3D+ images exhibit much more variability compared to the existing 3D datasets, and on average there are more than 3,000 object instances per category. We believe this dataset will provide a rich testbed to study 3D detection and pose estimation and will help to significantly push forward research in this area. We provide the results of variations of DPM [6] on our new dataset for object detection and viewpoint estimation in different scenarios, which can be used as baselines for the community. Our benchmark is available online at http://cvgl.stanford.edu/projects/pascal3d.}, bibtype = {inproceedings}, author = {Xiang, Yu and Mottaghi, Roozbeh and Savarese, Silvio}, doi = {10.1109/WACV.2014.6836101}, booktitle = {IEEE Winter Conference on Applications of Computer Vision} }
@inproceedings{ title = {Deep AutoRegressive Networks}, type = {inproceedings}, year = {2014}, pages = {1242-1250}, websites = {https://proceedings.mlr.press/v32/gregor14.html}, month = {6}, publisher = {PMLR}, id = {7d48132b-cfac-3cc2-9d92-fd0097c8d793}, created = {2022-03-28T09:45:03.562Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:05:30.091Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {gregorDeepAutoRegressiveNetworks2014a}, source_type = {inproceedings}, notes = {ISSN: 1938-7228}, private_publication = {false}, abstract = {We introduce a deep, generative autoencoder capable of learning hierarchies of distributed representations from data. Successive deep stochastic hidden layers are equipped with autoregressive connections, which enable the model to be sampled from quickly and exactly via ancestral sampling. We derive an efficient approximate parameter estimation method based on the minimum description length (MDL) principle, which can be seen as maximising a variational lower bound on the log-likelihood, with a feedforward neural network implementing approximate inference. We demonstrate state-of-the-art generative performance on a number of classic data sets: several UCI data sets, MNIST and Atari 2600 games.}, bibtype = {inproceedings}, author = {Gregor, Karol and Danihelka, Ivo and Mnih, Andriy and Blundell, Charles and Wierstra, Daan}, booktitle = {Proceedings of the 31st International Conference on Machine Learning} }
@inproceedings{ title = {Deep Generative Stochastic Networks Trainable by Backprop}, type = {inproceedings}, year = {2014}, pages = {226-234}, websites = {https://proceedings.mlr.press/v32/bengio14.html}, month = {6}, publisher = {PMLR}, id = {f0cfa1c7-45de-32f1-8581-cecca69cda5f}, created = {2022-03-28T09:45:04.285Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:07:26.521Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {bengioDeepGenerativeStochastic2014}, source_type = {inproceedings}, notes = {ISSN: 1938-7228}, private_publication = {false}, abstract = {We introduce a novel training principle for probabilistic models that is an alternative to maximum likelihood. The proposed Generative Stochastic Networks (GSN) framework is based on learning the transition operator of a Markov chain whose stationary distribution estimates the data distribution. Because the transition distribution is a conditional distribution generally involving a small move, it has fewer dominant modes, being unimodal in the limit of small moves. Thus, it is easier to learn, more like learning to perform supervised function approximation, with gradients that can be obtained by backprop. The theorems provided here generalize recent work on the probabilistic interpretation of denoising autoencoders and provide an interesting justification for dependency networks and generalized pseudolikelihood (along with defining an appropriate joint distribution and sampling mechanism, even when the conditionals are not consistent). GSNs can be used with missing inputs and can be used to sample subsets of variables given the rest. Successful experiments are conducted, validating these theoretical results, on two image datasets and with a particular architecture that mimics the Deep Boltzmann Machine Gibbs sampler but allows training to proceed with backprop, without the need for layerwise pretraining.}, bibtype = {inproceedings}, author = {Bengio, Yoshua and Laufer, Eric and Alain, Guillaume and Yosinski, Jason}, booktitle = {Proceedings of the 31st International Conference on Machine Learning} }
@inproceedings{ title = {FAUST: Dataset and Evaluation for 3D Mesh Registration}, type = {inproceedings}, year = {2014}, pages = {3794-3801}, websites = {https://www.cv-foundation.org/openaccess/content_cvpr_2014/html/Bogo_FAUST_Dataset_and_2014_CVPR_paper.html}, id = {4b894ff1-10a7-3bc9-80d0-18f56fe15fdd}, created = {2022-03-28T09:45:04.835Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:08:01.088Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {bogoFAUSTDatasetEvaluation2014}, source_type = {inproceedings}, short_title = {FAUST}, private_publication = {false}, bibtype = {inproceedings}, author = {Bogo, Federica and Romero, Javier and Loper, Matthew and Black, Michael J} }
@article{ title = {A Survey on Procedural Modelling for Virtual Worlds}, type = {article}, year = {2014}, keywords = {Computer Graphics I3.5: Computational Geometry and,languages,procedural content generation,procedural modeling methods,systems,virtual worlds}, pages = {31-50}, volume = {33}, websites = {https://onlinelibrary.wiley.com/doi/abs/10.1111/cgf.12276}, id = {9ddd9845-2043-32c4-97f9-7f5b643397ec}, created = {2022-03-28T09:45:04.974Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:10:14.793Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {smelikSurveyProceduralModelling2014}, source_type = {article}, notes = {\_eprint: https://onlinelibrary.wiley.com/doi/pdf/10.1111/cgf.12276}, private_publication = {false}, abstract = {Procedural modelling deals with (semi-)automatic content generation by means of a program or procedure. Among other advantages, its data compression and the potential to generate a large variety of detailed content with reduced human intervention, have made procedural modelling attractive for creating virtual environments increasingly used in movies, games and simulations. We survey procedural methods that are useful to generate features of virtual worlds, including terrains, vegetation, rivers, roads, buildings and entire cities. In this survey, we focus particularly on the degree of intuitive control and of interactivity offered by each procedural method, because these properties are instrumental for their typical users: designers and artists. We identify the most promising research results that have been recently achieved, but we also realize that there is far from widespread acceptance of procedural methods among non-technical, creative professionals. We conclude by discussing some of the most important challenges of procedural modelling.}, bibtype = {article}, author = {Smelik, Ruben M and Tutenel, Tim and Bidarra, Rafael and Benes, Bedrich}, doi = {10.1111/cgf.12276}, journal = {Computer Graphics Forum}, number = {6} }
@inproceedings{ title = {Generalized Autoencoder: A Neural Network Framework for Dimensionality Reduction}, type = {inproceedings}, year = {2014}, pages = {490-497}, websites = {https://www.cv-foundation.org/openaccess/content_cvpr_workshops_2014/W15/html/Wang_Generalized_Autoencoder_A_2014_CVPR_paper.html}, id = {c6593de3-453c-377f-860f-1394a0151844}, created = {2022-03-28T09:45:05.577Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:22:15.335Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {wangGeneralizedAutoencoderNeural2014}, source_type = {inproceedings}, short_title = {Generalized Autoencoder}, private_publication = {false}, bibtype = {inproceedings}, author = {Wang, Wei and Huang, Yan and Wang, Yizhou and Wang, Liang} }
@article{ title = {Auto-Encoding Variational Bayes}, type = {article}, year = {2014}, keywords = {Computer Science - Machine Learning,Statistics - Machine Learning}, websites = {http://arxiv.org/abs/1312.6114}, month = {5}, id = {8bd3dccc-9ea6-34de-905a-85e814d02cc8}, created = {2022-03-28T09:45:06.063Z}, accessed = {2021-09-14}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-01T09:16:22.235Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {kingmaAutoEncodingVariationalBayes2014}, source_type = {article}, notes = {arXiv: 1312.6114}, private_publication = {false}, abstract = {How can we perform efficient inference and learning in directed probabilistic models, in the presence of continuous latent variables with intractable posterior distributions, and large datasets? We introduce a stochastic variational inference and learning algorithm that scales to large datasets and, under some mild differentiability conditions, even works in the intractable case. Our contributions is two-fold. First, we show that a reparameterization of the variational lower bound yields a lower bound estimator that can be straightforwardly optimized using standard stochastic gradient methods. Second, we show that for i.i.d. datasets with continuous latent variables per datapoint, posterior inference can be made especially efficient by fitting an approximate inference model (also called a recognition model) to the intractable posterior using the proposed lower bound estimator. Theoretical advantages are reflected in experimental results.}, bibtype = {article}, author = {Kingma, Diederik P and Welling, Max}, journal = {arXiv:1312.6114 [cs, stat]} }
@inproceedings{ title = {A Hierarchical Representation for Future Action Prediction}, type = {inproceedings}, year = {2014}, keywords = {Dynamic Time Warping,Future Action,Human Movement,Motion Segment,Video Clip}, pages = {689-704}, publisher = {Springer International Publishing}, city = {Cham}, series = {Lecture Notes in Computer Science}, id = {d4d2b2fd-ea4f-39c2-80e5-cf6490346683}, created = {2022-03-28T09:45:06.257Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-01T09:16:15.589Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {lanHierarchicalRepresentationFuture2014}, source_type = {inproceedings}, private_publication = {false}, abstract = {We consider inferring the future actions of people from a still image or a short video clip. Predicting future actions before they are actually executed is a critical ingredient for enabling us to effectively interact with other humans on a daily basis. However, challenges are two fold: First, we need to capture the subtle details inherent in human movements that may imply a future action; second, predictions usually should be carried out as quickly as possible in the social world, when limited prior observations are available.In this paper, we propose hierarchical movemes - a new representation to describe human movements at multiple levels of granularities, ranging from atomic movements (e.g. an open arm) to coarser movements that cover a larger temporal extent. We develop a max-margin learning framework for future action prediction, integrating a collection of moveme detectors in a hierarchical way. We validate our method on two publicly available datasets and show that it achieves very promising performance.}, bibtype = {inproceedings}, author = {Lan, Tian and Chen, Tsung-Chuan and Savarese, Silvio}, editor = {Fleet, David and Pajdla, Tomas and Schiele, Bernt and Tuytelaars, Tinne}, doi = {10.1007/978-3-319-10578-9_45}, booktitle = {Computer Vision – ECCV 2014} }
@article{ title = {Rényi Divergence and Kullback-Leibler Divergence}, type = {article}, year = {2014}, keywords = {Bhattacharyya distance,Convergence,Data processing,Entropy,Kullback-Leibler divergence,Markov processes,Pythagorean inequality,Q measurement,Rényi divergence,Testing,information divergence}, pages = {3797-3820}, volume = {60}, month = {7}, id = {0151cb2f-7939-37f9-8d5c-bfe462f5ad8f}, created = {2022-03-28T09:45:06.829Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:00:04.256Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {vanervenRenyiDivergenceKullbackLeibler2014}, source_type = {article}, notes = {Conference Name: IEEE Transactions on Information Theory}, private_publication = {false}, abstract = {Rényi divergence is related to Rényi entropy much like Kullback-Leibler divergence is related to Shannon's entropy, and comes up in many settings. It was introduced by Rényi as a measure of information that satisfies almost the same axioms as Kullback-Leibler divergence, and depends on a parameter that is called its order. In particular, the Rényi divergence of order 1 equals the Kullback-Leibler divergence. We review and extend the most important properties of Rényi divergence and Kullback-Leibler divergence, including convexity, continuity, limits of σ-algebras, and the relation of the special order 0 to the Gaussian dichotomy and contiguity. We also show how to generalize the Pythagorean inequality to orders different from 1, and we extend the known equivalence between channel capacity and minimax redundancy to continuous channel inputs (for all orders) and present several other minimax results.}, bibtype = {article}, author = {van Erven, Tim and Harremoës, Peter}, doi = {10.1109/TIT.2014.2320500}, journal = {IEEE Transactions on Information Theory}, number = {7} }
@article{ title = {Quality relevant nonlinear batch process performance monitoring using a kernel based multiway non-Gaussian latent subspace projection approach}, type = {article}, year = {2014}, keywords = {Multidimensional mutual information,Non-Gaussian latent subspace projection,Nonlinear batch process,Nonlinear kernel feature space,Quality relevant batch process monitoring}, pages = {57-71}, volume = {24}, websites = {http://dx.doi.org/10.1016/j.jprocont.2013.10.017}, publisher = {Elsevier Ltd}, id = {6d99b0cb-eeeb-3515-ae97-8c296b96bcfc}, created = {2022-04-05T05:35:08.111Z}, file_attached = {false}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-05T05:35:08.111Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Multiway kernel partial least squares method (MKPLS) has recently been developed for monitoring the operational performance of nonlinear batch or semi-batch processes. It has strong capability to handle batch trajectories and nonlinear process dynamics, which cannot be effectively dealt with by traditional multiway partial least squares (MPLS) technique. However, MKPLS method may not be effective in capturing significant non-Gaussian features of batch processes because only the second-order statistics instead of higher-order statistics are taken into account in the underlying model. On the other hand, multiway kernel independent component analysis (MKICA) has been proposed for nonlinear batch process monitoring and fault detection. Different from MKPLS, MKICA can extract not only nonlinear but also non-Gaussian features through maximizing the higher-order statistic of negentropy instead of second-order statistic of covariance within the high-dimensional kernel space. Nevertheless, MKICA based process monitoring approaches may not be well suited in many batch processes because only process measurement variables are utilized while quality variables are not considered in the multivariate models. In this paper, a novel multiway kernel based quality relevant non-Gaussian latent subspace projection (MKQNGLSP) approach is proposed in order to monitor the operational performance of batch processes with nonlinear and non-Gaussian dynamics by combining measurement and quality variables. First, both process measurement and quality variables are projected onto high-dimensional nonlinear kernel feature spaces, respectively. Then, the multidimensional latent directions within kernel feature subspaces corresponding to measurement and quality variables are concurrently searched for so that the maximized mutual information between the measurement and quality spaces is obtained. The I2 and SPE monitoring indices within the extracted latent subspaces are further defined to capture batch process faults resulting in abnormal product quality. The proposed MKQNGLSP method is applied to a fed-batch penicillin fermentation process and the operational performance monitoring results demonstrate the superiority of the developed method as apposed to the MKPLS based process monitoring approach. © 2013 Elsevier Ltd.}, bibtype = {article}, author = {Mori, Junichi and Yu, Jie}, doi = {10.1016/j.jprocont.2013.10.017}, journal = {Journal of Process Control}, number = {1} }
@article{ title = {Conditional Generative Adversarial Nets}, type = {article}, year = {2014}, pages = {1-7}, websites = {http://arxiv.org/abs/1411.1784}, id = {a7f74767-cd09-3b6f-9485-9977902a7925}, created = {2022-09-08T10:49:09.764Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-10-03T13:31:11.839Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {034ae31f-a548-45de-8507-3cbbc9e326ad,7c1a14c1-2416-47a6-b987-cb497a6abb50}, private_publication = {false}, abstract = {Generative Adversarial Nets [8] were recently introduced as a novel way to train generative models. In this work we introduce the conditional version of generative adversarial nets, which can be constructed by simply feeding the data, y, we wish to condition on to both the generator and discriminator. We show that this model can generate MNIST digits conditioned on class labels. We also illustrate how this model could be used to learn a multi-modal model, and provide preliminary examples of an application to image tagging in which we demonstrate how this approach can generate descriptive tags which are not part of training labels.}, bibtype = {article}, author = {Mirza, Mehdi and Osindero, Simon} }
@article{ title = {Generative adversarial networks}, type = {article}, year = {2020}, pages = {139-144}, volume = {63}, id = {4f4ac255-d56e-3992-a9c6-3e9ec19d8002}, created = {2022-09-08T10:49:09.890Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-10-03T13:31:11.642Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {034ae31f-a548-45de-8507-3cbbc9e326ad,7c1a14c1-2416-47a6-b987-cb497a6abb50}, private_publication = {false}, abstract = {Generative adversarial networks are a kind of artificial intelligence algorithm designed to solve the generative modeling problem. The goal of a generative model is to study a collection of training examples and learn the probability distribution that generated them. Generative Adversarial Networks (GANs) are then able to generate more examples from the estimated probability distribution. Generative models based on deep learning are common, but GANs are among the most successful generative models (especially in terms of their ability to generate realistic high-resolution images). GANs have been successfully applied to a wide variety of tasks (mostly in research settings) but continue to present unique challenges and research opportunities because they are based on game theory while most other approaches to generative modeling are based on optimization.}, bibtype = {article}, author = {Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua}, doi = {10.1145/3422622}, journal = {Communications of the ACM}, number = {11} }
@article{ title = {Grasp moduli spaces and spherical harmonics}, type = {article}, year = {2014}, pages = {389-396}, publisher = {IEEE}, id = {b4c55d30-ab78-3eab-9703-5be4dc1c59bf}, created = {2023-04-24T07:38:01.266Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-04-24T15:41:55.500Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Pokorny2014}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143,f4d8f1ef-fdcb-4a5b-a626-6e2fea47fb6d}, private_publication = {false}, abstract = {In this work, we present a novel representation which enables a robot to reason about, transfer and optimize grasps on various objects by representing objects and grasps on them jointly in a common space. In our approach, objects are parametrized using smooth differentiable functions which are obtained from point cloud data via a spectral analysis. We show how, starting with point cloud data of various objects, one can utilize this space consisting of grasps and smooth surfaces in order to continuously deform various surface/grasp configurations with the goal of synthesizing force closed grasps on novel objects. We illustrate the resulting shape space for a collection of real world objects using multidimensional scaling and show that our formulation naturally enables us to use gradient ascent approaches to optimize and simultaneously deform a grasp from a known object towards a novel object.}, bibtype = {article}, author = {Pokorny, Florian T. and Bekiroglu, Yasemin and Kragic, Danica}, doi = {10.1109/ICRA.2014.6906886}, journal = {Proceedings - IEEE International Conference on Robotics and Automation} }
@article{ title = {Point cloud encoding for 3D building model retrieval}, type = {article}, year = {2014}, keywords = {3D model retrieval,Cyber city modeling,point cloud encoding}, pages = {337-345}, volume = {16}, publisher = {IEEE}, id = {002f0503-8d79-316c-ad7b-7dba302570fe}, created = {2023-05-03T13:16:38.931Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.252Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Chen2014}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {An increasing number of three-dimensional (3D) building models are being made available on Web-based model-sharing platforms. Motivated by the concept of data reuse, an encoding approach is proposed for 3D building model retrieval using point clouds acquired by airborne light detection and ranging (LiDAR) systems. To encode LiDAR point clouds with sparse, noisy, and incomplete sampling, we introduce a novel encoding scheme based on a set of low-frequency spherical harmonic basis functions. These functions provide compact representation and ease the encoding difficulty coming from inherent noises of point clouds. Additionally, a data filling and resampling technique is proposed to solve the aliasing problem caused by the sparse and incomplete sampling of point clouds. Qualitative and quantitative analyses of LiDAR data show a clear superiority of the proposed method over related methods. A cyber campus generated by retrieving 3D building models with airborne LiDAR point clouds demonstrates the feasibility of the proposed method. © 1999-2012 IEEE.}, bibtype = {article}, author = {Chen, Jyun Yuan and Lin, Chao Hung and Hsu, Po Chi and Chen, Chung Hao}, doi = {10.1109/TMM.2013.2286580}, journal = {IEEE Transactions on Multimedia}, number = {2} }
@inproceedings{ title = {Simulation of time-of-flight sensors using global illumination}, type = {inproceedings}, year = {2013}, id = {1df3bf65-7e8a-3bd7-af2f-34c93125e99b}, created = {2020-10-01T06:44:41.842Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-27T06:42:00.609Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47}, private_publication = {false}, abstract = {Time-of-Flight (ToF) cameras use specialized sensors and modulated infrared light to simultaneously obtain depth, amplitude and intensity images. Depth images from such cameras suffer from various errors which exhibit a more complex behavior than traditional intensity images. Of these errors, the phenomenon of multi-reflection or multipath interference poses the biggest challenge to researchers. It is caused by indirect light paths between camera and light source and is therefore dependent on scene geometry. While simulated data can be used for ground truth evaluation and whitebox testing, current simulators do not model multipath effects. The method we present is capable of simulating all scene-dependant effects by taking global illumination into consideration. This is accomplished by modifying a bidirectional path tracing algorithm such that it takes the time-dependent propagation of modulated light in a scene into consideration. Furthermore, by combination of the proposed method with a previous hardware simulator we are capable of reproducing all effects in ToF cameras. The system was validated both on test targets with known real Time of Flight camera responses as well as qualitatively on a more complex room scene. The simulator as well as the source code is available at http://hci.iwr.uni-heidelberg.de/Benchmarks/.}, bibtype = {inproceedings}, author = {Meister, S. and Nair, R. and Kondermann, D.}, doi = {10.2312/PE.VMV.VMV13.033-040}, booktitle = {18th International Workshop on Vision, Modeling and Visualization, VMV 2013} }
@article{ title = {Unsupervised Feature Learning for RGB-D Based Object Recognition}, type = {article}, year = {2013}, pages = {387-402}, id = {7dec2a5f-56f5-3d2f-a8ad-123beedbf770}, created = {2020-11-24T10:01:08.522Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-24T10:01:11.963Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {Recently introduced RGB-D cameras are capable of providing high quality synchronized videos of both color and depth. With its advanced sensing capabilities, this technology represents an opportunity to dramatically increase the capabilities of object recognition. It also raises the problem of developing expressive features for the color and depth channels of these sensors. In this paper we introduce hierarchical matching pursuit (HMP) for RGB-D data. HMP uses sparse coding to learn hierarchical feature representations from raw RGB-D data in an unsupervised way. Extensive experiments on various datasets indicate that the features learned with our approach enable superior object recognition results using linear support vector machines.}, bibtype = {article}, author = {Bo, Liefeng and Ren, Xiaofeng and Fox, Dieter}, doi = {10.1007/978-3-319-00065-7_27} }
@article{ title = {Combining object modeling and recognition for active scene exploration}, type = {article}, year = {2013}, pages = {2384-2391}, id = {61e8b213-3691-3595-bef4-4d35cebcd5a7}, created = {2021-02-09T17:05:46.883Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T17:05:51.923Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e7ec618f-fdb2-41d1-902a-30004eb3cde6}, private_publication = {false}, abstract = {Active scene exploration incorporates object recognition methods for analyzing a scene of partially known objects and exploration approaches for autonomous modeling of unknown parts. In this work, recognition, exploration, and planning methods are extended and combined in a single scene exploration system, enabling advanced techniques such as multi-view recognition from planned view positions and iterative recognition by integration of new objects from a scene. Here, a geometry based approach is used for recognition, i.e. matching objects from a database. Unknown objects are autonomously modeled and added to the recognition database. Next-Best-View planning is performed both for recognition and modeling. Moreover, 3D measurements are merged in a Probabilistic Voxel Space, which is utilized for planning collision free paths, minimal occlusion views, and verifying the poses of the recognized objects against all previous information. Experiments on an industrial robot with attached 3D sensors are shown for scenes with household and industrial objects. © 2013 IEEE.}, bibtype = {article}, author = {Kriegel, Simon and Brucker, Manuel and Marton, Zoltan Csaba and Bodenmuller, Tim and Suppa, Michael}, doi = {10.1109/IROS.2013.6696691}, journal = {IEEE International Conference on Intelligent Robots and Systems} }
@article{ title = {An efficient method for fully automatic 3D digitization of unknown objects}, type = {article}, year = {2013}, keywords = {3D digitization,Automatic scanning,Automation,Next Best View,View planning}, pages = {1152-1160}, volume = {64}, websites = {http://dx.doi.org/10.1016/j.compind.2013.04.005}, publisher = {Elsevier B.V.}, id = {29e8cf13-34c5-3fdc-a299-c9cc2d7d834a}, created = {2021-02-09T17:05:46.892Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:27.092Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e7ec618f-fdb2-41d1-902a-30004eb3cde6}, private_publication = {false}, abstract = {Our goal is to develop a complete and automatic scanning strategy with minimum prior information about the object shape. We aim to establish a methodology for the automation of the 3D digitization process. The paper presents a novel approach to determine the Next Best View (NBV) for an efficient reconstruction of highly accurate 3D models. Our method is based on the classification of the acquired surfaces into Well Visible and Barely Visible combined with a best view selection algorithm based on mean shift, which avoids unreachable positions. Our approach is applicable to all kinds of range sensors. To prove the efficiency and the robustness of our method, test objects are first scanned manually by experts in 3D digitization from the VECTEO company. The comparison of results between manual and automatic scanning shows that our method is very efficient and faster than trained experts. The 3D models of the different objects are obtained with a strongly reduced number of acquisitions while moving efficiently the ranging device. The obtained results prove the effectiveness and the versatility of our 3D reconstruction approach for industrial applications. © 2013 Elsevier B.V. All rights reserved.}, bibtype = {article}, author = {Khalfaoui, Souhaiel and Seulin, Ralph and Fougerolle, Yohan and Fofi, David}, doi = {10.1016/j.compind.2013.04.005}, journal = {Computers in Industry}, number = {9} }
@article{ title = {Model based training, detection and pose estimation of texture-less 3D objects in heavily cluttered scenes}, type = {article}, year = {2013}, pages = {548-562}, volume = {7724 LNCS}, id = {5cef53ab-d555-3ed1-a63f-342202941d68}, created = {2021-02-09T17:05:46.985Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T17:05:52.240Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {97a50287-04e8-4487-8537-7d1064b26657}, private_publication = {false}, abstract = {We propose a framework for automatic modeling, detection, and tracking of 3D objects with a Kinect. The detection part is mainly based on the recent template-based LINEMOD approach [1] for object detection. We show how to build the templates automatically from 3D models, and how to estimate the 6 degrees-of-freedom pose accurately and in real-time. The pose estimation and the color information allow us to check the detection hypotheses and improves the correct detection rate by 13% with respect to the original LINEMOD. These many improvements make our framework suitable for object manipulation in Robotics applications. Moreover we propose a new dataset made of 15 registered, 1100+ frame video sequences of 15 various objects for the evaluation of future competing methods. © 2013 Springer-Verlag.}, bibtype = {article}, author = {Hinterstoisser, Stefan and Lepetit, Vincent and Ilic, Slobodan and Holzer, Stefan and Bradski, Gary and Konolige, Kurt and Navab, Nassir}, doi = {10.1007/978-3-642-37331-2_42}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, number = {PART 1} }
@article{ title = {A Best Next View Selection Algorithm Incorporating a Quality Criterion}, type = {article}, year = {2013}, keywords = {3d scene reconstruction,best next views,quality criterion,sensor planning,sphere tessellation,voxelmap}, pages = {78.1-78.10}, id = {57e24a35-8326-37f5-bc22-0f5d948c03aa}, created = {2021-02-09T17:05:47.109Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-03T10:14:34.411Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {5439d198-93d5-4603-a7ce-201d423f231e,29a9a542-55a2-46d9-ba59-07497023a8e5}, private_publication = {false}, abstract = {This paper presents a method for solving the Best Next View problem. This problem arises while gathering range data for the purpose of building 3D models of objects. The novelty of our solution is the introduction of a quality criterion in addition to the visibility criterion used by previous researchers. This quality criterion aims at obtaining views that improve the overall range data quality of the imaged surfaces. Results demonstrate that this method selects views which generate reasonable volumetric models for convex, concave and curved objects.}, bibtype = {article}, author = {Massios, N. A. and Fisher, R. B.}, doi = {10.5244/c.12.78} }
@article{ title = {US008379018B2}, type = {article}, year = {2013}, volume = {2}, id = {7c48c6d9-11a6-3c7c-b903-6e32cb916be1}, created = {2021-04-14T07:42:10.251Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-15T08:24:37.978Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {ffa7aa64-dc15-4667-8778-6ff9b9800bbb}, private_publication = {false}, bibtype = {article}, author = {Joshi, Pushkar P. and Andrews, James L.}, number = {12} }
@article{ title = {Depth image filter for mixed and noisy pixel removal in RGB-D camera systems}, type = {article}, year = {2013}, keywords = {RGB-D camera,depth image filter,distance transform,mixed pixel}, pages = {681-689}, volume = {59}, id = {f556879f-ed89-3f57-9aaf-71dafaf1b39a}, created = {2021-04-15T07:14:48.197Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-15T07:14:52.502Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {The commercial RGB-D camera produces color images and their depth maps from a scene in real time. However, the active camera creates mixed depth data near the border of different objects, occasionally losing depth information of shiny and dark surfaces in the scene. Furthermore, noise is added to the depth map. In this paper, a new method is presented to resolve such mixed, lost, and noisy pixel problems of the RGB-D camera. In particular, mixed pixel areas are detected using common distance transform (CDT) values of color and depth pixels, and merged them to lost pixel regions. The merged regions are filled with neighboring depth information based on an edge-stopping convergence function; distance transform values of color edge pixels are used to form this function. In addition, a CDT-based joint multilateral filter (CDT-JMF) is used to remove noisy pixels. Experimental results show that the proposed method gives better performance than conventional hole filling methods and image filters. © 2013 IEEE.}, bibtype = {article}, author = {Kim, Sung Yeol and Kim, Manbae and Ho, Yo Sung}, doi = {10.1109/TCE.2013.6626256}, journal = {IEEE Transactions on Consumer Electronics}, number = {3} }
@article{ title = {On the difficulty of training recurrent neural networks}, type = {article}, year = {2013}, pages = {2347-2355}, id = {a3abc790-7a8a-3a56-90b0-cde9dac02ee4}, created = {2021-07-12T14:15:35.356Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:17:13.297Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {There are two widely known issues with properly training recurrent neural networks, the vanishing and the exploding gradient problems detailed in Bengio et al. (1994). In this paper we attempt to improve the understanding of the underlying issues by exploring these problems from an analytical, a geometric and a dynamical systems perspective. Our analysis is used to justify a simple yet effective solution. We propose a gradient norm clipping strategy to deal with exploding gradients and a soft constraint for the vanishing gradients problem. We validate empirically our hypothesis and proposed solutions in the experimental section. Copyright 2013 by the author(s).}, bibtype = {article}, author = {Pascanu, Razvan and Mikolov, Tomas and Bengio, Yoshua}, journal = {30th International Conference on Machine Learning, ICML 2013}, number = {PART 3} }
@article{ title = {On the importance of initialization and momentum in deep learning}, type = {article}, year = {2013}, pages = {2176-2184}, id = {9cfc18e8-7e82-343f-a27f-96bd96409988}, created = {2021-07-12T14:15:35.413Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:17:10.846Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {Deep and recurrent neural networks (DNNs and RNNs respectively) are powerful models that were considered to be almost impossible to train using stochastic gradient descent with momentum. In this paper, we show that when stochastic gradient descent with momentum uses a well-designed random initialization and a particular type of slowly increasing schedule for the momentum parameter, it can train both DNNs and RNNs (on datasets with long-term dependencies) to levels of performance that were previously achievable only with Hessian-Free optimization. We find that both the initialization and the momentum are crucial since poorly initialized networks cannot be trained with momentum and well-initialized networks perform markedly worse when the momentum is absent or poorly tuned. Our success training these models suggests that previous attempts to train deep and recurrent neural networks from random initializations have likely failed due to poor initialization schemes. Furthermore, carefully tuned momentum methods suffice for dealing with the curvature issues in deep and recurrent network training objectives without the need for sophisticated second-order methods. Copyright 2013 by the author(s).}, bibtype = {article}, author = {Sutskever, Ilya and Martens, James and Dahl, George and Hinton, Geoffrey}, journal = {30th International Conference on Machine Learning, ICML 2013}, number = {PART 3} }
@article{ title = {Generating Sequences With Recurrent Neural Networks}, type = {article}, year = {2013}, pages = {1-43}, websites = {http://arxiv.org/abs/1308.0850}, id = {5bb177b5-1292-3b75-aaf0-b51ad23d35eb}, created = {2021-07-12T14:15:35.512Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:17:17.286Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {This paper shows how Long Short-term Memory recurrent neural networks can be used to generate complex sequences with long-range structure, simply by predicting one data point at a time. The approach is demonstrated for text (where the data are discrete) and online handwriting (where the data are real-valued). It is then extended to handwriting synthesis by allowing the network to condition its predictions on a text sequence. The resulting system is able to generate highly realistic cursive handwriting in a wide variety of styles.}, bibtype = {article}, author = {Graves, Alex} }
@article{ title = {Representation learning: A review and new perspectives}, type = {article}, year = {2013}, keywords = {Boltzmann machine,Deep learning,autoencoder,feature learning,neural nets,representation learning,unsupervised learning}, pages = {1798-1828}, volume = {35}, id = {7c6ec643-6263-3d29-a460-33bbdada9f53}, created = {2022-03-23T06:17:59.434Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:10.554Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Bengio2013}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, abstract = {The success of machine learning algorithms generally depends on data representation, and we hypothesize that this is because different representations can entangle and hide more or less the different explanatory factors of variation behind the data. Although specific domain knowledge can be used to help design representations, learning with generic priors can also be used, and the quest for AI is motivating the design of more powerful representation-learning algorithms implementing such priors. This paper reviews recent work in the area of unsupervised feature learning and deep learning, covering advances in probabilistic models, autoencoders, manifold learning, and deep networks. This motivates longer term unanswered questions about the appropriate objectives for learning good representations, for computing representations (i.e., inference), and the geometrical connections between representation learning, density estimation, and manifold learning. © 1979-2012 IEEE.}, bibtype = {article}, author = {Bengio, Yoshua and Courville, Aaron and Vincent, Pascal}, doi = {10.1109/TPAMI.2013.50}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, number = {8} }
@article{ title = {Sources of Uncertainty in Intuitive Physics}, type = {article}, year = {2013}, keywords = {Intuitive physics,Probabilistic inference,Stochastic simulation,Uncertainty}, pages = {185-199}, volume = {5}, websites = {https://onlinelibrary.wiley.com/doi/abs/10.1111/tops.12009}, id = {7537a427-4ac7-3754-a744-40b8d40ceaf0}, created = {2022-03-28T09:45:04.067Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:06:19.606Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {smithSourcesUncertaintyIntuitive2013}, source_type = {article}, notes = {\_eprint: https://onlinelibrary.wiley.com/doi/pdf/10.1111/tops.12009}, private_publication = {false}, abstract = {Recent work suggests that people predict how objects interact in a manner consistent with Newtonian physics, but with additional uncertainty. However, the sources of uncertainty have not been examined. In this study, we measure perceptual noise in initial conditions and stochasticity in the physical model used to make predictions. Participants predicted the trajectory of a moving object through occluded motion and bounces, and we compared their behavior to an ideal observer model. We found that human judgments cannot be captured by simple heuristics and must incorporate noisy dynamics. Moreover, these judgments are biased consistently with a prior expectation on object destinations, suggesting that people use simple expectations about outcomes to compensate for uncertainty about their physical models.}, bibtype = {article}, author = {Smith, Kevin A and Vul, Edward}, doi = {10.1111/tops.12009}, journal = {Topics in Cognitive Science}, number = {1} }
@article{ title = {Sparse localized deformation components}, type = {article}, year = {2013}, keywords = {data-driven animation,dimensionality reduction,editing captured animations,mesh deformation}, pages = {179:1--179:10}, volume = {32}, websites = {https://doi.org/10.1145/2508363.2508417}, month = {11}, id = {83e45e54-3ffd-37b9-a069-7afbcda54bce}, created = {2022-03-28T09:45:04.851Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:08:37.466Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {neumannSparseLocalizedDeformation2013}, source_type = {article}, private_publication = {false}, abstract = {We propose a method that extracts sparse and spatially localized deformation modes from an animated mesh sequence. To this end, we propose a new way to extend the theory of sparse matrix decompositions to 3D mesh sequence processing, and further contribute with an automatic way to ensure spatial locality of the decomposition in a new optimization framework. The extracted dimensions often have an intuitive and clear interpretable meaning. Our method optionally accepts user-constraints to guide the process of discovering the underlying latent deformation space. The capabilities of our efficient, versatile, and easy-to-implement method are extensively demonstrated on a variety of data sets and application contexts. We demonstrate its power for user friendly intuitive editing of captured mesh animations, such as faces, full body motion, cloth animations, and muscle deformations. We further show its benefit for statistical geometry processing and biomechanically meaningful animation editing. It is further shown qualitatively and quantitatively that our method outperforms other unsupervised decomposition methods and other animation parameterization approaches in the above use cases.}, bibtype = {article}, author = {Neumann, Thomas and Varanasi, Kiran and Wenger, Stephan and Wacker, Markus and Magnor, Marcus and Theobalt, Christian}, doi = {10.1145/2508363.2508417}, journal = {ACM Transactions on Graphics}, number = {6} }
@article{ title = {Learning part-based templates from large collections of 3D shapes}, type = {article}, year = {2013}, keywords = {correspondence,model collections,part-based template,segmentation,shape analysis}, pages = {70:1--70:12}, volume = {32}, websites = {https://doi.org/10.1145/2461912.2461933}, month = {7}, id = {507cfdb5-363e-3b1e-b72f-24b7e0dec839}, created = {2022-03-28T09:45:05.487Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:22:11.197Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {kimLearningPartbasedTemplates2013}, source_type = {article}, private_publication = {false}, abstract = {As large repositories of 3D shape collections continue to grow, understanding the data, especially encoding the inter-model similarity and their variations, is of central importance. For example, many data-driven approaches now rely on access to semantic segmentation information, accurate inter-model point-to-point correspondence, and deformation models that characterize the model collections. Existing approaches, however, are either supervised requiring manual labeling; or employ super-linear matching algorithms and thus are unsuited for analyzing large collections spanning many thousands of models. We propose an automatic algorithm that starts with an initial template model and then jointly optimizes for part segmentation, point-to-point surface correspondence, and a compact deformation model to best explain the input model collection. As output, the algorithm produces a set of probabilistic part-based templates that groups the original models into clusters of models capturing their styles and variations. We evaluate our algorithm on several standard datasets and demonstrate its scalability by analyzing much larger collections of up to thousands of shapes.}, bibtype = {article}, author = {Kim, Vladimir G and Li, Wilmot and Mitra, Niloy J and Chaudhuri, Siddhartha and DiVerdi, Stephen and Funkhouser, Thomas}, doi = {10.1145/2461912.2461933}, journal = {ACM Transactions on Graphics}, number = {4} }
@article{ title = {A Robust Method for Rotation Estimation Using Spherical Harmonics Representation}, type = {article}, year = {2013}, pages = {2306-2316}, volume = {22}, publisher = {IEEE}, id = {60c76829-1bab-302e-8d11-fd21ddd8a538}, created = {2023-04-24T15:41:54.281Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.559Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Althloothi2013}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143,23b9a2dc-9e79-457b-ab28-4640ef7aa6a3}, private_publication = {false}, bibtype = {article}, author = {Althloothi, Salah and Mahoor, Mohammad H. and Voyles, Richard M.}, number = {6} }
@article{ title = {Inverse rendering of faces with a 3D morphable model}, type = {article}, year = {2013}, keywords = {Inverse rendering,face shape,texture and illumination analysis}, pages = {1080-1093}, volume = {35}, publisher = {IEEE}, id = {c2211f1d-218b-32db-a3c5-5944c87cd066}, created = {2023-05-03T13:16:39.399Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.916Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Aldrian2013}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {In this paper, we present a complete framework to inverse render faces with a 3D Morphable Model (3DMM). By decomposing the image formation process into geometric and photometric parts, we are able to state the problem as a multilinear system which can be solved accurately and efficiently. As we treat each contribution as independent, the objective function is convex in the parameters and a global solution is guaranteed. We start by recovering 3D shape using a novel algorithm which incorporates generalization error of the model obtained from empirical measurements. We then describe two methods to recover facial texture, diffuse lighting, specular reflectance, and camera properties from a single image. The methods make increasingly weak assumptions and can be solved in a linear fashion. We evaluate our findings on a publicly available database, where we are able to outperform an existing state-of-the-art algorithm. We demonstrate the usability of the recovered parameters in a recognition experiment conducted on the CMU-PIE database. © 1979-2012 IEEE.}, bibtype = {article}, author = {Aldrian, Oswald and Smith, William A.P.}, doi = {10.1109/TPAMI.2012.206}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, number = {5} }
@article{ title = {A new invariant descriptor for action recognition based on spherical harmonics}, type = {article}, year = {2013}, keywords = {Action recognition,Space time volume (STV),Spherical harmonics}, pages = {507-518}, volume = {16}, id = {e2535bbe-e59a-3332-a875-e4e6df887961}, created = {2023-05-03T13:16:40.053Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:25.937Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Razzaghi2013}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {The aim of this paper is to introduce a new descriptor for the spatio-temporal volume (STV). Human motion is completely represented by STV (action volume) which is constructed over successive frames by stacking human silhouettes in consecutive frames. Action volume comprehensively contains spatial and temporal information about an action. The main contribution of this paper is to propose a new affine invariant action volume descriptor based on a function of spherical harmonic coefficients. This means, it is invariant under rotation, non-uniform scaling and translation. In the 3D shape analysis literature, there have been a few attempts to use coefficients of spherical harmonics to describe a 3D shape. However, those descriptors are not affine invariant and they are only rotation invariant. In addition, the proposed approach employs a parametric form of spherical harmonics that handles genus zero surfaces regardless of whether they are stellar or not. Another contribution of this paper is the way that action volume is constructed. We applied the proposed descriptor to the KTH, Weizmann, IXMAS and Robust datasets and compared the performance of our algorithm to competing methods available in the literature. The results of our experiments show that our method has a comparable performance to the most successful and recent existing algorithms. © 2012 Springer-Verlag London Limited.}, bibtype = {article}, author = {Razzaghi, Parvin and Palhang, Maziar and Gheissari, Niloofar}, doi = {10.1007/s10044-012-0274-x}, journal = {Pattern Analysis and Applications}, number = {4} }
@article{ title = {Realistic Simulation of 3D Cloud}, type = {article}, year = {2013}, keywords = {Cloud rendering,Coupled map lattice (CML),Frequency domain,Natural scene,Spherical harmonics,Volume rendering}, pages = {331-340}, volume = {12}, id = {2db67af8-c6d7-3930-8fd0-ef5fbf14435c}, created = {2023-05-03T13:16:40.242Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:25.781Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Qiu2013}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {The digital creation of cloud is important for many applications in computer graphics, including outdoor simulations and the digital rendering of atmospheric effects. Unfortunately, it is still difficult to simulate realistic cloud with interactive frame rates due to its peculiar microstructures and complex physical process of formation. Realistic simulation of cloud turns to be one of the most challenging topics in computer graphics. In this paper, we present a method for simulating 3D cloud. The Coupled Map Lattice (CML) is adopted for the modeling of cloud, and the simulation of light scattering in clouds is achieved by using a series of spherical harmonics and spherical harmonic coefficients that represent incident-light distribution. A frequency domain volume-rendering algorithm combined with spherical harmonics is applied to implement fast rendering of cloud scenes. Experiments demonstrate that our method facilitates computing efficiency, while yielding realistic visual quality.}, bibtype = {article}, author = {Qiu, Hang and Chen, Lei Ting and Qiu, Guo Ping and Yang, Hao}, journal = {WSEAS Transactions on Computers}, number = {8} }
@article{ title = {Noise modelling and uncertainty propagation for TOF sensors}, type = {article}, year = {2012}, pages = {476-485}, volume = {7585 LNCS}, id = {e787c01f-b827-3db5-b6eb-a2c081cbfc14}, created = {2020-11-05T09:10:48.222Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-05T09:17:40.184Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47}, private_publication = {false}, abstract = {Time-of-Flight (TOF) cameras are active real time depth sensors. One issue of TOF sensors is measurement noise. In this paper, we present a method for providing the uncertainty associated to 3D TOF measurements based on noise modelling. Measurement uncertainty is the combination of pixel detection error and sensor noise. First, a detailed noise characterization is presented. Then, a continuous model which gives the noise's standard deviation for each depth-pixel is proposed. Finally, a closed-form approximation of 3D uncertainty from 2D pixel detection error is presented. An applicative example is provided that shows the use of our 3D uncertainty modelling on real data. © 2012 Springer-Verlag.}, bibtype = {article}, author = {Belhedi, Amira and Bartoli, Adrien and Bourgeois, Steve and Hamrouni, Kamel and Sayd, Patrick and Gay-Bellile, Vincent}, doi = {10.1007/978-3-642-33885-4_48}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, number = {PART 3} }
@article{ title = {Fast and robust normal estimation for point clouds with sharp features}, type = {article}, year = {2012}, pages = {1765-1774}, volume = {31}, id = {fee31116-dc03-394f-884f-debce2e0ae18}, created = {2020-11-13T11:34:37.174Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-13T11:35:12.624Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {This paper presents a new method for estimating normals on unorganized point clouds that preserves sharp features. It is based on a robust version of the Randomized Hough Transform (RHT). We consider the filled Hough transform accumulator as an image of the discrete probability distribution of possible normals. The normals we estimate correspond to the maximum of this distribution. We use a fixed-size accumulator for speed, statistical exploration bounds for robustness, and randomized accumulators to prevent discretization effects. We also propose various sampling strategies to deal with anisotropy, as produced by laser scans due to differences of incidence. Our experiments show that our approach offers an ideal compromise between precision, speed, and robustness: It is at least as precise and noise-resistant as state-of-the-art methods that preserve sharp features, while being almost an order of magnitude faster. Besides, it can handle anisotropy with minor speed and precision losses.}, bibtype = {article}, author = {Boulch, Alexandre and Marlet, Renaud}, doi = {10.1111/j.1467-8659.2012.03181.x}, journal = {Eurographics Symposium on Geometry Processing}, number = {5} }
@article{ title = {Next-best-scan planning for autonomous 3D modeling}, type = {article}, year = {2012}, pages = {2850-2856}, id = {702ffb0a-5e40-3bd1-8d8e-6b77e8db29ec}, created = {2021-01-25T14:53:33.637Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-21T11:27:39.914Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kriegel2012}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f,5439d198-93d5-4603-a7ce-201d423f231e,97a50287-04e8-4487-8537-7d1064b26657}, private_publication = {false}, abstract = {We present a next-best-scan (NBS) planning approach for autonomous 3D modeling. The system successively completes a 3D model from complex shaped objects by iteratively selecting a NBS based on previously acquired data. For this purpose, new range data is accumulated in-the-loop into a 3D surface (streaming reconstruction) and new continuous scan paths along the estimated surface trend are generated. Further, the space around the object is explored using a probabilistic exploration approach that considers sensor uncertainty. This allows for collision free path planning in order to completely scan unknown objects. For each scan path, the expected information gain is determined and the best path is selected as NBS. The presented NBS approach is tested with a laser striper system, attached to an industrial robot. The results are compared to state-of-the-art next-best-view methods. Our results show promising performance with respect to completeness, quality and scan time. © 2012 IEEE.}, bibtype = {article}, author = {Kriegel, Simon and Rink, Christian and Bodenmuller, Tim and Narr, Alexander and Suppa, Michael and Hirzinger, Gerd}, doi = {10.1109/IROS.2012.6385624}, journal = {IEEE International Conference on Intelligent Robots and Systems} }
@article{ title = {An autonomous six-DOF eye-in-hand system for in situ 3D object modeling}, type = {article}, year = {2012}, keywords = {Path planning for manipulators,range sensing}, pages = {82-100}, volume = {31}, id = {0a62bc3e-8f8c-3636-8959-9c26b53cdda2}, created = {2021-02-09T17:05:46.857Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T17:05:51.100Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e4a1ea0f-69ae-4053-94cc-503201fc6c67,97a50287-04e8-4487-8537-7d1064b26657}, private_publication = {false}, abstract = {We present an integrated and fully autonomous eye-in-hand system for 3D object modeling. The system hardware consists of a laser range scanner mounted on a six-DOF manipulator arm and the task is to autonomously build a 3D model of an object in situ where the object may not be moved and must be scanned in its original location. Our system assumes no knowledge of object shape or geometry other than that it is within a bounding box whose location and size are known a priori, and, furthermore, the environment is unknown. The overall planner integrates the three main algorithms in the system: one that finds the next best view (NBV) for modeling the object; one that finds the NBV for exploration, i.e. exploring the environment, so the arm can move to the modeling view pose; and finally a sensor-based path planner, that is able to find a collision-free path to the view configuration determined by either of the the two view planners. Our modeling NBV algorithm efficiently searches the five-dimensional view space to determine the best modeling viewpoint, while considering key constraints such as field of view (FOV), overlap, and occlusion. If the determined viewpoint is reachable, the sensor-based path planner determines a collision-free path to move the manipulator to the desired view configuration, and a scan of the object is taken. Since the workspace is initially unknown, in some phases, the exploration view planner is used to increase information about the reachability and also the status of the modeling view configurations, since the view configuration may lie in an unknown workspace. This is repeated until the object modeling is complete or the planner deems that no further progress can be made, and the system stops. We have implemented the system with a six-DOF powercube arm and a wrist mounted Hokuyo URG-04LX laser scanner. Our results show that the system is able to autonomously build a 3D model of an object in situ in an unknown environment. © SAGE Publications 2011.}, bibtype = {article}, author = {Torabi, Liila and Gupta, Kamal}, doi = {10.1177/0278364911425836}, journal = {International Journal of Robotics Research}, number = {1} }
@article{ title = {Depth image enhancement for Kinect using region growing and bilateral filter}, type = {article}, year = {2012}, pages = {3070-3073}, id = {22582e11-029f-3165-98f8-45129af0ab66}, created = {2021-04-15T07:14:48.196Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-15T07:14:53.524Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {Microsoft's Kinect as a recent 3D sensor has attracted considerable research attention in the fields of computer vision and pattern recognition. But its depth image suffers from the problem of poor accuracy caused by invalid pixels, noise and unmatched edges. In this paper, an efficient approach is proposed to improve the quality of Kinect's depth image. Using its corresponding color image, the pixels with wrong depth values are detected and removed using a region growing method. To accurately estimate the values of invalid pixels, a joint bilateral filter is used to fill the holes. Considering the special noise property of Kinect sensor, an adaptive bilateral filter is proposed to effectively reduce the noise of the depth image. Experimental results show that the proposed method significantly improves the quality of depth image by successfully filling the holes, eliminating the unmatched edges and reducing the noise. © 2012 ICPR Org Committee.}, bibtype = {article}, author = {Chen, Li and Lin, Hui and Li, Shutao}, journal = {Proceedings - International Conference on Pattern Recognition}, number = {Icpr} }
@article{ title = {Methods for depth-map filtering in view-plus-depth 3D video representation}, type = {article}, year = {2012}, pages = {1-21}, volume = {2012}, id = {31124f66-c207-3cef-a84c-282159643d41}, created = {2021-04-15T07:21:24.175Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-15T07:21:27.075Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {View-plus-depth is a scene representation format where each pixel of a color image or video frame is augmented by per-pixel depth represented as gray-scale image (map). In the representation, the quality of the depth map plays a crucial role as it determines the quality of the rendered views. Among the artifacts in the received depth map, the compression artifacts are usually most pronounced and considered most annoying. In this article, we study the problem of post-processing of depth maps degraded by improper estimation or by block-transformbased compression. A number of post-filtering methods are studied, modified and compared for their applicability to the task of depth map restoration and post-filtering. The methods range from simple and trivial Gaussian smoothing, to in-loop deblocking filter standardized in H.264 video coding standard, to more comprehensive methods which utilize structural and color information from the accompanying color image frame. The latter group contains our modification of the powerful local polynomial approximation, the popular bilateral filter, and an extension of it, originally suggested for depth super-resolution. We further modify this latter approach by developing an efficient implementation of it. We present experimental results demonstrating high-quality filtered depth maps and offering practitioners options for highest-quality or better efficiency. © 2012 Smirnov et al.}, bibtype = {article}, author = {Smirnov, Sergey and Gotchev, Atanas and Egiazarian, Karen}, doi = {10.1186/1687-6180-2012-25}, journal = {Eurasip Journal on Advances in Signal Processing}, number = {1} }
@article{ title = {US20120330447A1}, type = {article}, year = {2012}, volume = {1}, id = {21bba4a4-610c-3af9-a552-ce67996937e0}, created = {2021-04-16T05:21:10.929Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-16T05:21:15.104Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {ffa7aa64-dc15-4667-8778-6ff9b9800bbb}, private_publication = {false}, bibtype = {article}, author = {Thomas, Paul and Walker, Bruce}, number = {19} }
@article{ title = {Functional maps: A flexible representation of maps between shapes}, type = {article}, year = {2012}, keywords = {Correspondence,Representation,Shape matching}, volume = {31}, id = {2749ddd8-a847-306c-bbea-13d04eb55101}, created = {2021-08-28T19:32:57.492Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-29T21:49:17.210Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {We present a novel representation of maps between pairs of shapes that allows for efficient inference and manipulation. Key to our approach is a generalization of the notion of map that puts in correspondence real-valued functions rather than points on the shapes. By choosing a multi-scale basis for the function space on each shape, such as the eigenfunctions of its Laplace-Beltrami operator, we obtain a representation of a map that is very compact, yet fully suitable for global inference. Perhaps more remarkably, most natural constraints on a map, such as descriptor preservation, landmark correspondences, part preservation and operator commutativity become linear in this formulation. Moreover, the representation naturally supports certain algebraic operations such as map sum, difference and composition, and enables a number of applications, such as function or annotation transfer without establishing pointto-point correspondences. We exploit these properties to devise an efficient shape matching method, at the core of which is a single linear solve. The new method achieves state-of-the-art results on an isometric shape matching benchmark. We also show how this representation can be used to improve the quality of maps produced by existing shape matching methods, and illustrate its usefulness in segmentation transfer and joint analysis of shape collections. © 2012 ACM 0730-0301/2012/08-ART30.}, bibtype = {article}, author = {Ovsjanikov, Maks and Ben-Chen, Mirela and Solomon, Justin and Butscher, Adrian and Guibas, Leonidas}, doi = {10.1145/2185520.2185526}, journal = {ACM Transactions on Graphics}, number = {4} }
@article{ title = {Reeb graph computation through spectral clustering}, type = {article}, year = {2012}, pages = {017209}, volume = {51}, id = {8aa2b633-5f69-3bd7-9476-134abb4f6241}, created = {2022-01-14T16:04:11.930Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:04:20.175Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Ma, Teng}, doi = {10.1117/1.oe.51.1.017209}, journal = {Optical Engineering}, number = {1} }
@article{ title = {Real-time compression of point cloud streams}, type = {article}, year = {2012}, pages = {778-785}, publisher = {IEEE}, id = {610f582e-388d-3396-ac21-a3af32f79fad}, created = {2022-03-02T07:02:50.320Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-02T07:02:55.339Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {84eaadea-8864-4baf-9a7a-b5a2f5b96449}, private_publication = {false}, abstract = {We present a novel lossy compression approach for point cloud streams which exploits spatial and temporal redundancy within the point data. Our proposed compression framework can handle general point cloud streams of arbitrary and varying size, point order and point density. Furthermore, it allows for controlling coding complexity and coding precision. To compress the point clouds, we perform a spatial decomposition based on octree data structures. Additionally, we present a technique for comparing the octree data structures of consecutive point clouds. By encoding their structural differences, we can successively extend the point clouds at the decoder. In this way, we are able to detect and remove temporal redundancy from the point cloud data stream. Our experimental results show a strong compression performance of a ratio of 14 at 1 mm coordinate precision and up to 40 at a coordinate precision of 9 mm. © 2012 IEEE.}, bibtype = {article}, author = {Kammerl, Julius and Blodow, Nico and Rusu, Radu Bogdan and Gedikli, Suat and Beetz, Michael and Steinbach, Eckehard}, doi = {10.1109/ICRA.2012.6224647}, journal = {Proceedings - IEEE International Conference on Robotics and Automation} }
@article{ title = {Active co-analysis of a set of shapes}, type = {article}, year = {2012}, keywords = {active learning,semi-supervised learning}, pages = {165:1--165:10}, volume = {31}, websites = {https://doi.org/10.1145/2366145.2366184}, month = {11}, id = {a39e4455-86a0-3394-a3d7-c2e697e56a13}, created = {2022-03-28T09:45:04.972Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:21:39.353Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {wangActiveCoanalysisSet2012}, source_type = {article}, private_publication = {false}, abstract = {Unsupervised co-analysis of a set of shapes is a difficult problem since the geometry of the shapes alone cannot always fully describe the semantics of the shape parts. In this paper, we propose a semi-supervised learning method where the user actively assists in the co-analysis by iteratively providing inputs that progressively constrain the system. We introduce a novel constrained clustering method based on a spring system which embeds elements to better respect their inter-distances in feature space together with the user-given set of constraints. We also present an active learning method that suggests to the user where his input is likely to be the most effective in refining the results. We show that each single pair of constraints affects many relations across the set. Thus, the method requires only a sparse set of constraints to quickly converge toward a consistent and error-free semantic labeling of the set.}, bibtype = {article}, author = {Wang, Yunhai and Asafi, Shmulik and van Kaick, Oliver and Zhang, Hao and Cohen-Or, Daniel and Chen, Baoquan}, doi = {10.1145/2366145.2366184}, journal = {ACM Transactions on Graphics}, number = {6} }
@article{ title = {UCSC - Spherical Harmonics Work Sheet}, type = {article}, year = {2012}, pages = {1-14}, volume = {1}, websites = {http://scipp.ucsc.edu/~haber/ph116C/SphericalHarmonics_12.pdf}, id = {b2cc8399-0a86-3643-9670-2b939e54635a}, created = {2022-10-10T13:41:15.043Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-10-10T13:41:24.307Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {8410cff5-f764-42b2-a9b5-ead8d2dee5c8}, private_publication = {false}, bibtype = {article}, author = {ucsc.edu}, number = {5} }
@article{ title = {Verification of multi-view point-cloud registration for spherical harmonic cross-correlation}, type = {article}, year = {2012}, keywords = {3D imaging,point cloud,registration,rotation verification,spherical harmonics}, pages = {358-363}, id = {9bf81ae3-7fe1-3fcc-85d1-8ab8d5731c51}, created = {2023-04-24T07:38:01.272Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-04-24T15:41:55.325Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Larkins2012}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143,f4d8f1ef-fdcb-4a5b-a626-6e2fea47fb6d}, private_publication = {false}, abstract = {Spherical harmonic cross-correlation is a robust registration algorithm that brings two point-clouds of the same scene into coarse rotational alignment. The found rotation however may not give the desired alignment, as misalignments can occur if there is not enough overlap between point-clouds, or if they contain a form of symmetry. We propose a verification method whose purpose is to determine if registration has failed for a priori unknown registration. The rotational transformation between multiple clouds must satisfy internal consistency, namely multiple rotational transformations are transitive. The rotation verification is performed using triplets of images, which are cross-referenced with each other to classify rotations individually. Testing is performed on a dataset of a priori known registrations. It is found that when the number of images or the percentage of correct rotations is increased, the number of correct rotation classifications improves. Even when tested with only four images and a correct rotation percentage of 17%, the rotation verification is still considered a viable method for classifying rotations. Spherical harmonic cross-correlation is benefited by rotation verification as it provides an additional approach for checking whether found rotations are correct. © 2012 ACM.}, bibtype = {article}, author = {Larkins, Robert L. and Cree, Michael J. and Dorrington, Adrian A.}, doi = {10.1145/2425836.2425906}, journal = {ACM International Conference Proceeding Series} }
@article{ title = {Point-based manifold harmonics}, type = {article}, year = {2012}, keywords = {Laplace-Beltrami operator,Point-sampled surface,eigenfunction}, pages = {1693-1703}, volume = {18}, publisher = {IEEE}, id = {0ffa36ab-dcbb-3b6c-bdfb-71968f60d6ef}, created = {2023-05-03T13:16:40.356Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:25.763Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Liu2012}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {This paper proposes an algorithm to build a set of orthogonal Point-Based Manifold Harmonic Bases (PB-MHB) for spectral analysis over point-sampled manifold surfaces. To ensure that PB-MHB are orthogonal to each other, it is necessary to have symmetrizable discrete Laplace-Beltrami Operator (LBO) over the surfaces. Existing converging discrete LBO for point clouds, as proposed by Belkin et al., is not guaranteed to be symmetrizable. We build a new point-wisely discrete LBO over the point-sampled surface that is guaranteed to be symmetrizable, and prove its convergence. By solving the eigen problem related to the new operator, we define a set of orthogonal bases over the point cloud. Experiments show that the new operator is converging better than other symmetrizable discrete Laplacian operators (such as graph Laplacian) defined on point-sampled surfaces, and can provide orthogonal bases for further spectral geometric analysis and processing tasks. © 1995-2012 IEEE.}, bibtype = {article}, author = {Liu, Yang and Prabhakaran, Balakrishnan and Guo, Xiaohu}, doi = {10.1109/TVCG.2011.152}, journal = {IEEE Transactions on Visualization and Computer Graphics}, number = {10} }
@article{ title = {Consolidation of multiple depth maps}, type = {article}, year = {2011}, pages = {1120-1126}, id = {b17f4665-d265-3cda-8aff-7f223afcc04a}, created = {2021-04-26T05:50:56.289Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-26T05:51:00.680Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {Consolidation of point clouds, including denoising, outlier removal and normal estimation, is an important pre-processing step for surface reconstruction techniques. We present a consolidation framework specialized on point clouds created by multiple frames of a depth camera. An adaptive view-dependent locally optimal projection operator denoises multiple depth maps while keeping their structure in two-dimensional grids. Depth cameras produce a systematic variation of noise scales along the depth axis. Adapting to different noise scales allows to remove noise in the point cloud and preserve well-defined details at the same time. Our framework provides additional consolidation steps for depth maps like normal estimation and outlier removal. We show how knowledge about the distribution of noise in the input data can be effectively used for improving point clouds. © 2011 IEEE.}, bibtype = {article}, author = {Reisner-Kollmann, Irene and Maierhofer, Stefan}, doi = {10.1109/ICCVW.2011.6130375}, journal = {Proceedings of the IEEE International Conference on Computer Vision}, number = {November 2011} }
@article{ title = {Wavelets on graphs via spectral graph theory}, type = {article}, year = {2011}, keywords = {Graph theory,Overcomplete wavelet frames,Spectral graph theory,Wavelets}, pages = {129-150}, volume = {30}, websites = {http://dx.doi.org/10.1016/j.acha.2010.04.005}, publisher = {Elsevier Inc.}, id = {11956157-79b1-325c-a50c-3b65815e3fb7}, created = {2021-08-20T07:55:13.973Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-28T11:02:59.241Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Hammond2011}, private_publication = {false}, abstract = {We propose a novel method for constructing wavelet transforms of functions defined on the vertices of an arbitrary finite weighted graph. Our approach is based on defining scaling using the graph analogue of the Fourier domain, namely the spectral decomposition of the discrete graph Laplacian L. Given a wavelet generating kernel g and a scale parameter t, we define the scaled wavelet operator $T_g^t = g(tL)$. The spectral graph wavelets are then formed by localizing this operator by applying it to an indicator function. Subject to an admissibility condition on g, this procedure defines an invertible transform. We explore the localization properties of the wavelets in the limit of fine scales. Additionally, we present a fast Chebyshev polynomial approximation algorithm for computing the transform that avoids the need for diagonalizing L. We highlight potential applications of the transform through examples of wavelets on graphs corresponding to a variety of different problem domains. © 2010 Elsevier Inc. All rights reserved.}, bibtype = {article}, author = {Hammond, David K. and Vandergheynst, Pierre and Gribonval, Rémi}, doi = {10.1016/j.acha.2010.04.005}, journal = {Applied and Computational Harmonic Analysis}, number = {2} }
@article{ title = {Shape google: Geometric words and expressions for invariant shape retrieval}, type = {article}, year = {2011}, volume = {30}, id = {0e6f2c7b-a6b2-3bb8-b787-8a0e9f311767}, created = {2021-08-28T19:32:57.592Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-28T19:33:09.076Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {The computer vision and pattern recognition communities have recently witnessed a surge of feature-based methods in object recognition and image retrieval applications. These methods allow representing images as collections of "visual words" and treat them using text search approaches following the "bag of features" paradigm. In this article, we explore analogous approaches in the 3D world applied to the problem of nonrigid shape retrieval in large databases. Using multiscale diffusion heat kernels as "geometric words," we construct compact and informative shape descriptors by means of the "bag of features" approach. We also show that considering pairs of "geometric words" ("geometric expressions") allows creating spatially sensitive bags of features with better discriminative power. Finally, adopting metric learning approaches, we show that shapes can be efficiently represented as binary codes. Our approach achieves state-of-the-art results on the SHREC 2010 large-scale shape retrieval benchmark. © 2011 ACM.}, bibtype = {article}, author = {Bronstein, Alexander M. and Bronstein, Michael M. and Guibas, Leonidas J. and Ovsjanikov, Maks}, doi = {10.1145/1899404.1899405}, journal = {ACM Transactions on Graphics}, number = {1} }
@article{ title = {Blended intrinsic maps}, type = {article}, year = {2011}, keywords = {Inter-surface correspondences,Inter-surface map}, pages = {1-12}, volume = {30}, id = {1d0ed47d-3955-3f8a-bc87-a0bb2c5b42d7}, created = {2021-08-28T19:32:57.606Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-29T21:49:16.954Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {This paper describes a fully automatic pipeline for finding an intrinsic map between two non-isometric, genus zero surfaces. Our approach is based on the observation that efficient methods exist to search for nearly isometric maps (e.g., Möbius Voting or Heat Kernel Maps), but no single solution found with these methods provides low-distortion everywhere for pairs of surfaces differing by large deformations. To address this problem, we suggest using a weighted combination of these maps to produce a "blended map." This approach enables algorithms that leverage efficient search procedures, yet can provide the flexibility to handle large deformations. The main challenges of this approach lie in finding a set of candidate maps $\{m_i\}$ and their associated blending weights $\{b_i(p)\}$ for every point p on the surface. We address these challenges specifically for conformal maps by making the following contributions. First, we provide a way to blend maps, defining the image of p as the weighted geodesic centroid of $m_i(p)$. Second, we provide a definition for smooth blending weights at every point p that are proportional to the area preservation of $m_i$ at p. Third, we solve a global optimization problem that selects candidate maps based both on their area preservation and consistency with other selected maps. During experiments with these methods, we find that our algorithm produces blended maps that align semantic features better than alternative approaches over a variety of data sets. © 2011 ACM.}, bibtype = {article}, author = {Kim, Vladimir G. and Lipman, Yaron and Funkhouser, Thomas}, doi = {10.1145/1964921.1964974}, journal = {ACM Transactions on Graphics}, number = {4} }
@article{ title = {3D is here: Point Cloud Library (PCL)}, type = {article}, year = {2011}, id = {1bdda4ac-4f2a-370b-a186-328c423ebf65}, created = {2021-10-15T12:44:59.955Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T07:46:13.187Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {With the advent of new, low-cost 3D sensing hardware such as the Kinect, and continued efforts in advanced point cloud processing, 3D perception gains more and more importance in robotics, as well as other fields. In this paper we present one of our most recent initiatives in the areas of point cloud perception: PCL (Point Cloud Library - http://pointclouds.org). PCL presents an advanced and extensive approach to the subject of 3D perception, and it's meant to provide support for all the common 3D building blocks that applications need. The library contains state-of-the art algorithms for: filtering, feature estimation, surface reconstruction, registration, model fitting and segmentation. PCL is supported by an international community of robotics and perception researchers. We provide a brief walkthrough of PCL including its algorithmic capabilities and implementation strategies. © 2011 IEEE.}, bibtype = {article}, author = {Rusu, Radu Bogdan and Cousins, Steve}, doi = {10.1109/ICRA.2011.5980567}, journal = {Proceedings - IEEE International Conference on Robotics and Automation}, number = {May} }
@article{ title = {Graph-based representations of point clouds}, type = {article}, year = {2011}, keywords = {Graph-based representations,Point clouds,Shape abstraction,Shape comparison}, pages = {151-164}, volume = {73}, websites = {http://dx.doi.org/10.1016/j.gmod.2011.03.002}, publisher = {Elsevier Inc.}, id = {8afb4f80-9107-3459-9178-69f7a60573af}, created = {2022-01-05T09:23:16.000Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:04:13.370Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {62c10c84-630d-4fdf-af89-e343e67460c7}, private_publication = {false}, abstract = {This paper introduces a skeletal representation, called Point Cloud Graph, that generalizes the definition of the Reeb graph to arbitrary point clouds sampled from m-dimensional manifolds embedded in the d-dimensional space. The proposed algorithm is easy to implement and the graph representation yields to an effective abstraction of the data. Finally, we present experimental results on point-sampled surfaces and volumetric data that show the robustness of the Point Cloud Graph to non-uniform point distributions and its usefulness for shape comparison. © 2011 Elsevier Inc. All rights reserved.}, bibtype = {article}, author = {Natali, Mattia and Biasotti, Silvia and Patané, Giuseppe and Falcidieno, Bianca}, doi = {10.1016/j.gmod.2011.03.002}, journal = {Graphical Models}, number = {5} }
@article{ title = {Képi információ mérése}, type = {article}, year = {2011}, id = {37c62b71-6225-3a31-9fd8-a7a2e687c5a8}, created = {2022-03-11T06:47:59.391Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-11T07:56:12.504Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Attila2011}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, bibtype = {article}, author = {Attila, C S} }
@book{ title = {Számítógépes látás}, type = {book}, year = {2011}, id = {60b17d0f-cdfb-3d2a-ac5e-e649bb0f71a7}, created = {2022-03-11T07:57:12.661Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:10.377Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kato2011}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, bibtype = {book}, author = {Kató, Zoltán and Czúni, László} }
@book{ title = {Képfeldolgozás haladóknak}, type = {book}, year = {2011}, websites = {https://regi.tankonyvtar.hu/hu/tartalom/tamop425/0008_palagyi/Palagyi_Kepfeldolgozas_1_1.html}, id = {f4efbc82-0969-3da7-8b81-aca2c3d6418e}, created = {2022-03-15T09:58:27.602Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:10.422Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {PalagyiKalman2011}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, bibtype = {book}, author = {Palágyi, Kálmán} }
@article{ title = {Leveraging social media networks for classification}, type = {article}, year = {2011}, pages = {447-478}, volume = {23}, websites = {https://doi.org/10.1007/s10618-010-0210-x}, month = {11}, id = {cc94dde0-5550-32f6-ac1d-ae689d1332a2}, created = {2022-03-28T09:45:01.039Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:01:19.207Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {tangLeveragingSocialMedia2011}, source_type = {article}, private_publication = {false}, abstract = {Social media has reshaped the way in which people interact with each other. The rapid development of participatory web and social networking sites like YouTube, Twitter, and Facebook, also brings about many data mining opportunities and novel challenges. In particular, we focus on classification tasks with user interaction information in a social network. Networks in social media are heterogeneous, consisting of various relations. Since the relation-type information may not be available in social media, most existing approaches treat these inhomogeneous connections homogeneously, leading to an unsatisfactory classification performance. In order to handle the network heterogeneity, we propose the concept of social dimension to represent actors’ latent affiliations, and develop a classification framework based on that. The proposed framework, SocioDim, first extracts social dimensions based on the network structure to accurately capture prominent interaction patterns between actors, then learns a discriminative classifier to select relevant social dimensions. SocioDim, by differentiating different types of network connections, outperforms existing representative methods of classification in social media, and offers a simple yet effective approach to integrating two types of seemingly orthogonal information: the network of actors and their attributes.}, bibtype = {article}, author = {Tang, Lei and Liu, Huan}, doi = {10.1007/s10618-010-0210-x}, journal = {Data Mining and Knowledge Discovery}, number = {3} }
@book{ title = {BIM Handbook: A Guide to Building Information Modeling for Owners, Managers, Designers, Engineers and Contractors}, type = {book}, year = {2011}, keywords = {Architecture / General}, month = {4}, publisher = {John Wiley \& Sons}, id = {451854fe-3bf6-3f60-b850-8650226ec9f4}, created = {2022-03-28T09:45:02.230Z}, file_attached = {false}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:02.230Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {eastmanBIMHandbookGuide2011}, source_type = {book}, short_title = {BIM Handbook}, private_publication = {false}, abstract = {"The BIM Handbook is an extensively researched and meticulously written book, showing evidence of years of work rather than something that has been quickly put together in the course of a few months. It brings together most of the current information about BIM, its history, as well as its potential future in one convenient place, and can serve as a handy reference book on BIM for anyone who is involved in the design, construction, and operation of buildings and needs to know about the technologies that support it. The need for such a book is indisputable, and it is terrific that Chuck Eastman and his team were able to step up to the plate and make it happen. Thanks to their efforts, anyone in the AEC industry looking for a deeper understanding of BIM now knows exactly where to look for it." —AECbytes book review, August 28, 2008 (www.aecbytes.com/review/2008/BIMHandbook.html) DISCOVER BIM: A BETTER WAY TO BUILD BETTER BUILDINGS Building Information Modeling (BIM) offers a novel approach to design, construction, and facility management in which a digital representation of the building process is used to facilitate the exchange and interoperability of information in digital format. BIM is beginning to change the way buildings look, the way they function, and the ways in which they are designed and built. The BIM Handbook, Second Edition provides an in-depth understanding of BIM technologies, the business and organizational issues associated with its implementation, and the profound advantages that effective use of BIM can provide to all members of a project team. Updates to this edition include: Completely updated material covering the current practice and technology in this fast-moving field Expanded coverage of lean construction and its use of BIM, with special focus on Integrated Project Delivery throughout the book New insight on the ways BIM facilitates sustainable building New information on interoperability schemas and collaboration tools Six new case studies Painting a colorful and thorough picture of the state of the art in building information modeling, the BIM Handbook, Second Edition guides readers to successful implementations, helping them to avoid needless frustration and costs and take full advantage of this paradigm-shifting approach to construct better buildings that consume fewer materials and require less time, labor, and capital resources.}, bibtype = {book}, author = {Eastman, Charles M and Eastman, Chuck and Teicholz, Paul and Sacks, Rafael and Liston, Kathleen} }
@inproceedings{ title = {BlenSor: Blender Sensor Simulation Toolbox}, type = {inproceedings}, year = {2011}, keywords = {Complex Scene,Game Engine,Lidar Sensor,Obstacle Detection,Pitch Angle}, pages = {199-208}, publisher = {Springer}, city = {Berlin, Heidelberg}, series = {Lecture Notes in Computer Science}, id = {fbfded84-741c-325e-8a5e-93b6d4909116}, created = {2022-03-28T09:45:02.258Z}, file_attached = {false}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:02.258Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {gschwandtnerBlenSorBlenderSensor2011}, source_type = {inproceedings}, short_title = {BlenSor}, private_publication = {false}, abstract = {This paper introduces a novel software package for the simulation of various types of range scanners. The goal is to provide researchers in the fields of obstacle detection, range data segmentation, obstacle tracking or surface reconstruction with a versatile and powerful software package that is easy to use and allows to focus on algorithmic improvements rather than on building the software framework around it. The simulation environment and the actual simulations can be efficiently distributed with a single compact file. Our proposed approach facilitates easy regeneration of published results, hereby highlighting the value of reproducible research.}, bibtype = {inproceedings}, author = {Gschwandtner, Michael and Kwitt, Roland and Uhl, Andreas and Pree, Wolfgang}, editor = {Bebis, George and Boyle, Richard and Parvin, Bahram and Koracin, Darko and Wang, Song and Kyungnam, Kim and Benes, Bedrich and Moreland, Kenneth and Borst, Christoph and DiVerdi, Stephen and Yi-Jen, Chiang and Ming, Jiang}, doi = {10.1007/978-3-642-24031-7_20}, booktitle = {Advances in Visual Computing} }
@article{ title = {Adaptive Subgradient Methods for Online Learning and Stochastic Optimization}, type = {article}, year = {2011}, pages = {2121-2159}, volume = {12}, month = {7}, id = {4e10f41a-7dd6-3b12-9f35-f31cc2767a04}, created = {2022-03-28T09:45:03.734Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:06:10.997Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {duchiAdaptiveSubgradientMethods2011}, source_type = {article}, private_publication = {false}, abstract = {We present a new family of subgradient methods that dynamically incorporate knowledge of the geometry of the data observed in earlier iterations to perform more informative gradient-based learning. Metaphorically, the adaptation allows us to find needles in haystacks in the form of very predictive but rarely seen features. Our paradigm stems from recent advances in stochastic optimization and online learning which employ proximal functions to control the gradient steps of the algorithm. We describe and analyze an apparatus for adaptively modifying the proximal function, which significantly simplifies setting a learning rate and results in regret guarantees that are provably as good as the best proximal function that can be chosen in hindsight. We give several efficient algorithms for empirical risk minimization problems with common and important regularization functions and domain constraints. We experimentally study our theoretical analysis and show that adaptive subgradient methods outperform state-of-the-art, yet non-adaptive, subgradient algorithms.}, bibtype = {article}, author = {Duchi, John and Hazan, Elad and Singer, Yoram}, journal = {The Journal of Machine Learning Research}, number = {null} }
@article{ title = {Local cortical surface complexity maps from spherical harmonic reconstructions}, type = {article}, year = {2011}, keywords = {Complexity,Gyrification,MRI,Morphology,Schizophrenia,Shape analysis}, pages = {961-973}, volume = {56}, websites = {http://dx.doi.org/10.1016/j.neuroimage.2011.02.007}, publisher = {Elsevier Inc.}, id = {c2f7c517-6a9a-37e1-95ce-d3e9f1591e73}, created = {2023-04-24T07:38:01.469Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.684Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Yotter2011}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143,f4d8f1ef-fdcb-4a5b-a626-6e2fea47fb6d}, private_publication = {false}, abstract = {Altered cortical surface complexity and gyrification differences may be a potentially sensitive marker for several neurodevelopmental disorders. We propose to use spherical harmonic (SPH) constructions to measure cortical surface folding complexity. First, we demonstrate that the complexity measure is accurate, by applying our SPH approach and the more traditional box-counting method to von Koch fractal surfaces with known fractal dimension (FD) values. The SPH approach is then applied to study complexity differences between 87 patients with DSM-IV schizophrenia (with stable psychopathology and treated with antipsychotic medication; 48 male/39 female; mean age = 35.5. years, SD = 11.0) and 108 matched healthy controls (68 male/40 female; mean age = 32.1 years, SD = 10.0). The global FD for the right hemisphere in the schizophrenia group was significantly reduced. Regionally, reduced complexity was also found in temporal, frontal, and cingulate regions in the right hemisphere, and temporal and prefrontal regions in the left hemisphere. These results are discussed in terms of previously published findings. Finally, the anatomical implications of a reduced FD are highlighted through comparison of two subjects with vastly different complexity maps. © 2011 Elsevier Inc.}, bibtype = {article}, author = {Yotter, Rachel A. and Nenadic, Igor and Ziegler, Gabriel and Thompson, Paul M. and Gaser, Christian}, doi = {10.1016/j.neuroimage.2011.02.007}, journal = {NeuroImage}, number = {3} }
@article{ title = {Ensemble of shape functions for 3D object classification}, type = {article}, year = {2011}, pages = {2987-2992}, publisher = {IEEE}, id = {ef2863ea-2d84-36ff-b22c-bb216301b4db}, created = {2023-05-03T13:16:38.982Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:25.289Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Wohlkinger2011}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {This work addresses the problem of real-time 3D shape based object class recognition, its scaling to many categories and the reliable perception of categories. A novel shape descriptor for partial point clouds based on shape functions is presented, capable of training on synthetic data and classifying objects from a depth sensor in a single partial view in a fast and robust manner. The classification task is stated as a 3D retrieval task finding the nearest neighbors from synthetically generated views of CAD-models to the sensed point cloud with a Kinect-style depth sensor. The presented shape descriptor shows that the combination of angle, point-distance and area shape functions gives a significant boost in recognition rate against the baseline descriptor and outperforms the state-of-the-art descriptors in our experimental evaluation on a publicly available dataset of real-world objects in table scene contexts with up to 200 categories. © 2011 IEEE.}, bibtype = {article}, author = {Wohlkinger, Walter and Vincze, Markus}, doi = {10.1109/ROBIO.2011.6181760}, journal = {2011 IEEE International Conference on Robotics and Biomimetics, ROBIO 2011} }
@article{ title = {Spectral registration of noisy sonar data for underwater 3D Mapping}, type = {article}, year = {2011}, keywords = {3D mapping,Autonomous underwater vehicle (AUV),Remotely operated vehicle (ROV),Spectral registration,Underwater robotics}, pages = {307-331}, volume = {30}, id = {8fdc914e-e453-3c77-ac79-6e024bf4c8df}, created = {2023-05-03T13:16:39.209Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:27.190Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Bulow2011}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {3D mapping is very challenging in the underwater domain, especially due to the lack of high resolution, low noise sensors. A new spectral registration method is presented that can determine the spatial 6 DOF transformation between pairs of very noisy 3D scans with only partial overlap. The approach is hence suited to cope with sonar as the predominant underwater sensor. The spectral registration method is based on Phase Only Matched Filtering (POMF) on non-trivially resampled spectra of the 3D data. Two extensive sets of experiments are presented. First, evaluations with simulated data are done where the type and amount of noise can be controlled and the ground truth transformations between scans are known. Second, real world data from a Tritech Eclipse sonar is used. Concretely, 18 sonar scans of a large structure in form of a flood gate and a lock in the river Lesum in Bremen are used for 3D mapping. In doing so, the spectral registration method is compared to two other methods suited for noisy 3D registrations, namely Iterative Closest Point (ICP) and plane-based registration. It is shown that the spectral registration method performs very well in terms of the resulting 3D map as well as its runtimes. © Springer Science+Business Media LLC 2011.}, bibtype = {article}, author = {Bülow, Heiko and Birk, Andreas}, doi = {10.1007/s10514-011-9221-8}, journal = {Autonomous Robots}, number = {3} }
@article{ title = {Viewpoint invariants from three-dimensional data: The role of reflection in human activity understanding}, type = {article}, year = {2011}, pages = {57-62}, publisher = {IEEE}, id = {d0519746-c904-3371-9546-b912c30b2126}, created = {2023-05-03T13:16:39.428Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.879Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kakarala2011}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Human activity understanding from three-dimensional data, such as from depth cameras, requires viewpoint-invariant matching. In this paper, we propose a new method of constructing invariants that allows distinction between isometries based on rotation, which preserve handedness, and those that involve reflection, which reverse right and left hands. The state-of-the-art in viewpoint invariants uses either global descriptors such as moments or spherical harmonic magnitudes, or relies on local methods such as feature matching. None of those methods are able to easily distinguish rotations from reflections, which is essential to understand left vs right handed gestures. We show that the distinction between rotation and reflection is contained in the imaginary part of certain weighted inner-products of moment vectors. We show how reflection-sensing viewpoint invariants may be applied to depth-map data for understanding activity data. © 2011 IEEE.}, bibtype = {article}, author = {Kakarala, Ramakrishna and Kaliamoorthi, Prabhu and Li, Wanqing}, doi = {10.1109/CVPRW.2011.5981785}, journal = {IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops} }
@article{ title = {Harmonic point cloud orientation}, type = {article}, year = {2011}, keywords = {Gradient fields,Harmonic functions,LaplaceBeltrami operator,Normal orientation,Point clouds,Surface reconstruction}, pages = {492-499}, volume = {35}, websites = {http://dx.doi.org/10.1016/j.cag.2011.03.012}, publisher = {Elsevier}, id = {7bbddab3-32d1-3402-803b-a63ea69f3857}, created = {2023-05-03T13:16:39.975Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.060Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Seversky2011}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {In this work we propose a new method for estimating the normal orientation of unorganized point clouds. Consistent assignment of normal orientation is a challenging task in the presence of sharp features, nearby surface sheets, noise, undersampling, and missing data. Existing approaches, which consider local geometric properties often fail when operating on such point clouds as local neighborhood measures inherently face issues of robustness. Our approach circumvents these issues by orienting normals based on globally smooth functions defined on point clouds with measures that depend only on single points. More specifically, we consider harmonic functions, or functions which lie in the kernel of the point cloud LaplaceBeltrami operator. Each harmonic function in the set is used to define a gradient field over the point cloud. The problem of normal orientation is then cast as an assignment of cross-product ordering between gradient fields. Global smoothness ensures a highly consistent orientation, rendering our method extremely robust in the presence of imperfect point clouds.}, bibtype = {article}, author = {Seversky, Lee M. and Berger, Matt S. and Yin, Lijun}, doi = {10.1016/j.cag.2011.03.012}, journal = {Computers and Graphics (Pergamon)}, number = {3} }
@article{ title = {Multipath interference compensation in time-of-flight camera images}, type = {article}, year = {2010}, keywords = {Calibration,Range imaging,Time-of-flight camera}, pages = {3583-3586}, id = {8d91ee90-74c1-374a-9f11-ad14a6105d3d}, created = {2020-11-05T09:10:50.862Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-10T07:17:53.806Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47,019ca26f-d15a-40ee-aa8d-7a0fbf949316}, private_publication = {false}, abstract = {Multipath interference is inherent to the working principle of a Time-of-flight camera and can influence the measurements by several centimeters. Especially in applications that demand for high accuracy, such as object localization for robotic manipulation or ego-motion estimation of mobile robots, multipath interference is not tolerable. In this paper we formulate a multipath model in order to estimate the interference and correct the measurements. The proposed approach comprises the measured scene structure. All distracting surfaces are assumed to be Lambertian radiators and the directional interference is simulated for correction purposes. The positive impact of these corrections is experimentally demonstrated. © 2010 IEEE.}, bibtype = {article}, author = {Fuchs, Stefan}, doi = {10.1109/ICPR.2010.874}, journal = {Proceedings - International Conference on Pattern Recognition} }
@article{ title = {Active perception and scene modeling by planning with probabilistic 6D object poses}, type = {article}, year = {2010}, pages = {1036-1043}, publisher = {IEEE}, id = {bd2c3713-a4e8-3970-bb27-7876310ceaab}, created = {2021-02-09T17:05:46.907Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:02:39.782Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Eidenberger2010}, folder_uuids = {ff5688af-48ab-463d-a850-e1b325757918}, private_publication = {false}, abstract = {This paper presents an approach to probabilistic active perception planning for scene modeling in cluttered and realistic environments. When dealing with complex, multiobject scenes with arbitrary object positions, the estimation of 6D poses including their expected uncertainties is essential. The scene model keeps track of the probabilistic object hypotheses over several sequencing sensing actions to represent the real object constellation. To improve detection results and to tackle occlusion problems a method for active planning is proposed which reasons about model and state transition uncertainties in continuous and high-dimensional domains. Information theoretic quality criteria are used for sequential decision making to evaluate probability distributions. The probabilistic planner is realized as a partially observable Markov decision process (POMDP). The active perception system for autonomous service robots is evaluated in experiments in a kitchen environment. In 80 test runs the efficiency and satisfactory behavior of the proposed methodology is shown in comparison to a random and a stepaside action selection strategy. The objects are selected from a large database consisting of 100 different household items. ©2010 IEEE.}, bibtype = {article}, author = {Eidenberger, Robert and Scharinger, Josef}, doi = {10.1109/IROS.2010.5651927}, journal = {IEEE/RSJ 2010 International Conference on Intelligent Robots and Systems, IROS 2010 - Conference Proceedings} }
@article{ title = {Scale-invariant heat kernel signatures for non-rigid shape recognition}, type = {article}, year = {2010}, pages = {1704-1711}, id = {55819212-1264-3016-8640-636ab9905943}, created = {2021-08-28T19:32:57.311Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-29T21:49:16.970Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {One of the biggest challenges in non-rigid shape retrieval and comparison is the design of a shape descriptor that would maintain invariance under a wide class of transformations the shape can undergo. Recently, heat kernel signature was introduced as an intrinsic local shape descriptor based on diffusion scale-space analysis. In this paper, we develop a scale-invariant version of the heat kernel descriptor. Our construction is based on a logarithmically sampled scale-space in which shape scaling corresponds, up to a multiplicative constant, to a translation. This translation is undone using the magnitude of the Fourier transform. The proposed scale-invariant local descriptors can be used in the bag-of-features framework for shape retrieval in the presence of transformations such as isometric deformations, missing data, topological noise, and global and local scaling. We get significant performance improvement over state-of-the-art algorithms on recently established non-rigid shape retrieval benchmarks. ©2010 IEEE.}, bibtype = {article}, author = {Bronstein, Michael M. and Kokkinos, Iasonas}, doi = {10.1109/CVPR.2010.5539838}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Parallel Poisson Disk Sampling with Spectrum Analysis on Surfaces}, type = {article}, year = {2010}, keywords = {GPU,Poisson disk sampling,manifold surface,mesh Laplacian,parallel computation,spectrum analysis}, pages = {1-10}, volume = {29}, id = {03087b0d-19af-31e3-ab1d-bd4c0a7827ee}, created = {2021-11-01T10:14:38.922Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.316Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Bowers2010}, folder_uuids = {cd02f564-0123-4236-a320-b339927f085a}, private_publication = {false}, abstract = {The ability to place surface samples with Poisson disk distribution can benefit a variety of graphics applications. Such a distribution satisfies the blue noise property, i.e. lack of low frequency noise and structural bias in the Fourier power spectrum. While many techniques are available for sampling the plane, challenges remain for sampling arbitrary surfaces. In this paper, we present new methods for Poisson disk sampling with spectrum analysis on arbitrary manifold surfaces. Our first contribution is a parallel dart throwing algorithm that generates high-quality surface samples at interactive rates. It is flexible and can be extended to adaptive sampling given a user-specified radius field. Our second contribution is a new method for analyzing the spectral quality of surface samples. Using the spectral mesh basis derived from the discrete mesh Laplacian operator, we extend standard concepts in power spectrum analysis such as radial means and anisotropy to arbitrary manifold surfaces. This provides a way to directly evaluate the spectral distribution quality of surface samples without requiring mesh parameterization. Finally, we implement our Poisson disk sampling algorithm on the GPU, and demonstrate practical applications involving interactive sampling and texturing on arbitrary surfaces. © 2010, ACM. All rights reserved.}, bibtype = {article}, author = {Bowers, John and Wang, Rui and Maletz, David and Wei, Li Yi}, doi = {10.1145/1882261.1866188}, journal = {ACM Transactions on Graphics}, number = {6} }
@article{ title = {High-dimensional spectral feature selection for 3D object recognition based on reeb graphs}, type = {article}, year = {2010}, pages = {119-128}, volume = {6218 LNCS}, id = {4e927668-3def-330a-91fe-1ddeb27a704c}, created = {2022-01-14T16:04:11.938Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:04:22.435Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {In this work we evaluate purely structural graph measures for 3D object classification. We extract spectral features from different Reeb graph representations and successfully deal with a multi-class problem. We use an information-theoretic filter for feature selection. We show experimentally that a small change in the order of selection has a significant impact on the classification performance and we study the impact of the precision of the selection criterion. A detailed analysis of the feature participation during the selection process helps us to draw conclusions about which spectral features are most important for the classification problem. © 2010 Springer-Verlag Berlin Heidelberg.}, bibtype = {article}, author = {Bonev, Boyan and Escolano, Francisco and Giorgi, Daniela and Biasotti, Silvia}, doi = {10.1007/978-3-642-14980-1_11}, journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)} }
@inproceedings{ title = {Rectified Linear Units Improve Restricted Boltzmann Machines}, type = {inproceedings}, year = {2010}, websites = {https://openreview.net/forum?id=rkb15iZdZB}, month = {1}, id = {842b9620-ecb2-3b79-b681-02419b3d22be}, created = {2022-03-28T09:45:03.954Z}, accessed = {2021-10-01}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:06:37.419Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {nairRectifiedLinearUnits2010}, source_type = {inproceedings}, private_publication = {false}, abstract = {Restricted Boltzmann machines were developed using binary stochastic hidden units. These can be generalized by replacing each binary unit by an infinite number of copies that all have the same...}, bibtype = {inproceedings}, author = {Nair, Vinod and Hinton, Geoffrey E} }
@article{ title = {Fast Inference in Sparse Coding Algorithms with Applications to Object Recognition}, type = {article}, year = {2010}, keywords = {Computer Science - Computer Vision and Pattern Re,Computer Science - Machine Learning}, websites = {http://arxiv.org/abs/1010.3467}, month = {10}, id = {145315f3-e39f-3a5d-8828-55775e4d7f19}, created = {2022-03-28T09:45:04.216Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:06:39.732Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {kavukcuogluFastInferenceSparse2010}, source_type = {article}, notes = {arXiv: 1010.3467}, private_publication = {false}, abstract = {Adaptive sparse coding methods learn a possibly overcomplete set of basis functions, such that natural image patches can be reconstructed by linearly combining a small subset of these bases. The applicability of these methods to visual object recognition tasks has been limited because of the prohibitive cost of the optimization algorithms required to compute the sparse representation. In this work we propose a simple and efficient algorithm to learn basis functions. After training, this model also provides a fast and smooth approximator to the optimal representation, achieving even better accuracy than exact sparse coding algorithms on visual object recognition tasks.}, bibtype = {article}, author = {Kavukcuoglu, Koray and Ranzato, Marc'Aurelio and LeCun, Yann}, journal = {arXiv:1010.3467 [cs]} }
@inproceedings{ title = {OctoMap: A probabilistic, flexible, and compact 3D map representation for robotic systems}, type = {inproceedings}, year = {2010}, id = {eb73020a-2ba4-3122-a5e6-710e7f5bcf3b}, created = {2022-03-28T09:45:04.439Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:07:17.620Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {wurmOctoMapProbabilisticFlexible2010}, source_type = {inproceedings}, short_title = {OctoMap}, private_publication = {false}, abstract = {Abstract—In this paper, we present an approach for modeling 3D environments based on octrees using a probabilistic occupancy estimation. Our technique is able to represent full 3D models including free and unknown areas. It is available as an open-source library to facilitate the development of 3D mapping systems. We also provide a detailed review of existing approaches to 3D modeling. Our approach was thoroughly evaluated using different real-world and simulated datasets. The results demonstrate that our approach is able to model the data probabilistically while, at the same time, keeping the memory requirement at a minimum. I.}, bibtype = {inproceedings}, author = {Wurm, Kai M and Hornung, Armin and Bennewitz, Maren and Stachniss, Cyrill and Burgard, Wolfram}, booktitle = {In Proc. of the ICRA 2010 workshop} }
@article{ title = {Stacked Denoising Autoencoders: Learning Useful Representations in a Deep Network with a Local Denoising Criterion}, type = {article}, year = {2010}, pages = {3371-3408}, volume = {11}, month = {12}, id = {43771b0e-2a29-3a40-a072-244a3a894412}, created = {2022-03-28T09:45:04.528Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:07:51.997Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {vincentStackedDenoisingAutoencoders2010}, source_type = {article}, short_title = {Stacked Denoising Autoencoders}, private_publication = {false}, abstract = {We explore an original strategy for building deep networks, based on stacking layers of denoising autoencoders which are trained locally to denoise corrupted versions of their inputs. The resulting algorithm is a straightforward variation on the stacking of ordinary autoencoders. It is however shown on a benchmark of classification problems to yield significantly lower classification error, thus bridging the performance gap with deep belief networks (DBN), and in several cases surpassing it. Higher level representations learnt in this purely unsupervised fashion also help boost the performance of subsequent SVM classifiers. Qualitative experiments show that, contrary to ordinary autoencoders, denoising autoencoders are able to learn Gabor-like edge detectors from natural image patches and larger stroke detectors from digit images. This work clearly establishes the value of using a denoising criterion as a tractable unsupervised objective to guide the learning of useful higher level representations.}, bibtype = {article}, author = {Vincent, Pascal and Larochelle, Hugo and Lajoie, Isabelle and Bengio, Yoshua and Manzagol, Pierre-Antoine}, journal = {The Journal of Machine Learning Research} }
@article{ title = {Parallel Poisson disk sampling with spectrum analysis on surfaces}, type = {article}, year = {2010}, keywords = {GPU,Poisson disk sampling,manifold surface,mesh Laplacian,parallel computation,spectrum analysis}, pages = {166:1--166:10}, volume = {29}, websites = {https://doi.org/10.1145/1882261.1866188}, month = {12}, id = {89c3364a-e8b6-39a6-a421-b04e86a0f5f8}, created = {2022-03-28T09:45:06.359Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T07:59:09.850Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {bowersParallelPoissonDisk2010}, source_type = {article}, private_publication = {false}, abstract = {The ability to place surface samples with Poisson disk distribution can benefit a variety of graphics applications. Such a distribution satisfies the blue noise property, i.e. lack of low frequency noise and structural bias in the Fourier power spectrum. While many techniques are available for sampling the plane, challenges remain for sampling arbitrary surfaces. In this paper, we present new methods for Poisson disk sampling with spectrum analysis on arbitrary manifold surfaces. Our first contribution is a parallel dart throwing algorithm that generates high-quality surface samples at interactive rates. It is flexible and can be extended to adaptive sampling given a user-specified radius field. Our second contribution is a new method for analyzing the spectral quality of surface samples. Using the spectral mesh basis derived from the discrete mesh Laplacian operator, we extend standard concepts in power spectrum analysis such as radial means and anisotropy to arbitrary manifold surfaces. This provides a way to directly evaluate the spectral distribution quality of surface samples without requiring mesh parameterization. Finally, we implement our Poisson disk sampling algorithm on the GPU, and demonstrate practical applications involving interactive sampling and texturing on arbitrary surfaces.}, bibtype = {article}, author = {Bowers, John and Wang, Rui and Wei, Li-Yi and Maletz, David}, doi = {10.1145/1882261.1866188}, journal = {ACM Transactions on Graphics}, number = {6} }
@inproceedings{ title = {Understanding the difficulty of training deep feedforward neural networks}, type = {inproceedings}, year = {2010}, pages = {249-256}, websites = {https://proceedings.mlr.press/v9/glorot10a.html}, month = {3}, publisher = {JMLR Workshop and Conference Proceedings}, id = {02bc1d82-6b45-3c3b-a194-7177f73d39ec}, created = {2022-03-28T09:45:06.490Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T07:59:37.649Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {glorotUnderstandingDifficultyTraining2010}, source_type = {inproceedings}, notes = {ISSN: 1938-7228}, private_publication = {false}, abstract = {Whereas before 2006 it appears that deep multi-layer neural networks were not successfully trained, since then several algorithms have been shown to successfully train them, with experimental results showing the superiority of deeper vs less deep architectures. All these experimental results were obtained with new initialization or training mechanisms. Our objective here is to understand better why standard gradient descent from random initialization is doing so poorly with deep neural networks, to better understand these recent relative successes and help design better algorithms in the future. We first observe the influence of the non-linear activations functions. We find that the logistic sigmoid activation is unsuited for deep networks with random initialization because of its mean value, which can drive especially the top hidden layer into saturation. Surprisingly, we find that saturated units can move out of saturation by themselves, albeit slowly, and explaining the plateaus sometimes seen when training neural networks. We find that a new non-linearity that saturates less can often be beneficial. Finally, we study how activations and gradients vary across layers and during training, with the idea that training may be more difficult when the singular values of the Jacobian associated with each layer are far from 1. Based on these considerations, we propose a new initialization scheme that brings substantially faster convergence.}, bibtype = {inproceedings}, author = {Glorot, Xavier and Bengio, Yoshua}, booktitle = {Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics} }
@article{ title = {Spherical harmonic decomposition for surfaces of arbitrary topology}, type = {article}, year = {2010}, keywords = {Spherical harmonic decomposition,Spherical parameterization}, pages = {215-220}, publisher = {IEEE}, id = {66dfbe96-8ef1-382e-a182-b187966fc80f}, created = {2023-05-03T13:16:40.300Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:25.786Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Yu2010}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Spherical harmonics have many valuable theoretic and practical applications in data and signal processing and modeling. It decomposes a given function defined on a sphere into a set of orthogonal spherical harmonics. However, the given signal/function needs to be defined on a sphere domain. This paper studies the spherical harmonic decomposition for functions defined on general 2-dimensional manifold surfaces. We parameterize a surface with non-trivial topology onto a sphere domain, upon which the spherical harmonic decomposition can be conducted effectively. We demonstrate the effectiveness of our framework via progressive surface reconstruction. ©2010 IEEE.}, bibtype = {article}, author = {Yu, Wuyi and Ye, Tengfei and Li, Maoqing and Li, Xin}, doi = {10.1109/ICCSE.2010.5593652}, journal = {ICCSE 2010 - 5th International Conference on Computer Science and Education, Final Program and Book of Abstracts} }
@article{ title = {View-invariant gesture recognition using 3D optical flow and harmonic motion context}, type = {article}, year = {2010}, keywords = {Action recognition,Motion primitives,Optical flow,Spherical harmonics,Time-of-Flight camera,View-invariant}, pages = {1353-1361}, volume = {114}, websites = {http://dx.doi.org/10.1016/j.cviu.2010.07.012}, publisher = {Elsevier Inc.}, id = {925efb64-644c-3346-89a8-ba57ed2e5083}, created = {2023-05-03T13:16:40.474Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-08-08T11:39:19.431Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Holte2010}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {This paper presents an approach for view-invariant gesture recognition. The approach is based on 3D data captured by a SwissRanger SR4000 camera. This camera produces both a depth map as well as an intensity image of a scene. Since the two information types are aligned, we can use the intensity image to define a region of interest for the relevant 3D data. This data fusion improves the quality of the motion detection and hence results in better recognition. The gesture recognition is based on finding motion primitives (temporal instances) in the 3D data. Motion is detected by a 3D version of optical flow and results in velocity annotated point clouds. The 3D motion primitives are represented efficiently by introducing motion context. The motion context is transformed into a view-invariant representation using spherical harmonic basis functions, yielding a harmonic motion context representation. A probabilistic Edit Distance classifier is applied to identify which gesture best describes a string of primitives. The approach is trained on data from one viewpoint and tested on data from a very different viewpoint. The recognition rate is 94.4% which is similar to the recognition rate when training and testing on gestures from the same viewpoint, hence the approach is indeed view-invariant. © 2010 Elsevier Inc. All rights reserved.}, bibtype = {article}, author = {Holte, M. B. and Moeslund, T. B. and Fihl, P.}, doi = {10.1016/j.cviu.2010.07.012}, journal = {Computer Vision and Image Understanding}, number = {12} }
@article{ title = {Close-range Scene Segmentation and Reconstruction of 3D Point Cloud Maps for Mobile Manipulation in Domestic Environments}, type = {article}, year = {2009}, pages = {6-11}, publisher = {IEEE}, id = {6acb7dfe-2edb-35ce-a831-bcbbeac1011c}, created = {2020-10-05T10:26:00.907Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-10-05T10:26:18.699Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Rusu, Radu Bogdan and Blodow, Nico and Marton, Zoltan Csaba and Beetz, Michael} }
@article{ title = {Comparison of surface normal estimation methods for range sensing applications}, type = {article}, year = {2009}, pages = {3206-3211}, id = {4aa8c308-250c-3b63-b939-702ace98deac}, created = {2021-01-15T06:56:02.033Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-15T06:56:05.616Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {As mobile robotics is gradually moving towards a level of semantic environment understanding, robust 3D object recognition plays an increasingly important role. One of the most crucial prerequisites for object recognition is a set of fast algorithms for geometry segmentation and extraction, which in turn rely on surface normal vectors as a fundamental feature. Although there exists a plethora of different approaches for estimating normal vectors from 3D point clouds, it is largely unclear which methods are preferable for online processing on a mobile robot. This paper presents a detailed analysis and comparison of existing methods for surface normal estimation, with a special emphasis on the trade-off between quality and speed. The study sheds light on the computational complexity as well as the qualitative differences between methods and provides guidelines on choosing the 'right' algorithm for the robotics practitioner. The robustness of the methods with respect to noise and neighborhood size is analyzed. All algorithms are benchmarked with simulated as well as real 3D laser data obtained from a mobile robot. © 2009 IEEE.}, bibtype = {article}, author = {Klasing, Klaas and Althoff, Daniel and Wollherr, Dirk and Buss, Martin}, doi = {10.1109/ROBOT.2009.5152493}, journal = {Proceedings - IEEE International Conference on Robotics and Automation}, number = {May} }
@article{ title = {A novel 3D classification system for canine impactions - The KPG index}, type = {article}, year = {2009}, keywords = {3D imaging,Canines,Cone beam}, pages = {291-296}, volume = {5}, id = {ce90765a-cf22-3240-829d-7bbc9c67006a}, created = {2021-01-22T10:39:37.999Z}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-22T10:39:41.886Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {Background: 3D cone beam imaging allows localization of impacted canines, using spatial relationships, with excellent tissue contrast. The aim of this project was to assess the degree of difficulty for the treatment of impacted canines, based on the 3D information provided by cone beam imaging. Methods: 3D cone beam images taken from subjects with impacted canines were obtained from a private practice and school setting. Results: A novel measuring scale was devised, based on three different viewpoints, in order to grade the difficulty of impaction and the potential efficacy of treatment. Depending on its anatomical location, the cusp tip and the root tip are each given a number 0-5 in three dimensions taken from a pretreatment image. The sum of the cusp tip and root tip scores in the three views dictated our anticipated difficulty of treatment. Conclusions: A novel method of analysing impactions using cone beam imaging was proposed. This method utilizes the entire three views of a CBCT image. Copyright © 2009 John Wiley & Sons, Ltd.}, bibtype = {article}, author = {Chung, How Kau and Pan, Philip and Gallerano, Ron L. and English, Jeryl D.}, doi = {10.1002/rcs.260}, journal = {International Journal of Medical Robotics and Computer Assisted Surgery}, number = {3} }
@article{ title = {A two-steps next-best-view algorithm for autonomous 3D object modeling by a humanoid robot}, type = {article}, year = {2009}, pages = {1159-1164}, publisher = {IEEE}, id = {5719224e-6ccd-3ba4-b836-4cbce364ef13}, created = {2021-02-09T17:05:47.017Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T17:05:52.639Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {b2811196-99f9-4028-9c0e-353a72fd14aa}, private_publication = {false}, abstract = {A novel approach is presented which aims at building autonomously visual models of unknown objects, using a humanoid robot. Previous methods have been proposed for the specific problem of the next-best-view during the modeling and the recognition process. However our approach differs as it takes advantage of humanoid specificities in terms of embedded vision sensor and redundant motion capabilities. In a previous work, another approach to this specific problem was presented which relies on a derivable formulation of the visual evaluation in order to integrate it with our posture generation method. However to get rid of some limitations we propose a new method, formulated using two steps: (i) an optimization algorithm without derivatives is used to find a camera pose which maximizes the amount of unknown data visible, and (ii) a whole robot posture is generated by using a different optimization method where the computed camera pose is set as a constraint on the robot head.© 2009 IEEE.}, bibtype = {article}, author = {Foissotte, Torea and Stasse, Olivier and Escande, Adrien and Wieber, Pierre Brice and Kheddar, Abderrahmane}, doi = {10.1109/ROBOT.2009.5152350}, journal = {Proceedings - IEEE International Conference on Robotics and Automation}, number = {i} }
@article{ title = {Convolutional deep belief networks for scalable unsupervised learning of hierarchical representations}, type = {article}, year = {2009}, pages = {609-616}, id = {f15f6d36-e677-362b-8eb5-664cfb0925b9}, created = {2021-07-12T14:15:35.034Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:16:34.993Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {There has been much interest in unsupervised learning of hierarchical generative models such as deep belief networks. Scaling such models to full-sized, high-dimensional images remains a difficult problem. To address this problem, we present the convolutional deep belief network, a hierarchical generative model which scales to realistic image sizes. This model is translation-invariant and supports efficient bottom-up and top-down probabilistic inference. Key to our approach is probabilistic max-pooling, a novel technique which shrinks the representations of higher layers in a probabilistically sound way. Our experiments show that the algorithm learns useful high-level visual features, such as object parts, from unlabeled images of objects and natural scenes. We demonstrate excellent performance on several visual recognition tasks and show that our model can perform hierarchical (bottom-up and top-down) inference over full-sized images.}, bibtype = {article}, author = {Lee, Honglak and Grosse, Roger and Ranganath, Rajesh and Ng, Andrew Y.}, doi = {10.1145/1553374.1553453}, journal = {Proceedings of the 26th International Conference On Machine Learning, ICML 2009}, number = {November} }
@article{ title = {Fast Point Feature Histograms (FPFH) for 3D Registration}, type = {article}, year = {2009}, pages = {3212-3217}, publisher = {Institute of Electrical and Electronics Engineers Inc.}, id = {6db01aa2-7d3d-3870-ae7d-ccdf34ccb466}, created = {2022-02-15T12:26:05.622Z}, accessed = {2022-02-15}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-16T08:38:43.971Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {In our recent work [1], [2], we proposed Point Feature Histograms (PFH) as robust multi-dimensional features which describe the local geometry around a point p for 3D point cloud datasets. In this paper, we modify their mathematical expressions and perform a rigorous analysis on their robustness and complexity for the problem of 3D registration for overlapping point cloud views. More concretely, we present several optimizations that reduce their computation times drastically by either caching previously computed values or by revising their theoretical formulations. The latter results in a new type of local features, called Fast Point Feature Histograms (FPFH), which retain most of the discriminative power of the PFH. Moreover, we propose an algorithm for the online computation of FPFH features for realtime applications. To validate our results we demonstrate their efficiency for 3D registration and propose a new sample consensus based method for bringing two datasets into the convergence basin of a local non-linear optimizer: SAC-IA (SAmple Consensus Initial Alignment).}, bibtype = {article}, author = {Rusu, Radu Bogdan and Blodow, Nico and Beetz, Michael}, doi = {10.1109/ROBOT.2009.5152473}, journal = {Proceedings - IEEE International Conference on Robotics and Automation} }
@inproceedings{ title = {A 3D Face Model for Pose and Illumination Invariant Face Recognition}, type = {inproceedings}, year = {2009}, keywords = {2D/3D fitting,Basel Face Model (BFM),Computer vision,Costs,Face detection,Face recognition,Image analysis,Image reconstruction,Image sensors,Lighting,Morphable Model,Power generation,Shape,database,generative 3D face models,identification,recognition,statistical models}, pages = {296-301}, month = {9}, id = {1684b311-fd75-3b40-8004-4932bf1379dd}, created = {2022-03-28T09:45:05.003Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:08:23.444Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {paysan3DFaceModel2009}, source_type = {inproceedings}, private_publication = {false}, abstract = {Generative 3D face models are a powerful tool in computer vision. They provide pose and illumination invariance by modeling the space of 3D faces and the imaging process. The power of these models comes at the cost of an expensive and tedious construction process, which has led the community to focus on more easily constructed but less powerful models. With this paper we publish a generative 3D shape and texture model, the Basel face model (BFM), and demonstrate its application to several face recognition tasks. We improve on previous models by offering higher shape and texture accuracy due to a better scanning device and less correspondence artifacts due to an improved registration algorithm. The same 3D face model can be fit to 2D or 3D images acquired under different situations and with different sensors using an analysis by synthesis method. The resulting model parameters separate pose, lighting, imaging and identity parameters, which facilitates invariant face recognition across sensors and data sets by comparing only the identity parameters. We hope that the availability of this registered face model will spur research in generative models. Together with the model we publish a set of detailed recognition and reconstruction results on standard databases to allow complete algorithm comparisons.}, bibtype = {inproceedings}, author = {Paysan, Pascal and Knothe, Reinhard and Amberg, Brian and Romdhani, Sami and Vetter, Thomas}, doi = {10.1109/AVSS.2009.58}, booktitle = {2009 Sixth IEEE International Conference on Advanced Video and Signal Based Surveillance} }
@book{ title = {Probabilistic Graphical Models: Principles and Techniques}, type = {book}, year = {2009}, keywords = {Computers / Artificial Intelligence / General}, month = {7}, publisher = {MIT Press}, id = {e2160a23-f069-32bb-9d82-6c08fa92f907}, created = {2022-03-28T09:45:05.164Z}, file_attached = {false}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:05.164Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {kollerProbabilisticGraphicalModels2009}, source_type = {book}, short_title = {Probabilistic Graphical Models}, notes = {Google-Books-ID: 7dzpHCHzNQ4C}, private_publication = {false}, abstract = {A general framework for constructing and using probabilistic models of complex systems that would enable a computer to use available information for making decisions.Most tasks require a person or an automated system to reason—to reach conclusions based on available information. The framework of probabilistic graphical models, presented in this book, provides a general approach for this task. The approach is model-based, allowing interpretable models to be constructed and then manipulated by reasoning algorithms. These models can also be learned automatically from data, allowing the approach to be used in cases where manually constructing a model is difficult or even impossible. Because uncertainty is an inescapable aspect of most real-world applications, the book focuses on probabilistic models, which make the uncertainty explicit and provide models that are more faithful to reality. Probabilistic Graphical Models discusses a variety of models, spanning Bayesian networks, undirected Markov networks, discrete and continuous models, and extensions to deal with dynamical systems and relational data. For each class of models, the text describes the three fundamental cornerstones: representation, inference, and learning, presenting both basic concepts and advanced techniques. Finally, the book considers the use of the proposed framework for causal reasoning and decision making under uncertainty. The main text in each chapter provides the detailed technical development of the key ideas. Most chapters also include boxes with additional material: skill boxes, which describe techniques; case study boxes, which discuss empirical cases related to the approach described in the text, including applications in computer vision, robotics, natural language understanding, and computational biology; and concept boxes, which present significant concepts drawn from the material in the chapter. Instructors (and readers) can group chapters in various combinations, from core topics to more technically advanced material, to suit their particular needs.}, bibtype = {book}, author = {Koller, Daphne and Friedman, Nir} }
@article{ title = {OpenRAVE: A Planning Architecture for Autonomous Robotics}, type = {article}, year = {2008}, pages = {-34}, websites = {http://www.ri.cmu.edu/pub_files/pub4/diankov_rosen_2008_2/diankov_rosen_2008_2.pdf}, id = {bb8660f7-ff8d-33f8-91f8-99679f9abe56}, created = {2021-02-09T17:05:46.979Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T17:05:52.805Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {6680b8ad-712a-4df5-976a-7f395b261a3d}, private_publication = {false}, abstract = {One of the challenges in developing real-world autonomous robots is the need for integrating and rigorously testing high-level scripting, motion planning, perception, and control algorithms. For this purpose, we introduce an open-source cross-platform software architecture called OpenRAVE, the Open Robotics and Animation Virtual Environment. OpenRAVE is targeted for real-world autonomous robot applications, and includes a seamless integration of 3-D simulation, visualization, planning, scripting and control. A plugin architecture allows users to easily write custom controllers or extend functionality. With OpenRAVE plugins, any planning algorithm, robot controller, or sensing subsystem can be distributed and dynamically loaded at run-time, which frees developers from struggling with monolithic code-bases. Users of OpenRAVE can concentrate on the development of planning and scripting aspects of a problem without having to explicitly manage the details of robot kinematics and dynamics, collision detection, world updates, and robot control. The OpenRAVE architecture provides a flexible interface that can be used in conjunction with other popular robotics packages such as Player and ROS because it is focused on autonomous motion planning and high-level scripting rather than low-level control and message protocols. OpenRAVE also supports a powerful network scripting environment which makes it simple to control and monitor robots and change execution flow during run-time. One of the key advantages of open component architectures is that they enable the robotics research community to easily share and compare algorithms.}, bibtype = {article}, author = {Diankov, Rosen and Kuffner, James}, journal = {Robotics}, number = {July} }
@article{ title = {Random projection trees and low dimensional manifolds}, type = {article}, year = {2008}, keywords = {Algorithms}, pages = {537-546}, volume = {1}, id = {9aea1c98-4c98-3c67-9d2c-540688133811}, created = {2021-10-26T08:17:02.625Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.646Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Dasgupta2008}, folder_uuids = {cd02f564-0123-4236-a320-b339927f085a}, private_publication = {false}, abstract = {We present a simple variant of the k-d tree which automatically adapts to intrinsic low dimensional structure in data without having to explicitly learn this structure. Copyright 2008 ACM.}, bibtype = {article}, author = {Dasgupta, Sanjoy and Freund, Yoav}, doi = {10.1145/1374376.1374452}, journal = {Proceedings of the Annual ACM Symposium on Theory of Computing} }
@article{ title = {Collective Classification in Network Data}, type = {article}, year = {2008}, pages = {93}, volume = {29}, websites = {https://ojs.aaai.org/index.php/aimagazine/article/view/2157}, month = {9}, id = {b1e2770a-6339-3f86-bed5-ab81656ac8d0}, created = {2022-03-28T09:45:02.056Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:02:42.988Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {senCollectiveClassificationNetwork2008}, source_type = {article}, notes = {Number: 3}, private_publication = {false}, abstract = {Many real-world applications produce networked data such as the world-wide web (hypertext documents connected via hyperlinks), social networks (for example, people connected by friendship links), communication networks (computers connected via communication links) and biological networks (for example, protein interaction networks). A recent focus in machine learning research has been to extend traditional machine learning classification techniques to classify nodes in such networks. In this article, we provide a brief introduction to this area of research and how it has progressed during the past decade. We introduce four of the most widely used inference algorithms for classifying networked data and empirically compare them on both synthetic and real-world data.}, bibtype = {article}, author = {Sen, Prithviraj and Namata, Galileo and Bilgic, Mustafa and Getoor, Lise and Galligher, Brian and Eliassi-Rad, Tina}, doi = {10.1609/aimag.v29i3.2157}, journal = {AI Magazine}, number = {3} }
@inproceedings{ title = {Kullback-Leibler divergence estimation of continuous distributions}, type = {inproceedings}, year = {2008}, keywords = {Approximation methods,Convergence,Density measurement,Entropy,Estimation,Exponential distribution,Random variables}, pages = {1666-1670}, month = {7}, id = {dc65cd2d-9034-3cc0-9c4e-834abe5b8c82}, created = {2022-03-28T09:45:03.234Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:05:19.472Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {perez-cruzKullbackLeiblerDivergenceEstimation2008}, source_type = {inproceedings}, notes = {ISSN: 2157-8117}, private_publication = {false}, abstract = {We present a method for estimating the KL divergence between continuous densities and we prove it converges almost surely. Divergence estimation is typically solved estimating the densities first. Our main result shows this intermediate step is unnecessary and that the divergence can be either estimated using the empirical cdf or k-nearest-neighbour density estimation, which does not converge to the true measure for finite k. The convergence proof is based on describing the statistics of our estimator using waiting-times distributions, as the exponential or Erlang. We illustrate the proposed estimators and show how they compare to existing methods based on density estimation, and we also outline how our divergence estimators can be used for solving the two-sample problem.}, bibtype = {inproceedings}, author = {Perez-Cruz, Fernando}, doi = {10.1109/ISIT.2008.4595271}, booktitle = {2008 IEEE International Symposium on Information Theory} }
@inproceedings{ title = {Articulated mesh animation from multi-view silhouettes}, type = {inproceedings}, year = {2008}, keywords = {deformation,motion capture}, pages = {1-9}, websites = {https://doi.org/10.1145/1399504.1360696}, month = {8}, publisher = {Association for Computing Machinery}, city = {New York, NY, USA}, series = {SIGGRAPH '08}, id = {51e28c1c-3df4-3cb4-8d77-b97b9cf5ea10}, created = {2022-03-28T09:45:05.610Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:22:12.694Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {vlasicArticulatedMeshAnimation2008}, source_type = {inproceedings}, private_publication = {false}, abstract = {Details in mesh animations are difficult to generate but they have great impact on visual quality. In this work, we demonstrate a practical software system for capturing such details from multi-view video recordings. Given a stream of synchronized video images that record a human performance from multiple viewpoints and an articulated template of the performer, our system captures the motion of both the skeleton and the shape. The output mesh animation is enhanced with the details observed in the image silhouettes. For example, a performance in casual loose-fitting clothes will generate mesh animations with flowing garment motions. We accomplish this with a fast pose tracking method followed by nonrigid deformation of the template to fit the silhouettes. The entire process takes less than sixteen seconds per frame and requires no markers or texture cues. Captured meshes are in full correspondence making them readily usable for editing operations including texturing, deformation transfer, and deformation model learning.}, bibtype = {inproceedings}, author = {Vlasic, Daniel and Baran, Ilya and Matusik, Wojciech and Popović, Jovan}, doi = {10.1145/1399504.1360696}, booktitle = {ACM SIGGRAPH 2008 papers} }
@inbook{ type = {inbook}, year = {2008}, keywords = {Control Point,Disparity Function,Principle Component Analysis,Stereo Match,Stereo Match Algorithm}, pages = {248-276}, websites = {https://doi.org/10.1007/978-1-84628-907-1_13}, publisher = {Springer}, city = {London}, id = {5323f3b0-8a1a-36e8-b2b5-89a419ce3823}, created = {2022-03-28T09:45:05.917Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:23:41.076Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {zhangSpacetimeFacesHighResolution2008}, source_type = {incollection}, short_title = {Spacetime Faces}, private_publication = {false}, bibtype = {inbook}, author = {Zhang, Li and Snavely, Noah and Curless, Brian and Seitz, Steven M}, editor = {Deng, Zhigang and Neumann, Ulrich}, doi = {10.1007/978-1-84628-907-1_13}, chapter = {Spacetime Faces: High-Resolution Capture for Modeling and Animation}, title = {Data-Driven 3D Facial Animation} }
@article{ title = {Learning informative point classes for the acquisition of object model maps}, type = {article}, year = {2008}, pages = {643-650}, id = {34cd0f71-7150-3de5-9ef5-a6f8873bbe08}, created = {2022-03-30T06:41:59.856Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T06:43:27.441Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a6fefa10-ad39-4ee5-850c-dcbd4fed6307}, private_publication = {false}, abstract = {This paper proposes a set of methods for building informative and robust feature point representations, used for accurately labeling points in a 3D point cloud, based on the type of surface the point is lying on. The feature space comprises a multi-value histogram which characterizes the local geometry around a query point, is pose and sampling density invariant, and can cope well with noisy sensor data. We characterize 3D geometric primitives of interest and describe methods for obtaining discriminating features used in a machine learning algorithm. To validate our approach, we perform an in-depth analysis using different classifiers and show results with both synthetically generated datasets and real-world scans. © 2008 IEEE.}, bibtype = {article}, author = {Rusu, Radu Bogdan and Marton, Zoltan Csaba and Blodow, Nico and Beetz, Michael}, doi = {10.1109/ICARCV.2008.4795593}, journal = {2008 10th International Conference on Control, Automation, Robotics and Vision, ICARCV 2008} }
@article{ title = {Self-similarity based compression of point set surfaces with application to ray tracing}, type = {article}, year = {2008}, keywords = {Coding and information theory-Data compaction and,Computational geometry and object modeling,Computer graphics,Curve,Solid and object representations,Surface}, pages = {221-234}, volume = {32}, id = {3e27843f-f467-3633-bd9d-8299526670e6}, created = {2023-05-03T13:16:39.380Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:27.072Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Hubo2008}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Many real-world, scanned surfaces contain repetitive structures, like bumps, ridges, creases, and so on. We present a compression technique that exploits self-similarity within a point-sampled surface. Our method replaces similar surface patches with an instance of a representative patch. We use a concise shape descriptor to identify and cluster similar patches. Decoding is achieved through simple instancing of the representative patches. Encoding is efficient, and can be applied to large data sets consisting of millions of points. Moreover, our technique offers random access to the compressed data, making it applicable to ray tracing, and easily allows for storing additional point attributes, like normals. © 2008 Elsevier Ltd. All rights reserved.}, bibtype = {article}, author = {Hubo, Erik and Mertens, Tom and Haber, Tom and Bekaert, Philippe}, doi = {10.1016/j.cag.2008.01.012}, journal = {Computers and Graphics (Pergamon)}, number = {2} }
@article{ title = {Toward an efficient triangle-based spherical harmonics representation of 3D objects}, type = {article}, year = {2008}, keywords = {Efficient and direct computation,Implicit surfaces,Mesh compression and transmission,Spherical harmonics,Spherical parameterization,Star-shaped objects}, pages = {561-575}, volume = {25}, id = {d66b9b57-e6fa-3924-8aa0-6c145d57d791}, created = {2023-05-03T13:16:39.564Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.869Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Mousa2008}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {In classical frequency-based surface decomposition, there is always a restriction about the genus number of the object to obtain the spherical harmonics decomposition of spherical functions representing these objects. Such spherical functions are intrinsically associated to star-shaped objects. In this paper, we present a new and efficient spherical harmonics decomposition for spherical functions defining 3D triangulated objects. Our results can be extended to any triangular object of any genus number after segmentation into star-shaped surface patches and recomposition of the results in the implicit framework. We demonstrate that the evaluation of the spherical harmonics coefficients can be performed by a Monte Carlo integration over the edges, which makes the computation more accurate and faster than previous techniques, and provides a better control over the precision error in contrast to the volumetric or surfacic voxel-based methods. We present several applications of our research, including fast spectral surface reconstruction from point clouds, surface compression, progressive transmission, local surface smoothing and interactive geometric texture transfer. © 2008 Elsevier B.V. All rights reserved.}, bibtype = {article}, author = {Mousa, M. H. and Chaine, R. and Akkouche, S. and Galin, E.}, doi = {10.1016/j.cagd.2008.06.004}, journal = {Computer Aided Geometric Design}, number = {8} }
@article{ title = {Spherical harmonics-based parametric deconvolution of 3D surface images using bending energy minimization}, type = {article}, year = {2008}, keywords = {Cell morphology,Deconvolution,Fluorescence microscopy,Image reconstruction,Image segmentation,Inverse problems,L-curve,Parametric deconvolution,Shape modelling,Shape parameterization,Spherical harmonics}, pages = {217-227}, volume = {12}, id = {9adde3c2-3b30-3800-bf3b-6e194d89cda3}, created = {2023-05-03T13:16:39.831Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:26.136Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Khairy2008}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {Numerical deconvolution of 3D fluorescence microscopy data yields sharper images by reversing the known optical aberrations introduced during the acquisition process. When additional prior information such as the topology and smoothness of the imaged object surface is available, the deconvolution can be performed by fitting a parametric surface directly to the image data. In this work, we incorporate such additional information into the deconvolution process and focus on a parametric shape description suitable for the study of organelles, cells and tissues. Such membrane-bound closed biological surfaces are often topologically equivalent to the sphere and can be parameterized as series expansions in spherical harmonic functions (SH). Because image data are noisy and the SH-parameterization is prone to the formation of high curvatures even at low expansion orders, the parametric deconvolution problem is ill-posed and must be regularized. We use the shape bending energy as a regularizing (smoothing) function, and determine the regularization parameter graphically with the help of the L-curve method. We demonstrate the complete deconvolution scheme, including the initial image segmentation, the calculation of a good starting surface and the construction of the L-curve, using real and synthetic image data. © 2007 Elsevier B.V. All rights reserved.}, bibtype = {article}, author = {Khairy, Khaled and Howard, Jonathon}, doi = {10.1016/j.media.2007.10.005}, journal = {Medical Image Analysis}, number = {2} }
@article{ title = {View invariant gesture recognition using 3D motion primitives}, type = {article}, year = {2008}, pages = {797-800}, volume = {2}, id = {7343ddb0-a3f7-35a1-833c-b2c772728104}, created = {2023-05-03T13:16:40.528Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-03T13:19:01.262Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, bibtype = {article}, author = {Holte, M. B. and Moeslund, T. B.} }
@article{ title = {Environmental effects on measurement uncertainties of time-of-flight cameras}, type = {article}, year = {2007}, pages = {113-116}, volume = {1}, id = {44d78a5a-5a2e-383a-a946-1e13ad1677e6}, created = {2020-11-05T09:10:48.220Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-05T09:11:03.946Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47}, private_publication = {false}, abstract = {In this paper the effect the environment has on the SwissRanger SR3000 Time-Of-Flight camera is investigated. The accuracy of this camera is highly affected by the scene it is pointed at: Such as the reflective properties, color and gloss. Also the complexity of the scene has considerable effects on the accuracy. To mention a few: The angle of the objects to the emitted light and the scattering effects of near objects. In this paper a general overview of known such inaccuracy factors are described, followed by experiments illustrating the additional uncertainty factors. Specifically we give a better description of how a surface color intensity influences the depth measurement, and illustrate how multiple reflections influence the resulting depth measurement. © 2007 IEEE.}, bibtype = {article}, author = {Guðmundsson, Sigurjón Árni and Aanæs, Henrik and Larsen, Rasmus}, doi = {10.1109/ISSCS.2007.4292664}, journal = {ISSCS 2007 - International Symposium on Signals, Circuits and Systems, Proceedings} }
@article{ title = {Improved Techniques for Grid Mapping With Rao-Blackwellized Particle Filters}, type = {article}, year = {2007}, pages = {34-46}, volume = {23}, websites = {http://ieeexplore.ieee.org/document/4084563/}, month = {2}, id = {a37edb9a-5613-3b08-ad1e-ebb442a1b016}, created = {2022-03-28T09:45:01.180Z}, accessed = {2022-02-18}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:01:47.339Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {grisettiImprovedTechniquesGrid2007}, source_type = {article}, private_publication = {false}, abstract = {Recently, Rao-Blackwellized particle filters have been introduced as an effective means to solve the simultaneous localization and mapping problem. This approach uses a particle filter in which each particle carries an individual map of the environment. Accordingly, a key question is how to reduce the number of particles. In this paper, we present adaptive techniques for reducing this number in a Rao-Blackwellized particle filter for learning grid maps. We propose an approach to compute an accurate proposal distribution taking into account not only the movement of the robot but also the most recent observation. This drastically decreases the uncertainty about the robot’s pose in the prediction step of the filter. Furthermore, we present an approach to selectively carry out resampling operations which seriously reduces the problem of particle depletion. Experimental results carried out with real mobile robots in large-scale indoor as well as in outdoor environments illustrate the advantages of our methods over previous approaches.}, bibtype = {article}, author = {Grisetti, Giorgio and Stachniss, Cyrill and Burgard, Wolfram}, doi = {10.1109/TRO.2006.889486}, journal = {IEEE Transactions on Robotics}, number = {1} }
@book{ title = {As-Rigid-As-Possible Surface Modeling}, type = {book}, year = {2007}, month = {1}, id = {2d8ff827-6de7-3e73-b405-e018e3315a24}, created = {2022-03-28T09:45:03.760Z}, file_attached = {false}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:03.760Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {sorkineAsRigidAsPossibleSurfaceModeling2007}, source_type = {book}, notes = {Journal Abbreviation: Symposium on Geometry Processing; Pages: 116; Publication Title: Symposium on Geometry Processing}, private_publication = {false}, abstract = {Modeling tasks, such as surface deformation and editing, can be analyzed by observing the local behavior of the surface. We argue that defining a modeling operation by asking for rigidity of the local transformations is useful in various settings. Such formulation leads to a non-linear, yet conceptually simple energy formulation, which is to be minimized by the deformed surface under particular modeling constraints. We devise a simple iterative mesh editing scheme based on this principle, that leads to detail-preserving and intuitive deformations. Our algorithm is effective and notably easy to implement, making it attractive for practical modeling applications.}, bibtype = {book}, author = {Sorkine, Olga and Alexa, Marc}, doi = {10.1145/1281991.1282006} }
@article{ title = {Efficient spherical harmonics representation of 3D objects}, type = {article}, year = {2007}, pages = {248-255}, id = {50580e58-4085-3d0f-8b81-02ed4a076a7e}, created = {2023-04-24T07:38:01.540Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-04-24T15:41:57.331Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Mousa2007}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143,f4d8f1ef-fdcb-4a5b-a626-6e2fea47fb6d}, private_publication = {false}, abstract = {In this paper, we present a new and efficient spherical harmonics decomposition for spherical functions defining 3D triangulated objects. Such spherical functions are intrinsically associated to star-shaped objects. However, our results can be extended to any triangular object after segmentation into star-shaped surface patches and recomposition of the results in the implicit framework. There is thus no restriction about the genus number of the object. We demonstrate that the evaluation of the spherical harmonics coefficients can be performed by a Monte Carlo integration over the edges, which makes the computation more accurate and faster than previous techniques, and provides a better control over the precision error in contrast to the voxel-based methods. We present several applications of our research, including fast spectral surface reconstruction from point clouds, local surface smoothing and interactive geometric texture transfer. © 2007 IEEE.}, bibtype = {article}, author = {Mousa, M. and Chaine, R. and Akkouche, S. and Galin, E.}, doi = {10.1109/PG.2007.19}, journal = {Proceedings - Pacific Conference on Computer Graphics and Applications} }
@article{ title = {A spectral approach to shape-based retrieval of articulated 3D models}, type = {article}, year = {2007}, keywords = {3D shape retrieval,Bending invariance,Geodesic distance,Graph distance,Shape descriptor,Spectral embedding}, pages = {398-407}, volume = {39}, id = {9e0af5d0-7e8f-3e39-b084-fe7d9d98f8e0}, created = {2023-05-03T13:16:39.076Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:27.223Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Jain2007}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {We present an approach for robust shape retrieval from databases containing articulated 3D models. Each shape is represented by the eigenvectors of an appropriately defined affinity matrix, forming a spectral embedding which achieves normalization against rigid-body transformations, uniform scaling, and shape articulation (i.e., bending). Retrieval is performed in the spectral domain using global shape descriptors. On the McGill database of articulated 3D shapes, the spectral approach leads to an absolute improvement in retrieval performance for both the spherical harmonic and the light field shape descriptors. The best retrieval results are obtained using a simple and novel eigenvalue-based descriptor we propose. © 2007 Elsevier Ltd. All rights reserved.}, bibtype = {article}, author = {Jain, Varun and Zhang, Hao}, doi = {10.1016/j.cad.2007.02.009}, journal = {CAD Computer Aided Design}, number = {5} }
@article{ title = {Generalized multidimensional scaling: A framework for isometry-invariant partial matching}, type = {article}, year = {2006}, keywords = {Gromov-Hausdorff distance,Isometric embedding,Iterative-closest-point,Partial embedding}, pages = {1168-1172}, volume = {103}, id = {5be5657a-edcb-3643-bf02-63d625e6dcaa}, created = {2021-08-28T19:32:57.316Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-29T21:49:17.118Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {An efficient algorithm for isometry-invariant matching of surfaces is presented. The key idea is computing the minimum-distortion mapping between two surfaces. For this purpose, we introduce the generalized multidimensional scaling, a computationally efficient continuous optimization algorithm for finding the least distortion embedding of one surface into another. The generalized multidimensional scaling algorithm allows for both full and partial surface matching. As an example, it is applied to the problem of expression-invariant three-dimensional face recognition. © 2006 by The National Academy of Sciences of the USA.}, bibtype = {article}, author = {Bronstein, Alexander M. and Bronstein, Michael M. and Kimmel, Ron}, doi = {10.1073/pnas.0508601103}, journal = {Proceedings of the National Academy of Sciences of the United States of America}, number = {5} }
@article{ title = {FST-based reconstruction of SB-models from non-uniformly sampled datasets on the sphere}, type = {article}, year = {2006}, volume = {2006}, id = {832a4c1a-c42a-3889-851d-ae6c2b5209d2}, created = {2023-05-03T13:16:40.471Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:25.552Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Tosic2006}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, abstract = {This paper proposes a new method for reconstruction of star-shaped 3D surfaces from scattered datasets, where such surfaces are considered as signals living in the space of square integrable functions on the unit sphere. We first propose a generalization of the Fourier transform on the sphere. A practical reconstruction method is then presented, which interpolates a spherical signal on an equiangular grid, from non-uniformly sampled dataset representing a 3D point cloud. The experiments show that the proposed interpolation method results in smoother surfaces and higher reconstruction PSNRs than the nearest neighbor interpolation method.}, bibtype = {article}, author = {Tosic, Ivana and Frossard, Pascal}, journal = {25th PCS Proceedings: Picture Coding Symposium 2006, PCS2006} }
@article{ title = {On the normal vector estimation for point cloud data from smooth surfaces}, type = {article}, year = {2005}, keywords = {Directional tangent vectors,Local Voronoi mesh,Normal vector,Point cloud data,Voronoi diagram}, pages = {1071-1079}, volume = {37}, id = {b72475e3-06a7-3a40-86ea-488640ec17f0}, created = {2021-01-26T07:00:26.494Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-11T08:19:13.736Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {Reliable estimation of the normal vector at a discrete data point in a scanned cloud data set is essential to the correct implementation of modern CAD/CAM technologies when the continuous CAD model representation is not available. A new method based on fitted directional tangent vectors at the data point has been developed to determine its normal vector. A local Voronoi mesh, based on the 3D Voronoi diagram and the proposed mesh growing heuristic rules, is first created to identify the neighboring points that characterize the local geometry. These local Voronoi mesh neighbors are used to fit a group of quadric curves through which the directional tangent vectors are obtained. The normal vector is then determined by minimizing the variance of the dot products between a normal vector candidate and the associated directional tangent vectors. Implementation results from extensive simulated and practical point cloud data sets have demonstrated that the present method is robust and estimates normal vectors with reliable consistency in comparison with the existing plane fitting, quadric surface fitting, triangle-based area weighted average, and triangle-based angle weighted average methods. © 2004 Elsevier Ltd. All rights reserved.}, bibtype = {article}, author = {Ouyang, Daoshan and Feng, Hsi Yung}, doi = {10.1016/j.cad.2004.11.005}, journal = {CAD Computer Aided Design}, number = {10} }
@article{ title = {Vision sensor planning for 3-D model acquisition}, type = {article}, year = {2005}, keywords = {3-D modeling,Model acquisition,Sensor placement,Surface prediction,Trend surface,Viewpoint planning,Vision sensor}, pages = {894-904}, volume = {35}, id = {0195640a-9ac5-3567-afac-a4e8eab13e97}, created = {2021-02-09T17:05:47.032Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T17:05:52.955Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {aac2e051-6f93-420f-a008-16b02ef2a9d1}, private_publication = {false}, abstract = {A novel method is proposed in this paper for automatic acquisition of three-dimensional (3-D) models of unknown objects by an active vision system, in which the vision sensor is to be moved from one viewpoint to the next around the target to obtain its complete model. In each step, sensing parameters are determined automatically for incrementally building the 3-D target models. The method is developed by analyzing the target's trend surface, which is the regional feature of a surface for describing the global tendency of change. While previous approaches to trend analysis are usually focused on generating polynomial equations for interpreting regression surfaces in three dimensions, this paper proposes a new mathematical model for predicting the unknown area of the object surface. A uniform surface model is established by analyzing the surface curvatures. Furthermore, a criterion is defined to determine the exploration direction, and an algorithm is developed for determining the parameters of the next view. Implementation of the method is carried out to validate the proposed method. © 2005 IEEE.}, bibtype = {article}, author = {Chen, S. Y. and Li, Y. F.}, doi = {10.1109/TSMCB.2005.846907}, journal = {IEEE Transactions on Systems, Man, and Cybernetics, Part B: Cybernetics}, number = {5} }
@article{ title = {What Can Neural Networks Reason About?}, type = {article}, year = {2019}, pages = {13-19}, id = {2ece44ef-8968-3288-b780-7f18ad5ba293}, created = {2021-07-12T10:19:36.624Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T10:19:56.002Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {20ccb950-fef9-4ee1-800c-a60ba9f1df16}, private_publication = {false}, abstract = {Neural networks have succeeded in many reasoning tasks. Empirically, these tasks require specialized network structures, e.g., Graph Neural Networks (GNNs) perform well on many such tasks, but less structured networks fail. Theoretically, there is limited understanding of why and when a network structure generalizes better than others, although they have equal expressive power. In this paper, we develop a framework to characterize which reasoning tasks a network can learn well, by studying how well its computation structure aligns with the algorithmic structure of the relevant reasoning process. We formally define this algorithmic alignment and derive a sample complexity bound that decreases with better alignment. This framework offers an explanation for the empirical success of popular reasoning models, and suggests their limitations. As an example, we unify seemingly different reasoning tasks, such as intuitive physics, visual question answering, and shortest paths, via the lens of a powerful algorithmic paradigm, dynamic programming (DP). We show that GNNs align with DP and thus are expected to solve these tasks. On several reasoning tasks, our theory is supported by empirical results.}, bibtype = {article}, author = {Xu, Keyulu}, number = {January} }
@inproceedings{ title = {Improving Grid-based SLAM with Rao-Blackwellized Particle Filters by Adaptive Proposals and Selective Resampling}, type = {inproceedings}, year = {2005}, pages = {2432-2437}, websites = {https://ieeexplore.ieee.org/document/1570477/}, publisher = {IEEE}, city = {Barcelona, Spain}, id = {0bd7f738-a2a1-3353-96e3-5567ff7627f9}, created = {2022-03-28T09:45:00.802Z}, accessed = {2022-02-18}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:00:56.807Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {grisettiImprovingGridbasedSLAM2005}, source_type = {inproceedings}, private_publication = {false}, abstract = {Recently Rao-Blackwellized particle filters have been introduced as effective means to solve the simultaneous localization and mapping (SLAM) problem. This approach uses a particle filter in which each particle carries an individual map of the environment. Accordingly, a key question is how to reduce the number of particles. In this paper we present adaptive techniques to reduce the number of particles in a Rao-Blackwellized particle filter for learning grid maps. We propose an approach to compute an accurate proposal distribution taking into account not only the movement of the robot but also the most recent observation. This drastically decreases the uncertainty about the robot’s pose in the prediction step of the filter. Furthermore, we present an approach to selectively carry out re-sampling operations which seriously reduces the problem of particle depletion. Experimental results carried out with mobile robots in large-scale indoor as well as in outdoor environments illustrate the advantages of our methods over previous approaches.}, bibtype = {inproceedings}, author = {Grisetti, G and Stachniss, C and Burgard, W}, doi = {10.1109/ROBOT.2005.1570477}, booktitle = {Proceedings of the 2005 IEEE International Conference on Robotics and Automation} }
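The selective-resampling step described in the Grisetti et al. entry above is easy to illustrate: the effective sample size of the normalized particle weights is monitored, and resampling is triggered only when it drops below a threshold. The following Python sketch is a minimal, generic version; the N/2 threshold and systematic resampling are common choices, not necessarily the paper's exact settings.

import numpy as np

def effective_sample_size(weights):
    # weights: importance weights of the particle set (normalized inside)
    w = np.asarray(weights, dtype=float)
    w = w / w.sum()
    return 1.0 / np.sum(w ** 2)

def selective_resample(particles, weights, threshold_ratio=0.5, rng=None):
    # Systematic resampling, carried out only if N_eff < threshold_ratio * N.
    rng = np.random.default_rng() if rng is None else rng
    n = len(particles)
    w = np.asarray(weights, dtype=float)
    w = w / w.sum()
    if effective_sample_size(w) >= threshold_ratio * n:
        return particles, w                            # keep the current set
    positions = (rng.random() + np.arange(n)) / n      # stratified positions in [0, 1)
    idx = np.minimum(np.searchsorted(np.cumsum(w), positions), n - 1)
    return [particles[i] for i in idx], np.full(n, 1.0 / n)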
@inproceedings{ title = {SCAPE: shape completion and animation of people}, type = {inproceedings}, year = {2005}, keywords = {animation,deformations,morphing,synthetic actors}, pages = {408-416}, websites = {https://doi.org/10.1145/1186822.1073207}, month = {7}, publisher = {Association for Computing Machinery}, city = {New York, NY, USA}, series = {SIGGRAPH '05}, id = {435fb458-3a5a-3ef4-8bab-1a47d2706abe}, created = {2022-03-28T09:45:03.429Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:05:04.105Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {anguelovSCAPEShapeCompletion2005}, source_type = {inproceedings}, short_title = {SCAPE}, private_publication = {false}, abstract = {We introduce the SCAPE method (Shape Completion and Animation for PEople)---a data-driven method for building a human shape model that spans variation in both subject shape and pose. The method is based on a representation that incorporates both articulated and non-rigid deformations. We learn a pose deformation model that derives the non-rigid surface deformation as a function of the pose of the articulated skeleton. We also learn a separate model of variation based on body shape. Our two models can be combined to produce 3D surface models with realistic muscle deformation for different people in different poses, when neither appear in the training set. We show how the model can be used for shape completion --- generating a complete surface mesh given a limited set of markers specifying the target shape. We present applications of shape completion to partial view completion and motion capture animation. In particular, our method is capable of constructing a high-quality animated surface model of a moving person, with realistic muscle deformation, using just a single static scan and a marker motion capture sequence of the person.}, bibtype = {inproceedings}, author = {Anguelov, Dragomir and Srinivasan, Praveen and Koller, Daphne and Thrun, Sebastian and Rodgers, Jim and Davis, James}, doi = {10.1145/1186822.1073207}, booktitle = {ACM SIGGRAPH 2005 Papers} }
@article{ title = {Pose Estimation of Randomly Organized Stator Housings using Structured Light and Harmonic Shape Contexts}, type = {article}, year = {2005}, id = {8a136ed2-6a55-32c5-b932-8970cfce18ba}, created = {2023-05-03T13:16:40.670Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-05-09T14:17:25.282Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kirkegaard2005}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143}, private_publication = {false}, bibtype = {article}, author = {Kirkegaard, Jakob} }
@article{ title = {Parts-based 3D object classification}, type = {article}, year = {2004}, volume = {2}, id = {fd7292ee-517e-3f33-9b3a-25a5ac7a3c22}, created = {2021-01-25T08:45:25.226Z}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-28T08:25:31.017Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {This paper presents a parts-based method for classifying scenes of 3D objects into a set of pre-determined object classes. Working at the part level, as opposed to the whole object level, enables a more flexible class representation and allows scenes in which the query object is significantly occluded to be classified. In our approach, parts are extracted from training objects and grouped into part classes using a hierarchical clustering algorithm. Each part class is represented as a collection of semi-local shape features and can be used to perform part class recognition. A mapping from part classes to object classes is derived from the learned part classes and known object classes. At run-time, a 3D query scene is sampled, local shape features are computed, and the object class is determined using the learned part classes and the part-to-object mapping. The approach is demonstrated by classifying novel 3D scenes of vehicles into eight classes.}, bibtype = {article}, author = {Huber, Daniel and Kapuria, Anuj and Donamukkala, Raghavendra and Hebert, Martial}, doi = {10.1109/cvpr.2004.1315148}, journal = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@article{ title = {Using aromatese inhibitors to induce ovulation in breast Ca Survivors}, type = {article}, year = {2004}, pages = {73-83}, volume = {49}, id = {8f2bfc72-00ab-3de1-8584-efb88922c409}, created = {2021-07-26T12:19:39.681Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-26T12:19:43.885Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Mitwally2004}, private_publication = {false}, bibtype = {article}, author = {Mitwally, Mohamed F. and Casper, Robert F.}, journal = {Contemporary Ob/Gyn}, number = {1} }
@article{ title = {Deformation transfer for triangle meshes}, type = {article}, year = {2004}, keywords = {Animation,Correspondence,Deformations}, pages = {399-405}, volume = {23}, websites = {https://doi.org/10.1145/1015706.1015736}, month = {8}, id = {bcbbbd22-3f1d-3d46-8d0a-af969335a06a}, created = {2022-03-28T09:45:01.039Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:01:18.081Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {sumnerDeformationTransferTriangle2004}, source_type = {article}, private_publication = {false}, abstract = {Deformation transfer applies the deformation exhibited by a source triangle mesh onto a different target triangle mesh. Our approach is general and does not require the source and target to share the same number of vertices or triangles, or to have identical connectivity. The user builds a correspondence map between the triangles of the source and those of the target by specifying a small set of vertex markers. Deformation transfer computes the set of transformations induced by the deformation of the source mesh, maps the transformations through the correspondence from the source to the target, and solves an optimization problem to consistently apply the transformations to the target shape. The resulting system of linear equations can be factored once, after which transferring a new deformation to the target mesh requires only a backsubstitution step. Global properties such as foot placement can be achieved by constraining vertex positions. We demonstrate our method by retargeting full body key poses, applying scanned facial deformations onto a digital character, and remapping rigid and non-rigid animation sequences from one mesh onto another.}, bibtype = {article}, author = {Sumner, Robert W and Popović, Jovan}, doi = {10.1145/1015706.1015736}, journal = {ACM Transactions on Graphics}, number = {3} }
@article{ title = {Estimating Surface Normals in Noisy Point Cloud Data}, type = {article}, year = {2003}, pages = {322-328}, id = {ac34895d-3fda-3d11-9cbf-b133eb2ecd86}, created = {2020-11-16T10:05:24.730Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-15T06:56:02.498Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {6533efe4-7189-42a2-b4b6-a9f175595b19,71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {In this paper we describe and analyze a method based on local least square fitting for estimating the normals at all sample points of a point cloud data (PCD) set, in the presence of noise. We study the effects of neighborhood size, curvature, sampling density, and noise on the normal estimation when the PCD is sampled from a smooth curve in R2 or a smooth surface in R3 and noise is added. The analysis allows us to find the optimal neighborhood size using other local information from the PCD. Experimental results are also provided.}, bibtype = {article}, author = {Mitra, Niloy J. and Nguyen, An}, journal = {SCG '03: Proceedings of the nineteenth annual symposium on Computational geometry} }
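The local least-squares normal estimate analysed by Mitra and Nguyen is, in practice, usually computed as a plane fit to the k nearest neighbours of each point, i.e. the eigenvector of the smallest eigenvalue of the local covariance. A minimal sketch, assuming an (N, 3) NumPy point array and SciPy's KD-tree; the neighbourhood size k is the free parameter whose effect the paper studies.

import numpy as np
from scipy.spatial import cKDTree

def estimate_normals(points, k=16):
    # Fit a plane to the k nearest neighbours of each point; the normal is the
    # eigenvector of the smallest eigenvalue of the local 3x3 covariance.
    points = np.asarray(points, dtype=float)
    tree = cKDTree(points)
    _, idx = tree.query(points, k=k)
    normals = np.empty_like(points)
    for i, nbrs in enumerate(idx):
        q = points[nbrs] - points[nbrs].mean(axis=0)
        _, vecs = np.linalg.eigh(q.T @ q)   # eigenvalues in ascending order
        normals[i] = vecs[:, 0]             # unit normal (sign is arbitrary)
    return normals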
@article{ title = {Pollen flora of Pakistan- XXXIII. Buxaceae}, type = {article}, year = {2003}, keywords = {Buxaceae,Pollen morphology,Sarcococca saligna}, pages = {61-62}, volume = {32}, id = {37ff21dc-19b6-349d-9261-bad75adb96ea}, created = {2021-01-25T08:45:25.099Z}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-28T19:32:59.672Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, bibtype = {article}, author = {Perveen, Anjum and Qaiser, M.}, journal = {Bangladesh Journal of Botany}, number = {1} }
@article{ title = {View Planning for Automated 3D Object Reconstruction and Inspection}, type = {article}, year = {2003}, id = {a5b8e4a8-6991-390b-8c19-0a2a71346428}, created = {2021-01-25T14:53:33.637Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T17:05:53.090Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f,9632dd11-8528-4cbd-a987-565a32faa2de}, private_publication = {false}, bibtype = {article}, author = {Scott, William R. and Roth, Gerhard and Rivest, Jean François} }
@article{ title = {Next best view system in a 3D object modeling task}, type = {article}, year = {1999}, keywords = {3-d,modeling,next best view,sensor data fusion,sensor planning}, pages = {306-311}, id = {6be31844-2652-3d8e-b094-4a85588ab7d0}, created = {2021-02-09T17:05:47.136Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-12-05T14:13:52.485Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {30d6eff3-286d-4f81-8f91-64574a8fa4e9,4f36a0a5-b08a-4f70-b020-4daf83cb0507}, private_publication = {false}, abstract = {Sensor placement for 3D modeling is a growing area of computer vision and robotics. The objective of a sensor placement system is to make task-directed decisions for optimal pose selection. We propose a next best view solution to the sensor placement problem. Our algorithm computes the next best view by optimizing an objective function that measures the quantity of unknown information in each of a group of potential viewpoints. The potential views are either placed uniformly around the object or are calculated from the surface normals of the occupancy grid model. To initiate the collection of new data, the optimal pose is selected from the objective function calculation. The model is incrementally updated from the information acquired in each new view. This process terminates when the number of recovered voxels ceases to increase, yielding the final model. We tested two different algorithms on 8 objects of various complexity, including objects with simple concave, simple hole, and complex hole self-occlusions}, bibtype = {article}, author = {Wong, L.M. and Dumont, C. and Abidi, M.A.}, doi = {10.1109/cira.1999.810066} }
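The next-best-view objective described in the abstract above scores candidate viewpoints by how much unknown information they would reveal. A toy Python version, assuming a labelled occupancy grid and a fixed list of candidate positions, and deliberately ignoring occlusion (no ray casting through the grid, so this is only a rough illustration of the scoring idea, not the paper's algorithm):

import numpy as np

FREE, OCCUPIED, UNKNOWN = 0, 1, 2

def next_best_view(grid, origin, resolution, candidates, sensor_range):
    # Count UNKNOWN cells within sensor_range of each candidate viewpoint and
    # return the best-scoring candidate. Occlusion handling is omitted.
    unknown = np.argwhere(grid == UNKNOWN) * resolution + np.asarray(origin, dtype=float)
    best, best_score = None, -1
    for view in candidates:
        dists = np.linalg.norm(unknown - np.asarray(view, dtype=float), axis=1)
        score = int(np.count_nonzero(dists <= sensor_range))
        if score > best_score:
            best, best_score = view, score
    return best, best_score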
@article{ title = {Inductive Representation Learning on Large Graphs}, type = {article}, year = {2017}, id = {b13db5f3-f045-3bec-983d-d6b1dcb617d7}, created = {2021-08-20T05:22:59.052Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-07T08:57:34.752Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {dbd9a6d6-88f6-4a62-9acd-402fb473145a}, private_publication = {false}, bibtype = {article}, author = {Hamilton, William L. and Ying, Rex and Leskovec, Jure}, journal = {Advances in Neural Information Processing Systems} }
@article{ title = {Instant architecture}, type = {article}, year = {2003}, keywords = {architecture,building design,grammars,modeling}, pages = {669-677}, volume = {22}, websites = {https://doi.org/10.1145/882262.882324}, month = {7}, id = {c46fcf7f-22ee-3b70-afc5-6a656c00af69}, created = {2022-03-28T09:45:03.701Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:06:10.041Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {wonkaInstantArchitecture2003}, source_type = {article}, private_publication = {false}, abstract = {This paper presents a new method for the automatic modeling of architecture. Building designs are derived using split grammars, a new type of parametric set grammar based on the concept of shape. The paper also introduces an attribute matching system and a separate control grammar, which offer the flexibility required to model buildings using a large variety of different styles and design ideas. Through the adaptive nature of the design grammar used, the created building designs can either be generic or adhere closely to a specified goal, depending on the amount of data available.}, bibtype = {article}, author = {Wonka, Peter and Wimmer, Michael and Sillion, François and Ribarsky, William}, doi = {10.1145/882262.882324}, journal = {ACM Transactions on Graphics}, number = {3} }
@article{ title = {Real-time simultaneous localisation and mapping with a single camera}, type = {article}, year = {2003}, pages = {1403-1410}, volume = {2}, id = {05b75020-2ae9-3849-8bd0-7fd19dfd126c}, created = {2022-09-13T08:14:28.121Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-13T08:14:36.192Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, abstract = {Ego-motion estimation for an agile single camera moving through general, unknown scenes becomes a much more challenging problem when real-time performance is required rather than under the off-line processing conditions under which most successful structure from motion work has been achieved. This task of estimating camera motion from measurements of a continuously expanding set of self-mapped visual features is one of a class of problems known as Simultaneous Localisation and Mapping (SLAM) in the robotics community, and we argue that such real-time mapping research, despite rarely being camera-based, is more relevant here than off-line structure from motion methods due to the more fundamental emphasis placed on propagation of uncertainty. We present a top-down Bayesian framework for single-camera localisation via mapping of a sparse set of natural features using motion modelling and an information-guided active measurement strategy, in particular addressing the difficult issue of real-time feature initialisation via a factored sampling approach. Real-time handling of uncertainty permits robust localisation via the creation and active measurement of a sparse map of landmarks such that regions can be re-visited after periods of neglect and localisation can continue through periods when few features are visible. Results are presented of real-time localisation for a hand-waved camera with very sparse prior scene knowledge and all processing carried out on a desktop PC.}, bibtype = {article}, author = {Davison, Andrew J.}, doi = {10.1109/iccv.2003.1238654}, journal = {Proceedings of the IEEE International Conference on Computer Vision} }
@article{ title = {Rotation invariant spherical harmonic representation of 3D shape descriptors}, type = {article}, year = {2003}, keywords = {I36 [Computer Graphics],Methodology and Tech-niques}, pages = {156-165}, websites = {http://dl.acm.org/citation.cfm?id=882392}, id = {9feca97e-e5ab-3089-8c13-c3154f73dafa}, created = {2023-04-24T07:38:01.472Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-04-24T15:41:57.106Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Kazhdan2003}, folder_uuids = {4a65115c-c8d7-4bb4-831a-b057db051143,f4d8f1ef-fdcb-4a5b-a626-6e2fea47fb6d}, private_publication = {false}, abstract = {One of the challenges in 3D shape matching arises from the fact that in many applications, models should be considered to be the same if they differ by a rotation. Consequently, when comparing two models, a similarity metric implicitly provides the measure of similarity at the optimal alignment. Explicitly solving for the optimal alignment is usually impractical. So, two general methods have been proposed for addressing this issue: (1) Every model is represented using rotation invariant descriptors. (2) Every model is described by a rotation dependent descriptor that is aligned into a canonical coordinate system defined by the model. In this paper, we describe the limitations of canonical alignment and discuss an alternate method, based on spherical harmonics, for obtaining rotation invariant representations. We describe the properties of this tool and show how it can be applied to a number of existing, orientation dependent descriptors to improve their matching performance. The advantages of this tool are two-fold: First, it improves the matching performance of many descriptors. Second, it reduces the dimensionality of the descriptor, providing a more compact representation, which in turn makes comparing two models more efficient}, bibtype = {article}, author = {Kazhdan, Michael and Funkhouser, Thomas and Rusinkiewicz, Szymon}, journal = {Proceedings of the 2003 …} }
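The rotation-invariant representation of Kazhdan et al. collapses the spherical-harmonic coefficients of a spherical function to its per-degree energies, which are unchanged by rotations of the function. A minimal sketch, assuming the coefficients have already been computed and are held in a dict keyed by (degree, order):

import numpy as np

def sh_rotation_invariant_signature(coeffs, l_max):
    # coeffs[(l, m)]: spherical-harmonic coefficient for degree l, order m
    # (-l <= m <= l). The per-degree norms ||f_l|| form a rotation-invariant
    # signature of the underlying spherical function.
    sig = np.zeros(l_max + 1)
    for l in range(l_max + 1):
        energy = sum(abs(coeffs[(l, m)]) ** 2 for m in range(-l, l + 1))
        sig[l] = np.sqrt(energy)
    return sig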
@article{ title = {View planning for automated three-dimensional object reconstruction and inspection}, type = {article}, year = {2003}, keywords = {Object inspection,Object reconstruction,Range images,View planning}, pages = {64-96}, volume = {35}, id = {58a140b0-516e-3a72-990a-5acaafd73517}, created = {2023-07-07T07:37:19.599Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-07-07T07:37:24.900Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {e4a1ea0f-69ae-4053-94cc-503201fc6c67}, private_publication = {false}, abstract = {Laser scanning range sensors are widely used for high-precision, high-density three-dimensional (3D) reconstruction and inspection of the surface of physical objects. The process typically involves planning a set of views, physically altering the relative object-sensor pose, taking scans, registering the acquired geometric data in a common coordinate frame of reference, and finally integrating range images into a nonredundant model. Efficiencies could be achieved by automating or semiautomating this process. While challenges remain, there are adequate solutions to semiautomate the scan-register-integrate tasks. On the other hand, view planning remains an open problem-that is, the task of finding a suitably small set of sensor poses and configurations for specified reconstruction or inspection goals. This paper surveys and compares view planning techniques for automated 3D object reconstruction and inspection by means of active, triangulation-based range sensors. © 2003 ACM.}, bibtype = {article}, author = {Scott, William R. and Roth, Gerhard and Rivest, Jean François}, doi = {10.1145/641865.641868}, journal = {ACM Computing Surveys}, number = {1} }
@article{ title = {Geometry images}, type = {article}, year = {2002}, keywords = {remeshing,surface parametrization}, pages = {355-361}, id = {07419275-9479-3d54-8db7-24118f245de9}, created = {2022-01-07T06:48:45.534Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-07T06:48:49.890Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Gu2002}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a}, private_publication = {false}, abstract = {Surface geometry is often modeled with irregular triangle meshes. The process of remeshing refers to approximating such geometry using a mesh with (semi)-regular connectivity, which has advantages for many graphics applications. However, current techniques for remeshing arbitrary surfaces create only semi-regular meshes. The original mesh is typically decomposed into a set of disk-like charts, onto which the geometry is parametrized and sampled. In this paper, we propose to remesh an arbitrary surface onto a completely regular structure we call a geometry image. It captures geometry as a simple 2D array of quantized points. Surface signals like normals and colors are stored in similar 2D arrays using the same implicit surface parametrization - - texture coordinates are absent. To create a geometry image, we cut an arbitrary mesh along a network of edge paths, and parametrize the resulting single chart onto a square. Geometry images can be encoded using traditional image compression algorithms, such as wavelet-based coders. Copyright © 2002 by the Association for Computing Machinery, Inc.}, bibtype = {article}, author = {Gu, Xianfeng and Gortler, Steven J. and Hoppe, Hugues}, doi = {10.1145/566570.566589}, journal = {Proceedings of the 29th Annual Conference on Computer Graphics and Interactive Techniques, SIGGRAPH '02} }
@inproceedings{ title = {Geometry images}, type = {inproceedings}, year = {2002}, keywords = {remeshing,surface parametrization}, pages = {355-361}, websites = {https://doi.org/10.1145/566570.566589}, month = {7}, publisher = {Association for Computing Machinery}, city = {New York, NY, USA}, series = {SIGGRAPH '02}, id = {064071af-2c02-3f0f-9d45-60bac7e4c29e}, created = {2022-03-28T09:45:01.542Z}, accessed = {2022-03-27}, file_attached = {false}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:01.542Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {guGeometryImages2002}, source_type = {inproceedings}, private_publication = {false}, abstract = {Surface geometry is often modeled with irregular triangle meshes. The process of remeshing refers to approximating such geometry using a mesh with (semi)-regular connectivity, which has advantages for many graphics applications. However, current techniques for remeshing arbitrary surfaces create only semi-regular meshes. The original mesh is typically decomposed into a set of disk-like charts, onto which the geometry is parametrized and sampled. In this paper, we propose to remesh an arbitrary surface onto a completely regular structure we call a geometry image. It captures geometry as a simple 2D array of quantized points. Surface signals like normals and colors are stored in similar 2D arrays using the same implicit surface parametrization --- texture coordinates are absent. To create a geometry image, we cut an arbitrary mesh along a network of edge paths, and parametrize the resulting single chart onto a square. Geometry images can be encoded using traditional image compression algorithms, such as wavelet-based coders.}, bibtype = {inproceedings}, author = {Gu, Xianfeng and Gortler, Steven J and Hoppe, Hugues}, doi = {10.1145/566570.566589}, booktitle = {Proceedings of the 29th annual conference on Computer graphics and interactive techniques} }
@article{ title = {Cumulated Gain-based Evaluation of IR Techniques}, type = {article}, year = {2002}, pages = {2002}, volume = {20}, id = {06f09e69-a247-3f41-ba2b-2f0d8f106a77}, created = {2022-03-28T09:45:02.429Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:03:51.603Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {jarvelinCumulatedGainbasedEvaluation2002}, source_type = {article}, private_publication = {false}, abstract = {Modern large retrieval environments tend to overwhelm their users by their large output. Since all documents are not of equal relevance to their users, highly relevant documents should be identified and ranked first for presentation to the users. In order to develop IR techniques to this direction, it is necessary to develop evaluation approaches and methods that credit IR methods for their ability to retrieve highly relevant documents. This can be done by extending traditional evaluation methods, i.e., recall and precision based on binary relevance assessments, to graded relevance assessments. Alternatively, novel measures based on graded relevance assessments may be developed. This paper proposes three novel measures that compute the cumulative gain the user obtains by examining the retrieval result up to a given ranked position. The first one accumulates the relevance scores of retrieved documents along the ranked result list. The second one is similar but applies a discount factor on the relevance scores in order to devaluate late-retrieved documents. The third one computes the relative-to-the-ideal performance of IR techniques, based on the cumulative gain they are able to yield. The novel measures are defined and discussed and then their use is demonstrated in a case study using TREC data - sample system run results for 20 queries in TREC-7. As relevance base we used novel graded relevance assessments on a four-point scale. The test results indicate that the proposed measures credit IR methods for their ability to retrieve highly relevant documents and allow testing of statistical significance of effectiveness differences. The graphs based on the measures also provide insight into the performance of IR techniques and allow interpretation, e.g., from the user point of ...}, bibtype = {article}, author = {Järvelin, Kalervo and Kekäläinen, Jaana}, journal = {ACM Transactions on Information Systems} }
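The cumulated-gain measures of Järvelin and Kekäläinen can be sketched in a few lines of Python. Note the discount below is the now-common log2(rank + 1) variant rather than the paper's base-b discount applied only from rank b onward, so treat it as an illustration of the idea rather than the paper's exact formula.

import numpy as np

def dcg(relevances):
    # Discounted cumulative gain for a ranked list of graded relevance scores.
    rel = np.asarray(relevances, dtype=float)
    discounts = np.log2(np.arange(2, rel.size + 2))   # log2(rank + 1), rank starting at 1
    return float(np.sum(rel / discounts))

def ndcg(relevances):
    # Normalized DCG: divide by the DCG of the ideal (descending-sorted) ranking.
    ideal = dcg(sorted(relevances, reverse=True))
    return dcg(relevances) / ideal if ideal > 0 else 0.0

# Example usage with graded judgments on a 0-3 scale, in retrieved order:
# ndcg([3, 2, 3, 0, 1, 2]) is roughly 0.96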
@inbook{ type = {inbook}, year = {2001}, pages = {1-7}, volume = {1}, month = {2}, id = {a4396955-89c9-33f1-a7df-968e2fa4f550}, created = {2022-03-28T09:45:05.851Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:23:42.456Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {borsIntroductionRadialBasis2001}, source_type = {incollection}, private_publication = {false}, abstract = {In this paper we provide a short overview of the Radial Basis Functions (RBF), their properties, the motivations behind their use and some of their applications. RBF's have been employed for functional approximation in time-series modeling and in pattern classification. They have been shown to implement the Bayesian rule and to model any continuous input-output mapping. RBF's are embedded in a two-layer neural network topology. We present the physical and statistical significance of the elements composing the network. We introduce a few RBF training algorithms and we show how RBF networks can be used in real applications.}, bibtype = {inbook}, author = {Bors, Adrian}, chapter = {Introduction of the Radial Basis Function (RBF) Networks} }
@inproceedings{ title = {Laplacian Eigenmaps and Spectral Techniques for Embedding and Clustering}, type = {inproceedings}, year = {2001}, volume = {14}, websites = {https://proceedings.neurips.cc/paper/2001/hash/f106b7f99d2cb30c3db1c3cc0fde9ccb-Abstract.html}, publisher = {MIT Press}, id = {44959409-ea44-30b4-9174-2492e08e2b12}, created = {2022-03-28T09:45:06.590Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T07:59:30.533Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {belkinLaplacianEigenmapsSpectral2001}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Belkin, Mikhail and Niyogi, Partha}, booktitle = {Advances in Neural Information Processing Systems} }
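A minimal sketch of the Laplacian eigenmap embedding of Belkin and Niyogi: build a symmetrized k-NN graph with Gaussian (heat-kernel) weights, form the graph Laplacian, and embed with the smallest nontrivial generalized eigenvectors. It assumes the resulting graph is connected; sigma and n_neighbors are free parameters, not values from the paper.

import numpy as np
from scipy.spatial.distance import cdist
from scipy.linalg import eigh

def laplacian_eigenmap(X, n_components=2, n_neighbors=10, sigma=1.0):
    # X: (n, d) data matrix; returns an (n, n_components) embedding.
    d = cdist(X, X)
    W = np.exp(-d ** 2 / (2 * sigma ** 2))          # heat-kernel weights
    far = np.argsort(d, axis=1)[:, n_neighbors + 1:]  # drop all but self + k nearest
    for i, cols in enumerate(far):
        W[i, cols] = 0.0
    W = np.maximum(W, W.T)                          # symmetrize the k-NN graph
    np.fill_diagonal(W, 0.0)
    D = np.diag(W.sum(axis=1))
    L = D - W
    # Generalized eigenproblem L y = lambda D y; skip the constant eigenvector.
    _, vecs = eigh(L, D)
    return vecs[:, 1:n_components + 1]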
@article{ title = {A next-best-view system for autonomous 3-D object reconstruction}, type = {article}, year = {2000}, pages = {589-598}, volume = {30}, id = {1d513735-4130-30ac-aa21-ae407fe4676c}, created = {2021-02-09T17:05:47.089Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-03T10:14:32.980Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Banta2000}, folder_uuids = {5439d198-93d5-4603-a7ce-201d423f231e,15f05391-5ea9-4170-8df6-e4bd19deb4f3}, private_publication = {false}, abstract = {The focus of this paper is to design and implement a system capable of automatically reconstructing a prototype three-dimensional (3-D) model from a minimum number of range images of an object. Given an ideal 3-D object model, the system iteratively renders range and intensity images of the model from a specified position, assimilates the range information into a prototype model, and determines the sensor pose (position and orientation) from which an optimal amount of previously unrecorded information may be acquired. Reconstruction is terminated when the model meets a given threshold of accuracy. Such a system has applications in the context of robot navigation, manufacturing, or hazardous materials handling. The system has been tested successfully on several synthetic data models, and each set of results was found to be reasonably consistent with an intuitive human search. The number of views necessary to reconstruct an adequate 3-D prototype depends on the complexity of the object or scene and the initial data collected. The prototype models which the system recovers compare well with the ideal models. © 2000 IEEE.}, bibtype = {article}, author = {Banta, Joseph E. and Wong, Laurana M. and Dumont, Christophe and Abidi, Mongi A.}, doi = {10.1109/3468.867866}, journal = {IEEE Transactions on Systems, Man, and Cybernetics Part A:Systems and Humans.}, number = {5} }
@article{ title = {Deformable Shape Completion with Graph Convolutional Autoencoders supplementary material}, type = {article}, year = {2018}, pages = {1-7}, id = {9b22563e-41d7-3d8e-91cb-451d1fe43d64}, created = {2021-08-29T22:46:33.426Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-13T14:40:20.614Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Litany, Or and Bronstein, Alex and Bronstein, Michael and Makadia, Ameesh} }
@article{ title = {The Earth Mover's Distance as a Metric for Image Retrieval}, type = {article}, year = {2000}, pages = {99-121}, volume = {40}, month = {11}, id = {090bd48c-aa44-34bc-9d94-1506dfcb7123}, created = {2022-03-28T09:45:03.594Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:05:51.358Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {rubnerEarthMoverDistance2000}, source_type = {article}, private_publication = {false}, abstract = {We investigate the properties of a metric between two distributions, the Earth Mover's Distance (EMD), for content-based image retrieval. The EMD is based on the minimal cost that must be paid to transform one distribution into the other, in a precise sense, and was first proposed for certain vision problems by Peleg, Werman, and Rom. For image retrieval, we combine this idea with a representation scheme for distributions that is based on vector quantization. This combination leads to an image comparison framework that often accounts for perceptual similarity better than other previously proposed methods. The EMD is based on a solution to the transportation problem from linear optimization, for which efficient algorithms are available, and also allows naturally for partial matching. It is more robust than histogram matching techniques, in that it can operate on variable-length representations of the distributions that avoid quantization and other binning problems typical of histograms. When used to compare distributions with the same overall mass, the EMD is a true metric. In this paper we focus on applications to color and texture, and we compare the retrieval performance of the EMD with that of other distances.}, bibtype = {article}, author = {Rubner, Yossi and Tomasi, Carlo and Guibas, Leonidas}, doi = {10.1023/A:1026543900054}, journal = {International Journal of Computer Vision} }
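The Earth Mover's Distance of Rubner et al. is, in general, a transportation problem between arbitrary signatures; in the special case of two 1-D histograms on the same bins with equal total mass it reduces to the L1 distance between the cumulative distributions, which makes for a compact illustration of what the metric measures.

import numpy as np

def emd_1d(p, q):
    # EMD between two 1-D histograms on the same bins, with unit ground
    # distance between adjacent bins and equal (normalized) total mass.
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    p, q = p / p.sum(), q / q.sum()
    return float(np.abs(np.cumsum(p - q)).sum())

# Example: emd_1d([0, 1, 0], [0, 0, 1]) == 1.0 (all mass moves by one bin)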
@article{ title = {A solution to the next best view problem for automated surface acquisition}, type = {article}, year = {1999}, keywords = {Active vision,Automated surface acquisition,Model acquisition,Next best view,Range imaging,Reverse engineering,Sensor planning}, pages = {1016-1030}, volume = {21}, id = {029a1872-5990-3d3b-9865-f92aa30be7aa}, created = {2021-02-09T17:05:47.049Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-03T10:14:33.319Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Pito1999}, folder_uuids = {5439d198-93d5-4603-a7ce-201d423f231e,30d6eff3-286d-4f81-8f91-64574a8fa4e9,4f36a0a5-b08a-4f70-b020-4daf83cb0507}, private_publication = {false}, abstract = {A solution to the next best view (NBV) problem for automated surface acquisition is presented. The NBV problem is to determine which areas of a scanner's viewing volume need to be scanned to sample all of the visible surfaces of an a priori unknown object and where to position/control the scanner to sample them. It is argued that solutions to the NBV problem are constrained by the other steps in a surface acquisition system and by the range scanner's particular sampling physics. A method for determining the unscanned areas of the viewing volume is presented. In addition, a novel representation, positional space (PS), is presented which facilitates a solution to the NBV problem by representing what must be and what can be scanned in a single data structure. The number of costly computations needed to determine if an area of the viewing volume would be occluded from some scanning position is decoupled from the number of positions considered for the NBV, thus reducing the computational cost of choosing one. An automated surface acquisition system designed to scan all visible surfaces of an a priori unknown object is demonstrated on real objects. © 1999 IEEE.}, bibtype = {article}, author = {Pito, Richard}, doi = {10.1109/34.799908}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, number = {10} }
@article{ title = {Frontier-based approach for autonomous exploration}, type = {article}, year = {1997}, pages = {146-151}, id = {0edbe504-3eff-39d3-a829-aabc4977d90b}, created = {2021-02-09T17:05:47.178Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T17:06:00.788Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {100c3365-eb8c-439a-84c4-0ee26856acca}, private_publication = {false}, abstract = {We introduce a new approach for exploration based on the concept of frontiers, regions on the boundary between open space and unexplored space. By moving to new frontiers, a mobile robot can extend its map into new territory until the entire environment has been explored. We describe a method for detecting frontiers in evidence grids and navigating to these frontiers. We also introduce a technique for minimizing specular reflections in evidence grids using laser-limited sonar. We have tested this approach with a real mobile robot, exploring real-world office environments cluttered with a variety of obstacles. An advantage of our approach is its ability to explore both large open spaces and narrow cluttered spaces, with walls and obstacles in arbitrary orientations.}, bibtype = {article}, author = {Yamauchi, Brian}, doi = {10.1109/cira.1997.613851}, journal = {Proceedings of IEEE International Symposium on Computational Intelligence in Robotics and Automation, CIRA} }
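Frontier detection in Yamauchi's sense amounts to finding the free cells that border unexplored space in the occupancy grid; the robot then navigates toward the nearest frontier region. A sketch of the detection step only, on a 2-D grid with 4-connectivity (the grouping of frontier cells into regions and the navigation step are omitted):

import numpy as np

FREE, OCCUPIED, UNKNOWN = 0, 1, -1

def frontier_cells(grid):
    # Return (row, col) indices of free cells adjacent to at least one unknown
    # cell in a 2-D occupancy grid (4-connectivity).
    frontiers = []
    rows, cols = grid.shape
    for r in range(rows):
        for c in range(cols):
            if grid[r, c] != FREE:
                continue
            for dr, dc in ((1, 0), (-1, 0), (0, 1), (0, -1)):
                nr, nc = r + dr, c + dc
                if 0 <= nr < rows and 0 <= nc < cols and grid[nr, nc] == UNKNOWN:
                    frontiers.append((r, c))
                    break
    return frontiers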
@article{ title = {Long Short-Term Memory}, type = {article}, year = {1997}, pages = {1735-1780}, volume = {9}, id = {1646fb3e-34c1-3e6d-9944-f974a4e35e6d}, created = {2021-07-12T14:15:35.380Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T14:17:15.045Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {85ed9c29-c272-40dc-a01a-f912101de83a}, private_publication = {false}, abstract = {Learning to store information over extended time intervals by recurrent backpropagation takes a very long time, mostly because of insufficient, decaying error backflow. We briefly review Hochreiter's (1991) analysis of this problem, then address it by introducing a novel, efficient, gradient-based method called long short-term memory (LSTM). Truncating the gradient where this does not do harm, LSTM can learn to bridge minimal time lags in excess of 1000 discrete-time steps by enforcing constant error flow through constant error carousels within special units. Multiplicative gate units learn to open and close access to the constant error flow. LSTM is local in space and time; its computational complexity per time step and weight is O(1). Our experiments with artificial data involve local, distributed, real-valued, and noisy pattern representations. In comparisons with real-time recurrent learning, back propagation through time, recurrent cascade correlation, Elman nets, and neural sequence chunking, LSTM leads to many more successful runs, and learns much faster. LSTM also solves complex, artificial long-time-lag tasks that have never been solved by previous recurrent network algorithms.}, bibtype = {article}, author = {Hochreiter, Sepp and Schmidhuber, Jürgen}, doi = {10.1162/neco.1997.9.8.1735}, journal = {Neural Computation}, number = {8} }
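A single step of an LSTM cell makes the constant-error-carousel idea in the abstract concrete. The sketch below is the now-standard formulation with a forget gate (a later addition, not part of the original 1997 cell), with the gate weights stacked into one matrix:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x, h_prev, c_prev, W, U, b):
    # One LSTM step. x: input (d,); h_prev, c_prev: previous hidden/cell state (n,);
    # W: (4n, d), U: (4n, n), b: (4n,), stacked in gate order [i, f, o, g].
    n = h_prev.size
    z = W @ x + U @ h_prev + b
    i = sigmoid(z[:n])            # input gate
    f = sigmoid(z[n:2 * n])       # forget gate
    o = sigmoid(z[2 * n:3 * n])   # output gate
    g = np.tanh(z[3 * n:])        # candidate cell state
    c = f * c_prev + i * g        # cell state update (constant error carousel)
    h = o * np.tanh(c)            # new hidden state
    return h, c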
@article{ title = {Long Short-Term Memory}, type = {article}, year = {1997}, pages = {1735-1780}, volume = {9}, month = {11}, id = {f8adee4e-2bf5-3903-b3ce-2dfa357655a5}, created = {2022-03-28T09:45:01.396Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:01:39.518Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {hochreiterLongShortTermMemory1997}, source_type = {article}, notes = {Conference Name: Neural Computation}, private_publication = {false}, abstract = {Learning to store information over extended time intervals by recurrent backpropagation takes a very long time, mostly because of insufficient, decaying error backflow. We briefly review Hochreiter's (1991) analysis of this problem, then address it by introducing a novel, efficient, gradient based method called long short-term memory (LSTM). Truncating the gradient where this does not do harm, LSTM can learn to bridge minimal time lags in excess of 1000 discrete-time steps by enforcing constant error flow through constant error carousels within special units. Multiplicative gate units learn to open and close access to the constant error flow. LSTM is local in space and time; its computational complexity per time step and weight is O(1). Our experiments with artificial data involve local, distributed, real-valued, and noisy pattern representations. In comparisons with real-time recurrent learning, back propagation through time, recurrent cascade correlation, Elman nets, and neural sequence chunking, LSTM leads to many more successful runs, and learns much faster. LSTM also solves complex, artificial long-time-lag tasks that have never been solved by previous recurrent network algorithms.}, bibtype = {article}, author = {Hochreiter, Sepp and Schmidhuber, Jürgen}, doi = {10.1162/neco.1997.9.8.1735}, journal = {Neural Computation}, number = {8} }
@inproceedings{ title = {Learning and recognizing human dynamics in video sequences}, type = {inproceedings}, year = {1997}, keywords = {Context modeling,Delay,Hidden Markov models,Humans,Image segmentation,Leg,Motion detection,Speech recognition,Training data,Video sequences}, pages = {568-574}, month = {6}, id = {674caca6-26f2-3bb4-9a90-f2a4babafcfd}, created = {2022-03-28T09:45:03.778Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:05:56.361Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {breglerLearningRecognizingHuman1997}, source_type = {inproceedings}, notes = {ISSN: 1063-6919}, private_publication = {false}, abstract = {This paper describes a probabilistic decomposition of human dynamics at multiple abstractions, and shows how to propagate hypotheses across space, time, and abstraction levels. Recognition in this framework is the succession of very general low level grouping mechanisms to increasingly specific and learned model based grouping techniques at higher levels. Hard decision thresholds are delayed and resolved by higher level statistical models and temporal context. Low-level primitives are areas of coherent motion found by EM clustering, mid-level categories are simple movements represented by dynamical systems, and high-level complex gestures are represented by Hidden Markov Models as successive phases of simple movements. We show how such a representation can be learned from training data, and apply it to the example of human gait recognition.}, bibtype = {inproceedings}, author = {Bregler, C}, doi = {10.1109/CVPR.1997.609382}, booktitle = {Proceedings of IEEE Computer Society Conference on Computer Vision and Pattern Recognition} }
@inproceedings{ title = {Surface simplification using quadric error metrics}, type = {inproceedings}, year = {1997}, keywords = {level of detail,mutiresolution modeling,non-manifold,pair contraction,surface simplification}, pages = {209-216}, websites = {https://doi.org/10.1145/258734.258849}, month = {8}, publisher = {ACM Press/Addison-Wesley Publishing Co.}, city = {USA}, series = {SIGGRAPH '97}, id = {32033ba4-b3db-3e80-b8b2-d2f2daa095c6}, created = {2022-03-28T09:45:06.024Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:23:38.248Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {garlandSurfaceSimplificationUsing1997}, source_type = {inproceedings}, private_publication = {false}, bibtype = {inproceedings}, author = {Garland, Michael and Heckbert, Paul S}, doi = {10.1145/258734.258849}, booktitle = {Proceedings of the 24th annual conference on Computer graphics and interactive techniques} }
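The quadric error metric of Garland and Heckbert accumulates, per vertex, a 4x4 matrix that summarizes squared distances to the planes of incident faces; the cost of an edge contraction is the quadric error of the summed quadrics evaluated at the target position. A minimal sketch of the two ingredients (the optimal-target-position solve and the contraction queue are omitted):

import numpy as np

def face_quadric(p0, p1, p2):
    # 4x4 quadric Kp = n n^T for the plane of triangle (p0, p1, p2), where
    # n = (a, b, c, d) with ax + by + cz + d = 0 and (a, b, c) the unit normal.
    normal = np.cross(p1 - p0, p2 - p0)
    normal = normal / np.linalg.norm(normal)
    d = -normal @ p0
    n = np.append(normal, d)
    return np.outer(n, n)

def quadric_error(Q, v):
    # v^T Q v for homogeneous position v = (x, y, z, 1): the accumulated sum of
    # squared distances to the planes represented by Q.
    vh = np.append(v, 1.0)
    return float(vh @ Q @ vh)

# Per-vertex quadrics are the sums of the quadrics of incident faces; the cost
# of contracting an edge (v1, v2) to v_target is quadric_error(Q1 + Q2, v_target).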
@article{ title = {A sensor-based solution to the "next best view" problem}, type = {article}, year = {1996}, pages = {941-945}, volume = {1}, id = {fd276aa8-afc0-3c24-b5c4-36e32c72f986}, created = {2021-02-09T17:05:47.142Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-29T15:21:59.232Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {123597df-4c66-4b78-91d9-86570b388419}, private_publication = {false}, abstract = {Acquiring the complete surface geometry of an object using a range scanner invariably requires that multiple range images be taken of it from different viewpoints. An algorithm is presented which solves the "next best view" (NBV) problem: determine the next position for the range scanner given its previous scans of the object. As part of a complete surface acquisition system the scanner's next position should cause it not only to sample more of the object's surface but to resample part of the object already scanned to allow for the registration and integration of the new data with the previous scans. A novel representation, positional space, is presented which facilitates a solution to the NBV problem by representing what must be and what can be scanned in a unified data structure. The expensive operation of determining the visibility of part of the viewing volume is computed only once, not for each potential position of the scanner thus breaking the computational burden of choosing the NBV from a large number of positions. No assumptions are made about the geometry or topology of the object. The algorithm is self-terminating, will scan all visible surfaces of an object, and can be directed to resample surfaces which were scanned with low confidence. In addition, the algorithm will work with nearly any range camera and scanning setup. A completely automated surface acquisition system featuring the proposed NBV algorithm is demonstrated on a real object. © 1996 IEEE.}, bibtype = {article}, author = {Pito, Richard}, doi = {10.1109/ICPR.1996.546162}, journal = {Proceedings - International Conference on Pattern Recognition} }
@article{ title = {A Survey of Sensor Planning in Computer Vision}, type = {article}, year = {1995}, pages = {86-104}, volume = {11}, id = {428ca4ce-0bb8-37f5-8dc4-966735530030}, created = {2021-02-09T17:05:47.216Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-12-05T14:13:53.215Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Tarabanis1995}, folder_uuids = {7e061766-cf31-4201-b983-d0f31153c02e}, private_publication = {false}, abstract = {A survey of research in the area of vision sensor planning is presented. The problem can be summarized as follows: Given information about the environment (e.g., the object under observation, the available sensors) as well as information about the task that the vision system is to accomplish (i.e., detection of certain object features, object recognition, scene reconstruction, object manipulation), develop strategies to automatically determine sensor parameter values that achieve this task with a certain degree of satisfaction. With such strategies, sensor parameters values can be selected and can be purposefully changed in order to effectively perform the task at hand. Sensory systems are then able to operate more flexibly, autonomously, and reliably. This problem has recently become an active area of study with a number of researchers addressing various aspects of the problem. The focus here is on vision sensor planning for the task of robustly detecting object features. For this task, camera and illumination parameters such as position, orientation, and optical settings are determined so that object features are, for example, visible, in focus, within the sensor field of view, magnified as required, and imaged with sufficient contrast. References to, and a brief description of, representative sensing strategies for the tasks of object recognition and scene reconstruction are also presented. For these tasks, sensor configurations are sought that will prove most useful when trying to identify an object or reconstruct a scene. © 1995 IEEE}, bibtype = {article}, author = {Tarabanis, Konstantinos A. and Allen, Peter K.}, doi = {10.1109/70.345940}, journal = {IEEE Transactions on Robotics and Automation}, number = {1} }
@article{ title = {The Helmholtz Machine}, type = {article}, year = {1995}, pages = {889-904}, volume = {7}, websites = {https://doi.org/10.1162/neco.1995.7.5.889}, month = {9}, id = {a504df26-d4d4-3056-8201-3039c379ac11}, created = {2022-03-28T09:45:02.912Z}, accessed = {2022-03-26}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:04:08.522Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {dayanHelmholtzMachine1995}, source_type = {article}, private_publication = {false}, abstract = {Discovering the structure inherent in a set of patterns is a fundamental aim of statistical inference or learning. One fruitful approach is to build a parameterized stochastic generative model, independent draws from which are likely to produce the patterns. For all but the simplest generative models, each pattern can be generated in exponentially many ways. It is thus intractable to adjust the parameters to maximize the probability of the observed patterns. We describe a way of finessing this combinatorial explosion by maximizing an easily computed lower bound on the probability of the observations. Our method can be viewed as a form of hierarchical self-supervised learning that may relate to the function of bottom-up and top-down cortical processing pathways.}, bibtype = {article}, author = {Dayan, Peter and Hinton, Geoffrey E and Neal, Radford M and Zemel, Richard S}, doi = {10.1162/neco.1995.7.5.889}, journal = {Neural Computation}, number = {5} }
@article{ title = {The "wake-sleep" algorithm for unsupervised neural networks}, type = {article}, year = {1995}, pages = {1158-1161}, volume = {268}, websites = {https://www.sciencemag.org/lookup/doi/10.1126/science.7761831}, month = {5}, id = {1da58d7f-dd58-3089-8e04-e12da343b4bd}, created = {2022-03-28T09:45:03.899Z}, accessed = {2021-09-28}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:05:59.683Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {hintonWakesleepAlgorithmUnsupervised1995}, source_type = {article}, private_publication = {false}, abstract = {An unsupervised learning algorithm for a multilayer network of stochastic neurons is described. Bottom-up “recognition” connections convert the input into representations in successive hidden layers and top-down “generative” connections reconstruct the representation in one layer from the representation in the layer above. In the “wake” phase, neurons are driven by recognition connections, and generative connections are adapted to increase the probability that they would reconstruct the correct activity vector in the layer below. In the “sleep” phase, neurons are driven by generative connections and recognition connections are adapted to increase the probability that they would produce the correct activity vector in the layer above.}, bibtype = {article}, author = {Hinton, G and Dayan, P and Frey, B and Neal, R}, doi = {10.1126/science.7761831}, journal = {Science}, number = {5214} }
@article{ title = {US005283837A}, type = {article}, year = {1994}, id = {6348a18b-7429-3ad2-a214-e15f221ecbee}, created = {2021-04-15T08:24:37.789Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-15T08:24:41.568Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {ffa7aa64-dc15-4667-8778-6ff9b9800bbb}, private_publication = {false}, bibtype = {article}, author = {Surface, Estimation O F}, number = {19} }
@article{ title = {Occlusions as a Guide for Planning the Next View}, type = {article}, year = {1993}, volume = {15}, id = {376f4249-7ea0-3c28-8e45-a947b97612aa}, created = {2021-02-09T17:05:47.236Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-12-05T14:13:52.421Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {5439d198-93d5-4603-a7ce-201d423f231e,9d71d6f2-c899-455e-bae9-68567386e977}, private_publication = {false}, bibtype = {article}, author = {Maver, Jasna and Bajcsy, Ruzena}, number = {5} }
@book{ title = {Statistical Analysis of Spherical Data}, type = {book}, year = {1993}, keywords = {Mathematics / Probability \& Statistics / General,Science / Physics / Astrophysics,Science / Space Science / Astronomy}, month = {8}, publisher = {Cambridge University Press}, id = {773333ce-7e56-3de7-9706-334f8220afcc}, created = {2022-03-28T09:45:05.800Z}, file_attached = {false}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:05.800Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {fisherStatisticalAnalysisSpherical1993}, source_type = {book}, notes = {Google-Books-ID: yK4Quuq9tRgC}, private_publication = {false}, abstract = {This is the first comprehensive, yet clearly presented, account of statistical methods for analysing spherical data. The analysis of data, in the form of directions in space or of positions of points on a spherical surface, is required in many contexts in the earth sciences, astrophysics and other fields, yet the methodology required is disseminated throughout the literature. Statistical Analysis of Spherical Data aims to present a unified and up-to-date account of these methods for practical use. The emphasis is on applications rather than theory, with the statistical methods being illustrated throughout the book by data examples.}, bibtype = {book}, author = {Fisher, N I and Lewis, T and Embleton, B J J} }
@article{ title = {Surface reconstruction from unorganized points}, type = {article}, year = {1992}, pages = {71-78}, volume = {26}, id = {2352009b-57d1-3d7c-9a42-a24bd2b21542}, created = {2021-10-15T12:44:59.955Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-15T12:45:06.752Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, abstract = {We describe and demonstrate an algorithm that takes as input an unorganized set of points $\{x_1, \ldots, x_n\} \subset \mathbb{R}^3$ on or near an unknown manifold $M$, and produces as output a simplicial surface that approximates $M$. Neither the topology, the presence of boundaries, nor the geometry of $M$ are assumed to be known in advance - all are inferred automatically from the data. This problem naturally arises in a variety of practical situations such as range scanning an object from multiple view points, recovery of biological shapes from two-dimensional slices, and interactive surface sketching.}, bibtype = {article}, author = {Hoppe, Hugues and DeRose, Tony and Duchamp, Tom and McDonald, John and Stuetzle, Werner}, doi = {10.1145/142920.134011}, journal = {Computer Graphics (ACM)}, number = {2} }
@article{ title = {Nonlinear principal component analysis using autoassociative neural networks}, type = {article}, year = {1991}, pages = {233-243}, volume = {37}, websites = {https://onlinelibrary.wiley.com/doi/abs/10.1002/aic.690370209}, id = {38266409-055c-3002-a8ff-8d70864bd79b}, created = {2022-03-28T09:45:04.380Z}, accessed = {2021-09-23}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:07:13.711Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {kramerNonlinearPrincipalComponent1991}, source_type = {article}, notes = {\_eprint: https://onlinelibrary.wiley.com/doi/pdf/10.1002/aic.690370209}, private_publication = {false}, abstract = {Nonlinear principal component analysis is a novel technique for multivariate data analysis, similar to the well-known method of principal component analysis. NLPCA, like PCA, is used to identify and remove correlations among problem variables as an aid to dimensionality reduction, visualization, and exploratory data analysis. While PCA identifies only linear correlations between variables, NLPCA uncovers both linear and nonlinear correlations, without restriction on the character of the nonlinearities present in the data. NLPCA operates by training a feedforward neural network to perform the identity mapping, where the network inputs are reproduced at the output layer. The network contains an internal “bottleneck” layer (containing fewer nodes than input or output layers), which forces the network to develop a compact representation of the input data, and two additional hidden layers. The NLPCA method is demonstrated using time-dependent, simulated batch reaction data. Results show that NLPCA successfully reduces dimensionality and produces a feature space map resembling the actual distribution of the underlying system parameters.}, bibtype = {article}, author = {Kramer, Mark A}, doi = {10.1002/aic.690370209}, journal = {AIChE Journal}, number = {2} }
@inproceedings{ title = {An Implementation of the “algorithme à trous” to Compute the Wavelet Transform}, type = {inproceedings}, year = {1990}, keywords = {Analyse Wavelet,Digital Signal Processor,Finite Impulse Response,Finite Impulse Response Filter,Side Lobe}, pages = {298-304}, publisher = {Springer}, city = {Berlin, Heidelberg}, series = {inverse problems and theoretical imaging}, id = {5914c95f-5b85-3efe-91be-2782794623ef}, created = {2022-03-28T09:45:04.708Z}, file_attached = {false}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:04.708Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {dutilleuxImplementationAlgorithmeTrous1990}, source_type = {inproceedings}, private_publication = {false}, abstract = {The computation of the wavelet transform involves the computation of the convolution product of the signal to be analysed by the analysing wavelet. It will be shown that the computation load grows with the scale factor of the analysis. We are interested in musical sounds lasting a few seconds. Using a straightforward algorithm leads to a prohibitive computation time, so we need a more effective computation procedure.}, bibtype = {inproceedings}, author = {Dutilleux, P}, editor = {Combes, Jean-Michel and Grossmann, Alexander and Tchamitchian, Philippe}, doi = {10.1007/978-3-642-75988-8_29}, booktitle = {Wavelets} }
@article{ title = {The multilayer perceptron as an approximation to a Bayes optimal discriminant function}, type = {article}, year = {1990}, keywords = {Backpropagation,Bayesian methods,Books,Image analysis,Multi-layer neural network,Multilayer perceptrons,Neural networks,Pattern recognition,Probability density function}, pages = {296-298}, volume = {1}, month = {12}, id = {b53eb762-6432-35f8-99e9-d1605f6ec132}, created = {2022-03-28T09:45:06.402Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T07:59:13.588Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {ruckMultilayerPerceptronApproximation1990}, source_type = {article}, notes = {Conference Name: IEEE Transactions on Neural Networks}, private_publication = {false}, abstract = {The multilayer perceptron, when trained as a classifier using backpropagation, is shown to approximate the Bayes optimal discriminant function. The result is demonstrated for both the two-class problem and multiple classes. It is shown that the outputs of the multilayer perceptron approximate the a posteriori probability functions of the classes being trained. The proof applies to any number of layers and any type of unit activation function, linear or nonlinear.}, bibtype = {article}, author = {Ruck, D W and Rogers, S K and Kabrisky, M and Oxley, M E and Suter, B W}, doi = {10.1109/72.80266}, journal = {IEEE Transactions on Neural Networks}, number = {4} }
@inbook{ type = {inbook}, year = {1987}, pages = {661-670}, websites = {https://www.sciencedirect.com/science/article/pii/B9780080515816500647}, month = {1}, publisher = {Morgan Kaufmann}, city = {San Francisco (CA)}, id = {00cf7fe6-2e19-3448-a17e-4a24c7a2d553}, created = {2022-03-28T09:45:04.775Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T08:07:53.493Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {barrGLOBALLOCALDEFORMATIONS1987}, source_type = {incollection}, private_publication = {false}, abstract = {New hierarchical solid modeling operations are developed, which simulate twisting, bending, tapering, or similar transformations of geometric objects. The chief result is that the normal vector of an arbitrarily deformed smooth surface can be calculated directly from the surface normal vector of the undeformed surface and a transformation matrix. Deformations are easily combined in a hierarchical structure, creating complex objects from simpler ones. The position vectors and normal vectors in the simpler objects are used to calculate the position and normal vectors in the more complex forms; each level in the deformation hierarchy requires an additional matrix multiply for the normal vector calculation. Deformations are important and highly intuitive operations which ease the control and rendering of large families of three-dimensional geometric shapes.}, bibtype = {inbook}, author = {Barr, Alan H}, editor = {Fischler, Martin A and Firschein, Oscar}, doi = {10.1016/B978-0-08-051581-6.50064-7}, chapter = {GLOBAL AND LOCAL DEFORMATIONS OF SOLID PRIMITIVES}, title = {Readings in Computer Vision} }
@article{ title = {The Determination of Next Best Views}, type = {article}, year = {1985}, pages = {432-435}, id = {f1edb373-9a76-3345-b924-16e061400395}, created = {2021-02-09T17:05:47.242Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-26T12:19:40.270Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {C.I.Connolly1985}, folder_uuids = {07e07de9-bcac-4934-a82b-d0aff540e56d,8aa7f6da-86cb-4048-9f9d-473281b36a98}, private_publication = {false}, bibtype = {article}, author = {Connolly, C. I.} }
@article{ title = {Generalized Procrustes analysis}, type = {article}, year = {1975}, pages = {33-51}, volume = {40}, websites = {https://doi.org/10.1007/BF02291478}, month = {3}, id = {09fed2a6-7677-3c42-9f21-3a526fbd5b7e}, created = {2022-03-28T09:45:05.222Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-30T07:21:46.138Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {gowerGeneralizedProcrustesAnalysis1975}, source_type = {article}, private_publication = {false}, abstract = {Suppose $P_i(j)$ ($i = 1, 2, \ldots, m$, $j = 1, 2, \ldots, n$) give the locations of $mn$ points in $p$-dimensional space. Collectively these may be regarded as $m$ configurations, or scalings, each of $n$ points in $p$ dimensions. The problem is investigated of translating, rotating, reflecting and scaling the $m$ configurations to minimize the goodness-of-fit criterion $\sum_{i=1}^{m}\sum_{j=1}^{n}\Delta^2(P_i(j), G_j)$, where $G_j$ is the centroid of the $m$ points $P_i(j)$ ($i = 1, 2, \ldots, m$). The rotated positions of each configuration may be regarded as individual analyses with the centroid configuration representing a consensus, and this relationship with individual scaling analysis is discussed. A computational technique is given, the results of which can be summarized in analysis of variance form. The special case $m = 2$ corresponds to Classical Procrustes analysis but the choice of criterion that fits each configuration to the common centroid configuration avoids difficulties that arise when one set is fitted to the other, regarded as fixed.}, bibtype = {article}, author = {Gower, J C}, doi = {10.1007/BF02291478}, journal = {Psychometrika}, number = {1} }
@article{ title = {On Information and Sufficiency}, type = {article}, year = {1951}, pages = {79-86}, volume = {22}, websites = {https://www.jstor.org/stable/2236703}, id = {6ac6c018-48cb-373c-8f4f-9944bb337958}, created = {2022-03-28T09:45:05.934Z}, accessed = {2021-09-23}, file_attached = {false}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:05.934Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {kullbackInformationSufficiency1951}, source_type = {article}, notes = {Publisher: Institute of Mathematical Statistics}, private_publication = {false}, bibtype = {article}, author = {Kullback, S and Leibler, R A}, journal = {The Annals of Mathematical Statistics}, number = {1} }
@misc{ title = {VOLUMNECT: Measuring Volumes with Kinect}, type = {misc}, id = {1d754198-cfd6-3158-97a2-4a322be21c51}, created = {2020-09-14T08:14:53.703Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-21T12:07:28.501Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {c41ac501-6dd3-4d6f-b177-cdc2b43ddc1f,16688d52-1757-4ef4-badb-f53b700252a9,591145a4-49d3-4baf-a2cc-a1f3832f0e3e,4f36a0a5-b08a-4f70-b020-4daf83cb0507}, private_publication = {false}, bibtype = {misc}, author = {} }
@article{ title = {Tackling 3D ToF Artifacts Through Learning and the FLAT Dataset - Supplementary}, type = {article}, pages = {1-10}, id = {4f8bd10a-618d-3b17-a1f2-046a5c8a0325}, created = {2020-10-01T06:44:41.765Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-10T07:17:52.055Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47,019ca26f-d15a-40ee-aa8d-7a0fbf949316}, private_publication = {false}, bibtype = {article}, author = {Guo, Qi and Frosio, Iuri and Gallo, Orazio and Zickler, Todd and Kautz, Jan}, number = {4} }
@article{ title = {Recognition of 3D Package Shapes for Single Camera Metrology}, type = {article}, pages = {99-106}, publisher = {IEEE}, id = {8a67444f-5fe0-3413-929f-bb4ac84d13c7}, created = {2020-10-05T10:26:00.899Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-10-27T07:13:11.204Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {591145a4-49d3-4baf-a2cc-a1f3832f0e3e}, private_publication = {false}, bibtype = {article}, author = {Lloyd, Ryan and McCloskey, Scott} }
@article{ title = {JSIS3D: Joint Semantic-Instance Segmentation of 3D Point Clouds with Multi-Task Pointwise Networks and Multi-Value Conditional Random Fields}, type = {article}, id = {26866c0f-e4df-368f-a164-271fdfe577fd}, created = {2020-10-20T09:48:06.313Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-04T07:03:18.340Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Networks}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, bibtype = {article}, author = {Pham, Quang-Hieu and Nguyen, Duc Thanh and Hua, Binh-Son and Roig, Gemma and Yeung, Sai-Kit} }
@article{ title = {Joint 2D-3D-Semantic Data for Indoor Scene Understanding}, type = {article}, id = {88bd44bc-f555-39af-886d-f7a4bfc13f80}, created = {2020-10-20T09:48:06.317Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-02T09:28:03.722Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, bibtype = {article}, author = {Armeni, Iro and Sax, Alexander and Zamir, Amir R and Savarese, Silvio} }
@article{ title = {Real-time Fusion Network for RGB-D Semantic Segmentation Incorporating Unexpected Obstacle Detection for Road-driving Images}, type = {article}, id = {0f342513-1ed2-3bf0-a565-5856b962ab16}, created = {2020-10-20T09:48:06.317Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T08:09:05.330Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {dc009c1c-5c21-43bd-9c8a-d37db3983b2e,a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, bibtype = {article}, author = {Sun, Lei and Yang, Kailun and Hu, Xinxin and Hu, Weijian and Wang, Kaiwei} }
@article{ title = {Kaolin: A PyTorch Library for Accelerating 3D Deep Learning Research}, type = {article}, pages = {1-7}, id = {fb567bbe-2ca1-3d27-b676-82c87472e4f5}, created = {2020-10-20T09:48:06.324Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-28T13:47:21.843Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, bibtype = {article}, author = {Jatavallabhula, Krishna Murthy and Smith, Edward and Lafleche, Jean-Francois and Tsang, Clement Fuji and Chen, Wenzheng and Xiang, Tommy and Lebaredian, Rev and Fidler, Sanja} }
@article{ title = {PointNet: Deep Learning on Point Sets for 3D Classification and Segmentation}, type = {article}, id = {0d92a4f2-0e9a-39d0-811b-6a11b06ef813}, created = {2020-10-20T09:48:06.421Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T08:00:50.995Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {dc009c1c-5c21-43bd-9c8a-d37db3983b2e,a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, bibtype = {article}, author = {Qi, Charles R and Su, Hao and Mo, Kaichun and Guibas, Leonidas J} }
@article{ title = {Real-time Progressive 3D Semantic Segmentation for Indoor Scenes}, type = {article}, id = {ae0fa301-277e-3715-9ee8-b31ca3b04548}, created = {2020-10-20T09:48:06.433Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-09T08:00:51.148Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Pham}, folder_uuids = {dc009c1c-5c21-43bd-9c8a-d37db3983b2e,a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, bibtype = {article}, author = {Pham, Quang-Hieu and Hua, Binh-Son} }
@article{ title = {Torch-Points3D: A Modular Multi-Task Framework for Reproducible Deep Learning on 3D Point Clouds}, type = {article}, id = {b2271f6c-63db-3351-ae1d-c470e77d5375}, created = {2020-10-20T09:48:06.436Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-21T06:44:17.377Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c,e523c16b-0594-4b52-9c4e-9052fcb9dbed}, private_publication = {false}, bibtype = {article}, author = {Chaton, Thomas and Chaulet, Nicolas and Horache, Sofiane} }
@article{ title = {Denoising 3D Time-Of-Flight Data}, type = {article}, keywords = {deep learning,multi-path interference,time-of-flight}, id = {b894dd2e-5b4c-30e6-a01e-a8f466026826}, created = {2020-10-22T06:49:09.810Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-10-27T06:19:54.167Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {2a0475f2-facb-4360-917f-00c5f8541f47}, private_publication = {false}, bibtype = {article}, author = {Gupta, Kapil and Xu, Yanwen} }
@article{ title = {3D Point Cloud Classification, Segmentation, and Normal Estimation using Modified Fisher Vector and CNNs}, type = {article}, id = {7d850968-e89b-379d-8508-5bec48569699}, created = {2020-11-09T09:00:51.420Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2020-11-09T09:01:05.545Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, bibtype = {article}, author = {Ben-Shabat, Yizhak} }
@article{ title = {Multimodal Deep Learning for Robust RGB-D Object Recognition}, type = {article}, id = {c756c706-1f01-3fc4-88d8-5557d4b894bc}, created = {2020-11-16T11:56:20.680Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-02-24T11:29:16.794Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {f3937f49-b3bb-4fad-a335-4fb2125beca8,71ca8421-f528-4caf-a342-3c1291372174}, private_publication = {false}, bibtype = {article}, author = {Eitel, Andreas and Spinello, Luciano and Riedmiller, Martin} }
@techreport{ title = {Multi-view Convolutional Neural Networks for 3D Shape Recognition}, type = {techreport}, websites = {http://vis-www.cs.umass.edu/mvcnn.}, id = {c2ba0f03-e523-35b0-afb8-487c7a94f957}, created = {2021-01-27T10:23:37.061Z}, accessed = {2021-01-27}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-27T10:23:42.352Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {A longstanding question in computer vision concerns the representation of 3D shapes for recognition: should 3D shapes be represented with descriptors operating on their native 3D formats, such as voxel grid or polygon mesh, or can they be effectively represented with view-based descriptors? We address this question in the context of learning to recognize 3D shapes from a collection of their rendered views on 2D images. We first present a standard CNN architecture trained to recognize the shapes' rendered views independently of each other, and show that a 3D shape can be recognized even from a single view at an accuracy far higher than using state-of-the-art 3D shape descriptors. Recognition rates further increase when multiple views of the shapes are provided. In addition, we present a novel CNN architecture that combines information from multiple views of a 3D shape into a single and compact shape descriptor offering even better recognition performance. The same architecture can be applied to accurately recognize human hand-drawn sketches of shapes. We conclude that a collection of 2D views can be highly informative for 3D shape recognition and is amenable to emerging CNN architectures and their derivatives.}, bibtype = {techreport}, author = {Su, Hang and Maji, Subhransu and Kalogerakis, Evangelos and Learned-Miller, Erik} }
@techreport{ title = {Deep Learning for Generic Object Detection: A Survey}, type = {techreport}, id = {c6e8003a-01e0-39fa-aca2-62e50cdec8c8}, created = {2021-01-27T11:38:08.543Z}, accessed = {2021-01-27}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-27T11:38:13.875Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {Object detection, one of the most fundamental and challenging problems in computer vision, seeks to locate object instances from a large number of predefined categories in natural images. Deep learning techniques have emerged as a powerful strategy for learning feature representations directly from data and have led to remarkable breakthroughs in the field of generic object detection. Given this period of rapid evolution, the goal of this paper is to provide a comprehensive survey of the recent achievements in this field brought about by deep learning techniques. More than 300 research contributions are included in this survey, covering many aspects of generic object detection: detection frameworks, object feature representation, object proposal generation, context modeling, training strategies, and evaluation metrics. We finish the survey by identifying promising directions for future research.}, bibtype = {techreport}, author = {Liu, Li and Ouyang, Wanli and Wang, Xiaogang and Fieguth, Paul and Chen, Jie and Liu, Xinwang and Pietikäinen, Matti} }
@techreport{ title = {VoxSegNet: Volumetric CNNs for Semantic Part Segmentation of 3D Shapes}, type = {techreport}, keywords = {Index Terms-shape analysis,convolutional neural networks,semantic segmentation,volumetric models !}, id = {be6f7e20-ca8c-37e9-a66d-7dc31e62f626}, created = {2021-01-27T11:46:09.349Z}, accessed = {2021-01-27}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-01-27T11:46:14.184Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, abstract = {Voxel is an important format to represent geometric data, which has been widely used for 3D deep learning in shape analysis due to its generalization ability and regular data format. However, fine-grained tasks like part segmentation require detailed structural information, which increases voxel resolution and thus causes other issues such as the exhaustion of computational resources. In this paper, we propose a novel volumetric convolutional neural network, which could extract discriminative features encoding detailed information from voxelized 3D data under a limited resolution. To this purpose, a spatial dense extraction (SDE) module is designed to preserve the spatial resolution during the feature extraction procedure, alleviating the loss of detail caused by sub-sampling operations such as max-pooling. An attention feature aggregation (AFA) module is also introduced to adaptively select informative features from different abstraction scales, leading to segmentation with both semantic consistency and high accuracy of details. Experiment results on the large-scale dataset demonstrate the effectiveness of our method in 3D shape part segmentation.}, bibtype = {techreport}, author = {Wang, Zongji and Lu, Feng} }
@article{ title = {Self-Supervised Deep Depth Denoising (Supplementary Material)}, type = {article}, pages = {1242-1251}, id = {1a0d9582-8b23-39f5-878a-53a57ab25eac}, created = {2021-03-08T09:43:04.058Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-03-09T06:38:49.786Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, bibtype = {article}, author = {Sterzentsenko, Vladimiros and Saroglou, Leonidas and Zioulis, Nikolaos} }
@article{ title = {Point Cloud Noise and Outlier Removal for Image-Based 3D Reconstruction}, type = {article}, id = {09f942f7-7c2a-31c5-a6c8-ab93a424a30f}, created = {2021-03-09T06:55:59.692Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-06-16T06:42:30.139Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, abstract = {Point sets generated by image-based 3D reconstruction techniques are often much noisier than those obtained using active techniques like laser scanning. Therefore, they pose greater challenges to the subsequent surface reconstruction (meshing) stage. We present a simple and effective method for removing noise and outliers from such point sets. Our algorithm uses the input images and corresponding depth maps to remove pixels which are geometrically or photometrically inconsistent with the colored surface implied by the input. This allows standard surface reconstruction methods (such as Poisson surface reconstruction) to perform less smoothing and thus achieve higher quality surfaces with more features. Our algorithm is efficient, easy to implement, and robust to varying amounts of noise. We demonstrate the benefits of our algorithm in combination with a variety of state-of-the-art depth and surface reconstruction methods.}, bibtype = {article}, author = {Wolff, Katja and Kim, Changil and Zimmer, Henning and Schroers, Christopher and Botsch, Mario and Sorkine-Hornung, Olga and Sorkine-Hornung, Alexander} }
@article{ title = {Robust Unsupervised Cleaning of Underwater Bathymetric Point Cloud Data}, type = {article}, pages = {1-14}, id = {5a8164d4-2101-3961-b83e-d1ec8247032f}, created = {2021-04-15T14:18:52.514Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-04-15T14:19:12.689Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, bibtype = {article}, author = {Chen, Cong} }
@techreport{ title = {Resnet in Resnet: Generalizing Residual Architectures}, type = {techreport}, id = {27cb78ac-b3d7-31c6-b177-160820e7bb33}, created = {2021-05-06T06:59:22.407Z}, accessed = {2021-05-06}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-05-06T06:59:25.374Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a89f4866-a7e8-4ea9-aa98-e3f470892f7c}, private_publication = {false}, notes = {ICLR 2016 Workshop track}, abstract = {Residual networks (ResNets) have recently achieved state-of-the-art on challenging computer vision tasks. We introduce Resnet in Resnet (RiR): a deep dual-stream architecture that generalizes ResNets and standard CNNs and is easily implemented with no computational overhead. RiR consistently improves performance over ResNets, outperforms architectures with similar amounts of augmentation on CIFAR-10, and establishes a new state-of-the-art on CIFAR-100.}, bibtype = {techreport}, author = {Targ, Sasha and Almeida, Diogo and Lyman, Kevin} }
@article{ title = {Image Restoration Using Convolutional Auto-encoders with Symmetric Skip Connections}, type = {article}, pages = {1-17}, id = {ca77501c-d35c-3267-8b21-68c6fa3d7bfb}, created = {2021-05-28T06:52:11.096Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-05-28T06:52:15.904Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, bibtype = {article}, author = {Mao, Xiao-jiao and Shen, Chunhua and Yang, Yu-bin} }
@article{ title = {About the Application of Autoencoders for Visual Defect}, type = {article}, keywords = {autoencoder neural network,convolutional neural network,defect detection,unsupervised anomaly detection}, id = {c8e49170-b1a4-312f-b771-1e1dab4a5ac7}, created = {2021-06-03T05:27:25.377Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-06-03T05:27:29.419Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {ff4ee14e-f1de-4f3e-889c-95d41b8d7277}, private_publication = {false}, bibtype = {article}, author = {Egyetem, Pannonia and Egyetem, Pannonia} }
@article{ title = {Neural Message Passing for Quantum Chemistry}, type = {article}, id = {5485b8a9-d712-3fde-96f5-d6ce6630e560}, created = {2021-07-12T09:25:31.789Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-20T05:23:00.110Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {dbd9a6d6-88f6-4a62-9acd-402fb473145a,20ccb950-fef9-4ee1-800c-a60ba9f1df16,70eb910f-9399-46d8-a4d0-ade5435237b7}, private_publication = {false}, bibtype = {article}, author = {Gilmer, Justin and Schoenholz, Samuel S and Riley, Patrick F and Vinyals, Oriol and Dahl, George E} }
@article{ title = {Relational inductive biases, deep learning, and graph networks}, type = {article}, pages = {1-40}, id = {02d89e25-2943-31d7-8971-6654efdabcc8}, created = {2021-07-12T09:25:31.888Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T09:25:50.485Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {70eb910f-9399-46d8-a4d0-ade5435237b7}, private_publication = {false}, bibtype = {article}, author = {Battaglia, Peter W and Hamrick, Jessica B and Bapst, Victor and Sanchez-Gonzalez, Alvaro and Zambaldi, Vinicius and Malinowski, Mateusz and Tacchetti, Andrea and Raposo, David and Santoro, Adam and Faulkner, Ryan and Gulcehre, Caglar and Song, Francis and Ballard, Andrew and Gilmer, Justin and Dahl, George and Vaswani, Ashish and Allen, Kelsey and Nash, Charles and Langston, Victoria and Dyer, Chris and Heess, Nicolas and Wierstra, Daan and Kohli, Pushmeet and Botvinick, Matt} }
@article{ title = {Interaction Networks for Learning about Objects, Relations and Physics}, type = {article}, year = {2016}, websites = {http://arxiv.org/abs/1612.00222}, id = {be263aeb-59ca-3182-acf2-560b74d28ed4}, created = {2021-07-12T09:25:31.894Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-12T10:19:38.475Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {20ccb950-fef9-4ee1-800c-a60ba9f1df16,70eb910f-9399-46d8-a4d0-ade5435237b7}, private_publication = {false}, bibtype = {article}, author = {Battaglia, Peter W and Lai, Matthew} }
@article{ title = {How to use Machine Learning models to build training sets?}, type = {article}, id = {ace67bd3-f22d-3f45-b4a2-000d87f9cd02}, created = {2021-07-19T10:48:23.780Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-19T10:48:55.810Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {40817151-8323-4487-8c1e-fe067729f714}, private_publication = {false}, bibtype = {article}, author = {Rybak, Piotr} }
@article{ title = {Self-supervised learning of visual representations from video and natural language}, type = {article}, id = {4529fa44-133d-3581-967d-88cfd60ae9eb}, created = {2021-07-19T10:48:23.781Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-19T10:49:02.800Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {40817151-8323-4487-8c1e-fe067729f714}, private_publication = {false}, bibtype = {article}, author = {Šivic, Josef} }
@article{ title = {" I : aa}, type = {article}, id = {24743e81-62ed-38bc-975b-1d5e9fb15e5e}, created = {2021-07-19T10:48:23.902Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-19T10:48:31.362Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {40817151-8323-4487-8c1e-fe067729f714}, private_publication = {false}, bibtype = {article}, author = {Fan, R} }
@article{ title = {Understanding Generalization in Deep Learning}, type = {article}, id = {de60afbf-d49a-3f30-81d0-475783f452a6}, created = {2021-07-19T10:48:23.913Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-19T10:49:07.671Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {40817151-8323-4487-8c1e-fe067729f714}, private_publication = {false}, bibtype = {article}, author = {Džiugaitė, Gintarė Karolina} }
@article{ title = {Deep Learning for 3D Point Clouds: A Survey}, type = {article}, keywords = {3D data,deep learning,instance segmentation,object detection,object tracking,part segmentation,point clouds,scene flow,semantic segmentation,shape classification,shape retrieval}, websites = {https://github.com/QingyongHu/SoTA-Point-Cloud.}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, id = {9bb1073b-b03d-3337-80d2-6176f3a2da56}, created = {2021-07-19T14:47:05.908Z}, accessed = {2021-07-19}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-19T14:47:09.175Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {5cd4d7ce-c2fb-4e91-ab80-35deeb123df5}, private_publication = {false}, abstract = {Point cloud learning has lately attracted increasing attention due to its wide applications in many areas, such as computer vision, autonomous driving, and robotics. As a dominating technique in AI, deep learning has been successfully used to solve various 2D vision problems. However, deep learning on point clouds is still in its infancy due to the unique challenges faced by the processing of point clouds with deep neural networks. Recently, deep learning on point clouds has become even thriving, with numerous methods being proposed to address different problems in this area. To stimulate future research, this paper presents a comprehensive review of recent progress in deep learning methods for point clouds. It covers three major tasks, including 3D shape classification, 3D object detection and tracking, and 3D point cloud segmentation. It also presents comparative results on several publicly available datasets, together with insightful observations and inspiring future research directions.}, bibtype = {article}, author = {Guo, Yulan and Wang, Hanyun and Hu, Qingyong and Liu, Hao and Liu, Li and Bennamoun, Mohammed} }
@article{ title = {Self-Attention Generative Adversarial Networks}, type = {article}, id = {ece06b85-e0dd-38e9-b3e0-0abaf9635970}, created = {2021-07-21T12:59:07.808Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-07-26T12:19:40.290Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Zhang}, folder_uuids = {4f36a0a5-b08a-4f70-b020-4daf83cb0507}, private_publication = {false}, bibtype = {article}, author = {Zhang, Han and Goodfellow, Ian and Metaxas, Dimitris and Odena, Augustus} }
@misc{ title = {Dynamic Convolution for 3D Point Cloud Instance Segmentation}, type = {misc}, websites = {https://www.prophy.science/article/144641825}, id = {45f22322-e34a-33b9-a69d-2de8fd9feec1}, created = {2021-08-24T10:25:36.974Z}, accessed = {2021-08-24}, file_attached = {false}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-24T10:25:37.130Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {70eb910f-9399-46d8-a4d0-ade5435237b7}, private_publication = {false}, bibtype = {misc}, author = {} }
@article{ title = {Exploring Self-attention for Image Recognition}, type = {article}, id = {afdd3d95-dc0f-3d51-932d-5b40904a1431}, created = {2021-08-26T06:14:59.671Z}, accessed = {2021-08-26}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-08-26T06:15:02.197Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {70eb910f-9399-46d8-a4d0-ade5435237b7}, private_publication = {false}, abstract = {Recent work has shown that self-attention can serve as a basic building block for image recognition models. We explore variations of self-attention and assess their effectiveness for image recognition. We consider two forms of self-attention. One is pairwise self-attention, which generalizes standard dot-product attention and is fundamentally a set operator. The other is patchwise self-attention, which is strictly more powerful than convolution. Our pairwise self-attention networks match or outperform their convolutional counterparts, and the patchwise models substantially outperform the convolutional baselines. We also conduct experiments that probe the robustness of learned representations and conclude that self-attention networks may have significant benefits in terms of robustness and generalization.}, bibtype = {article}, author = {Zhao, Hengshuang and Jia, Jiaya and Koltun, Vladlen} }
@article{ title = {Improving Variational Auto-Encoders using Householder Flow}, type = {article}, volume = {2}, id = {6d4ab5db-0e96-3576-91d8-50b207e5b67e}, created = {2021-08-30T18:48:39.131Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.971Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Tomczak}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, bibtype = {article}, author = {Tomczak, Jakub M and Welling, Max} }
@article{ title = {Variational Graph Auto-Encoders}, type = {article}, pages = {1-3}, id = {1a20ef8f-bc25-3804-a282-5bda5814d4c3}, created = {2021-08-30T18:48:39.131Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.096Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Auto-encodersa}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, bibtype = {article}, author = {Kipf, Thomas N and Welling, Max}, number = {2} }
@article{ title = {Variational Autoencoders for Collaborative Filtering}, type = {article}, keywords = {bayesian models,collaborative filtering,implicit feedback,recommender systems,variational autoencoder}, id = {2886480b-b931-3782-974c-7bd35ba0f7f8}, created = {2021-08-30T18:48:39.133Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:09.005Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Liang}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, bibtype = {article}, author = {Liang, Dawen and Krishnan, Rahul G and Hoffman, Matthew D and Jebara, Tony} }
@article{ title = {Auto-Encoding Variational Bayes}, type = {article}, pages = {1-14}, id = {fcb8de08-f53f-3e87-a91c-c4379d93e51e}, created = {2021-08-30T18:48:39.174Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-07T06:10:54.884Z}, read = {true}, starred = {true}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Welling}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,8efc2fe0-ed07-4348-a865-9f1a22b45934}, private_publication = {false}, bibtype = {article}, author = {Kingma, Diederik P. and Welling, Max}, number = {Ml} }
@article{ title = {PCT: Point Cloud Transformer}, type = {article}, id = {cc7a7130-63fe-34de-b847-7b6a1b6e20e2}, created = {2021-08-31T10:43:53.112Z}, accessed = {2021-08-31}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-06T07:14:33.942Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {The irregular domain and lack of ordering make it challenging to design deep neural networks for point cloud processing. This paper presents a novel framework named Point Cloud Transformer (PCT) for point cloud learning. PCT is based on Transformer, which achieves huge success in natural language processing and displays great potential in image processing. It is inherently permutation invariant for processing a sequence of points, making it well-suited for point cloud learning. To better capture local context within the point cloud, we enhance input embedding with the support of farthest point sampling and nearest neighbor search. Extensive experiments demonstrate that the PCT achieves the state-of-the-art performance on shape classification, part segmentation, semantic segmentation and normal estimation tasks.}, bibtype = {article}, author = {Guo, Meng-Hao and Cai, Jun-Xiong and Liu, Zheng-Ning and Mu, Tai-Jiang and Martin, Ralph R and Hu, Shi-Min} }
@article{ title = {End-to-End Object Detection with Transformers}, type = {article}, websites = {https://github.com/facebookresearch/detr.}, id = {1d30f05b-7b5c-3696-94fe-5f91124ed3d0}, created = {2021-08-31T10:49:43.291Z}, accessed = {2021-08-31}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-03T08:54:34.053Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {We present a new method that views object detection as a direct set prediction problem. Our approach streamlines the detection pipeline, effectively removing the need for many hand-designed components like a non-maximum suppression procedure or anchor generation that explicitly encode our prior knowledge about the task. The main ingredients of the new framework, called DEtection TRansformer or DETR, are a set-based global loss that forces unique predictions via bipartite matching, and a transformer encoder-decoder architecture. Given a fixed small set of learned object queries, DETR reasons about the relations of the objects and the global image context to directly output the final set of predictions in parallel. The new model is conceptually simple and does not require a specialized library, unlike many other modern detectors. DETR demonstrates accuracy and run-time performance on par with the well-established and highly-optimized Faster R-CNN baseline on the challenging COCO object detection dataset. Moreover, DETR can be easily generalized to produce panoptic segmentation in a unified manner. We show that it significantly outperforms competitive baselines. Training code and pretrained models are available at https://github.com/facebookresearch/detr.}, bibtype = {article}, author = {Carion, Nicolas and Massa, Francisco and Synnaeve, Gabriel and Usunier, Nicolas and Kirillov, Alexander and Zagoruyko, Sergey} }
@article{ title = {Point Transformer}, type = {article}, id = {ae814e96-bc09-392a-b70d-d88092184054}, created = {2021-08-31T11:08:17.094Z}, accessed = {2021-08-31}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-24T06:12:49.089Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4,11276190-b8fe-4c3a-a42f-f604438ad4db}, private_publication = {false}, abstract = {In this work, we present Point Transformer, a deep neu-ral network that operates directly on unordered and unstructured point sets. We design Point Transformer to extract local and global features and relate both representations by introducing the local-global attention mechanism, which aims to capture spatial point relations and shape information. For that purpose, we propose SortNet, as part of the Point Transformer, which induces input permutation invariance by selecting points based on a learned score. The output of Point Transformer is a sorted and permutation invariant feature list that can directly be incorporated into common computer vision applications. We evaluate our approach on standard classification and part segmentation benchmarks to demonstrate competitive results compared to the prior work.}, bibtype = {article}, author = {Engel, Nico and Belagiannis, Vasileios and Dietmayer, Klaus} }
@article{ title = {Efficient Transformers: A Survey}, type = {article}, keywords = {Attention Models,Deep Learning,Natural Language Processing,Transformer Models}, id = {06659490-5e50-37ca-99d2-70708616d622}, created = {2021-09-06T09:55:47.694Z}, accessed = {2021-09-06}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-06T11:04:51.925Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {Transformer model architectures have garnered immense interest lately due to their effectiveness across a range of domains like language, vision and reinforcement learning. In the field of natural language processing for example, Transformers have become an indispensable staple in the modern deep learning stack. Recently, a dizzying number of "X-former" models have been proposed (Reformer, Linformer, Performer, Longformer, to name a few), which improve upon the original Transformer architecture, many of which make improvements around computational and memory efficiency. With the aim of helping the avid researcher navigate this flurry, this paper characterizes a large and thoughtful selection of recent efficiency-flavored "X-former" models, providing an organized and comprehensive overview of existing work and models across multiple domains.}, bibtype = {article}, author = {Tay, Yi and Dehghani, Mostafa and Bahri, Dara and Metzler, Donald} }
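As a concrete example of the kind of efficiency trick this survey covers, the snippet below sketches kernelised "linear" attention, one representative "X-former" idea in which softmax attention is replaced by a positive feature map so that the cost grows linearly with sequence length. The feature map and sizes are illustrative choices, not taken from the survey.

import numpy as np

def linear_attention(Q, K, V, eps=1e-6):
    # elu(x) + 1, a commonly used positive feature map.
    phi = lambda x: np.where(x > 0, x + 1.0, np.exp(x))
    Qp, Kp = phi(Q), phi(K)                  # (n, d) each
    kv = Kp.T @ V                            # (d, d_v), computed once, independent of n^2
    z = Qp @ Kp.sum(axis=0)                  # (n,) normaliser
    return (Qp @ kv) / (z[:, None] + eps)

n, d = 1024, 64
rng = np.random.default_rng(0)
out = linear_attention(rng.normal(size=(n, d)), rng.normal(size=(n, d)), rng.normal(size=(n, d)))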
@article{ title = {Appendix for: Graph Attention Convolution for Point Cloud Segmentation}, type = {article}, websites = {https://engineering.purdue.edu/~jshan/publications/2019/CVPR/CVPR2019 Graph Attention Convolution for Point Cloud Segmentation Appendix for Submission 4649.pdf}, id = {9569c98c-61de-3422-85bf-9e6588c0c0d7}, created = {2021-09-21T05:20:39.022Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-09-21T05:20:41.594Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {f8d4d36f-8136-4a85-8d1a-ceaffb92ddf1}, private_publication = {false}, bibtype = {article}, author = {Wang, Lei and Huang, Yuchun and Hou, Yaolin and Zhang, Shenman and Shan, Jie}, journal = {Cvpr} }
@article{ title = {View fusion for 3D Shape Recognition}, type = {article}, keywords = {3d shape recognition,multi-view,multimodel attention network,point-cloud,point-view fusion}, id = {ae90134b-57dc-3494-b0f1-9e079b330e3b}, created = {2021-09-27T07:36:18.544Z}, file_attached = {true}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-17T16:08:02.893Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Zhao}, folder_uuids = {c3a38ded-ec49-4494-8518-35cbd444f0c8,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Zhao, Yaxin and Jiao, Jichao and Zhang, Tangkun and Chen, Xinping and Wang, Chenxu and Cui, Wei} }
@misc{ title = {Variational Autoencoder for 3D Voxel Compression}, type = {misc}, id = {05c6cca5-1550-3ff6-875b-4bca841017dd}, created = {2021-09-30T06:39:36.336Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-08T11:04:44.205Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, bibtype = {misc}, author = {Liu et al.} }
@article{ title = {Learning Sparse High Dimensional Filters: Image Filtering, Dense CRFs and Bilateral Neural Networks}, type = {article}, id = {63aa06d3-d844-33f6-ada8-c7f09e4f81e5}, created = {2021-10-11T10:29:12.669Z}, accessed = {2021-10-11}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-11T10:29:16.024Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {70eb910f-9399-46d8-a4d0-ade5435237b7,8d050117-e419-4b32-ad70-c875c74fa2b4}, private_publication = {false}, abstract = {Bilateral filters have widespread use due to their edge-preserving properties. The common use case is to manually choose a parametric filter type, usually a Gaussian filter. In this paper, we will generalize the parametrization and in particular derive a gradient descent algorithm so the filter parameters can be learned from data. This derivation allows us to learn high dimensional linear filters that operate in sparsely populated feature spaces. We build on the permutohedral lattice construction for efficient filtering. The ability to learn more general forms of high-dimensional filters can be used in several diverse applications. First, we demonstrate the use in applications where single filter applications are desired for runtime reasons. Further, we show how this algorithm can be used to learn the pair-wise potentials in densely connected conditional random fields and apply these to different image segmentation tasks. Finally, we introduce layers of bilateral filters in CNNs and propose bilateral neural networks for the use of high-dimensional sparse data. This view provides new ways to encode model structure into network architectures. A diverse set of experiments empirically validates the usage of general forms of filters.}, bibtype = {article}, author = {Jampani, Varun and Kiefel, Martin and Gehler, Peter V} }
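For context, the snippet below sketches the fixed, hand-parameterised special case that this work generalises to learnable high-dimensional filters: a Gaussian bilateral filter, here on a 1D signal, whose weights combine spatial and intensity (range) distances. All parameters and the test signal are illustrative.

import numpy as np

def bilateral_filter_1d(signal, sigma_space=3.0, sigma_range=0.1):
    n = len(signal)
    idx = np.arange(n)
    # Pairwise weights in the joint (position, intensity) feature space.
    w_space = np.exp(-(idx[:, None] - idx[None, :]) ** 2 / (2 * sigma_space ** 2))
    w_range = np.exp(-(signal[:, None] - signal[None, :]) ** 2 / (2 * sigma_range ** 2))
    w = w_space * w_range
    return (w @ signal) / w.sum(axis=1)

x = np.concatenate([np.zeros(50), np.ones(50)]) + 0.05 * np.random.randn(100)
smoothed = bilateral_filter_1d(x)   # smooths the noise while preserving the edge at index 50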
@article{ title = {Point Cloud Augmentation with Weighted Local Transformations}, type = {article}, pages = {548-557}, id = {489e134c-9b5d-3d95-ad4c-5181dd8c2786}, created = {2021-10-13T14:40:10.781Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:53:54.131Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Hwang, Dasol and Lee, Sanghyeok and Kim, Sihyeon and Lee, Jaewon and Hwang, Seong Jae and Kim, Hyunwoo J} }
@article{ title = {Graph-based Asynchronous Event Processing for Rapid Object Recognition}, type = {article}, pages = {934-943}, id = {5017214c-7d48-3c0a-9c53-246bbb03466d}, created = {2021-10-13T14:40:10.785Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:30.768Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Li, Yijin and Zhou, Han and Yang, Bangbang and Zhang, Ye and Cui, Zhaopeng and Bao, Hujun and Zhang, Guofeng}, number = {61822310} }
@article{ title = {Dance with Self-Attention : A New Look of Conditional Random Fields on Anomaly Detection in Videos}, type = {article}, pages = {173-183}, id = {f81368a0-f864-3e2d-85fd-005ec15a11f1}, created = {2021-10-13T14:40:10.898Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:31.643Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Purwanto, Didik and Chen, Yie-tarng and Fang, Wen-hsien} }
@article{ title = {A Robust Loss for Point Cloud Registration}, type = {article}, pages = {6138-6147}, id = {b3ad8459-43e3-3c8d-826a-3df0606bcba0}, created = {2021-10-13T14:40:11.482Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:24.068Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Deng, Zhi and Yao, Yuxin and Deng, Bailin and Zhang, Juyong} }
@article{ title = {Augmenting Depth Estimation with Geospatial Context}, type = {article}, pages = {4562-4571}, id = {f84a8390-ca35-30cb-9f9a-c43db1936361}, created = {2021-10-13T14:40:11.615Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:24.561Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Workman, Scott and Blanton, Hunter} }
@article{ title = {VENet : Voting Enhancement Network for 3D Object Detection}, type = {article}, pages = {3712-3721}, id = {3858eba4-a84b-3875-b59b-dcd13557f4e5}, created = {2021-10-13T14:40:11.715Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:25.365Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Xie, Qian and Lai, Yu-kun and Wu, Jing and Wang, Zhoutao and Lu, Dening and Wei, Mingqiang and Wang, Jun} }
@article{ title = {Domain-Invariant Disentangled Network for Generalizable Object Detection}, type = {article}, pages = {8771-8780}, id = {6c07d24d-23f3-3a72-a32d-129a8bf06bf2}, created = {2021-10-13T14:40:11.867Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:21.612Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {We address the problem of domain generalizable object detection, which aims to learn a robust detector from multiple "seen" domains so that it can generalize well to other "unseen" domains. The generalization ability is crucial in practical scenarios especially when it is difficult to collect data. Compared to image classification, domain generalization in object detection has seldom been explored with more challenges brought by domain gaps on both image and instance levels. In this paper, we propose a novel generaliz-able object detection model, termed Domain-Invariant Disentangled Network (DIDN). In contrast to directly aligning multiple sources, we integrate a disentangled network into Faster R-CNN. By disentangling representations on both image and instance levels, DIDN is able to learn domain-invariant representations that are suitable for generalized object detection. Furthermore, we design a cross-level representation reconstruction to complement this two-level disentanglement so that informative object representations could be preserved. Extensive experiments are conducted on five benchmark datasets and the results demonstrate that our model achieves state-of-the-art performances on domain generalization for object detection.}, bibtype = {article}, author = {Lin, Chuang and Zhao, Sicheng and Wang, Changhu} }
@article{ title = {Learning to Hallucinate Examples from Extrinsic and Intrinsic Supervision}, type = {article}, pages = {8701-8711}, id = {9863bb40-8bb8-3f80-9c79-396eee4a723e}, created = {2021-10-13T14:40:11.897Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:21.912Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Gui, Liangke and Bardes, Adrien and Salakhutdinov, Ruslan and Hauptmann, Alexander and Hebert, Martial and Wang, Yu-xiong} }
@article{ title = {Attention is not Enough : Mitigating the Distribution Discrepancy in Asynchronous Multimodal Sequence Fusion}, type = {article}, pages = {8148-8156}, id = {154506a4-ef0d-3ed7-8e7f-91bc6e16c704}, created = {2021-10-13T14:40:11.964Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:22.244Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Lin, Guosheng and Feng, Lei and Zhang, Yan and Lv, Fengmao} }
@article{ title = {Can Shape Structure Features Improve Model Robustness under Diverse Adversarial Settings ?}, type = {article}, id = {0b8a71c0-e98f-35a1-929f-88e145872c1a}, created = {2021-10-13T14:40:12.025Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:53:55.037Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Sun, Mingjie and Li, Zichao and Xiao, Chaowei and Qiu, Haonan and Kailkhura, Bhavya and Liu, Mingyan and Li, Bo} }
@article{ title = {Towards Better Explanations of Class Activation Mapping}, type = {article}, pages = {1336-1344}, id = {540eb40c-b805-3516-944d-9ca135f5186c}, created = {2021-10-13T14:40:12.404Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:30.096Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Jung, Hyungsik} }
@article{ title = {Towards Rotation Invariance in Object Detection}, type = {article}, id = {b30348da-678b-32d8-a6a8-b79c1cc65786}, created = {2021-10-13T14:40:12.477Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:25.673Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, abstract = {Rotation augmentations generally improve a model's invariance/equivariance to rotation, except in object detection. In object detection the shape is not known, therefore rotation creates a label ambiguity. We show that the de-facto method for bounding box label rotation, the Largest Box Method, creates very large labels, leading to poor performance and in many cases worse performance than using no rotation at all. We propose a new method of rotation augmentation that can be implemented in a few lines of code. First, we create a differentiable approximation of label accuracy and show that axis-aligning the bounding box around an ellipse is optimal. We then introduce Rotation Uncertainty (RU) Loss, allowing the model to adapt to the uncertainty of the labels. On five different datasets (including COCO, PascalVOC, and Transparent Object Bin Picking), this approach improves the rotational invariance of both one-stage and two-stage architectures when measured with AP, AP50, and AP75. The code is available at https://github.com/akasha-imaging/ICCV2021.}, bibtype = {article}, author = {Kalra, Agastya and Stoppi, Guy and Brown, Bradley and Agarwal, Rishav and Kadambi, Achuta} }
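A minimal sketch of the geometric idea named in the abstract follows: labelling a rotated object with the axis-aligned box of its inscribed ellipse rather than of its rotated corners. The formulas below are standard bounding-box geometry, not the authors' code; box size and angle are illustrative.

import numpy as np

def ellipse_aligned_box(w, h, theta):
    # Axis-aligned bounding box of the ellipse inscribed in a (w, h) box rotated by theta.
    a, b = w / 2.0, h / 2.0
    half_w = np.sqrt((a * np.cos(theta)) ** 2 + (b * np.sin(theta)) ** 2)
    half_h = np.sqrt((a * np.sin(theta)) ** 2 + (b * np.cos(theta)) ** 2)
    return 2 * half_w, 2 * half_h

def corner_aligned_box(w, h, theta):
    # The "Largest Box" alternative: bound the rotated corners instead.
    half_w = abs(w / 2 * np.cos(theta)) + abs(h / 2 * np.sin(theta))
    half_h = abs(w / 2 * np.sin(theta)) + abs(h / 2 * np.cos(theta))
    return 2 * half_w, 2 * half_h

print(ellipse_aligned_box(100, 40, np.pi / 4))   # noticeably tighter than
print(corner_aligned_box(100, 40, np.pi / 4))    # the corner-based label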
@article{ title = {TOOD : Task-aligned One-stage Object Detection}, type = {article}, pages = {3510-3519}, id = {7e443e1b-b556-3503-90a3-fa5d4f6609c7}, created = {2021-10-13T14:40:12.535Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:26.007Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Feng, Chengjian and Scott, Matthew R} }
@article{ title = {Oriented R-CNN for Object Detection}, type = {article}, pages = {3520-3529}, id = {0b1c5c3e-4cde-38c7-ae32-be3e37ca47b6}, created = {2021-10-13T14:40:12.561Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:25.853Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Xie, Xingxing and Cheng, Gong and Wang, Jiabao and Yao, Xiwen and Han, Junwei} }
@article{ title = {3DVG-Transformer : Relation Modeling for Visual Grounding on Point Clouds}, type = {article}, pages = {2928-2937}, id = {2b180275-0f80-3a4b-a775-054a3156f9f0}, created = {2021-10-13T14:40:12.799Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-18T06:16:27.841Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {865cda55-bfc4-4a99-88d6-7092e1cbba3b,be408929-b86c-42e8-9ccd-02ff8d8707f0,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Zhao, Lichen and Cai, Daigang and Sheng, Lu and Xu, Dong} }
@article{ title = {GLiT: Neural Architecture Search for Global and Local Image Transformer}, type = {article}, websites = {https://github.com/bychen515/GLiT.}, id = {79541381-579b-345a-8ac1-21949600a92c}, created = {2021-10-14T06:53:13.322Z}, accessed = {2021-10-14}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-14T06:53:16.755Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {865cda55-bfc4-4a99-88d6-7092e1cbba3b}, private_publication = {false}, abstract = {We introduce the first Neural Architecture Search (NAS) method to find a better transformer architecture for image recognition. Recently, transformers without CNN-based backbones are found to achieve impressive performance for image recognition. However, the transformer is designed for NLP tasks and thus could be sub-optimal when directly used for image recognition. In order to improve the visual representation ability for transformers, we propose a new search space and searching algorithm. Specifically, we introduce a locality module that models the local correlations in images explicitly with fewer computational cost. With the locality module, our search space is defined to let the search algorithm freely trade off between global and local information as well as optimizing the low-level design choice in each module. To tackle the problem caused by huge search space, a hierarchical neural architecture search method is proposed to search the optimal vision transformer from two levels separately with the evolutionary algorithm. Extensive experiments on the ImageNet dataset demonstrate that our method can find more discriminative and efficient transformer variants than the ResNet family (e.g., ResNet101) and the baseline ViT for image classification. The source codes are available at https://github.com/bychen515/GLiT.}, bibtype = {article}, author = {Chen, Boyu and Li, Peixia and Li, Chuming and Li, Baopu and Bai, Lei and Lin, Chen and Sun, Ming and Yan, Junjie and Ouyang, Wanli} }
@article{ title = {Incorporating Convolution Designs into Visual Transformers}, type = {article}, id = {dc10ba7f-18de-3706-a451-1d077e197da7}, created = {2021-10-14T06:57:08.010Z}, accessed = {2021-10-14}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-14T06:57:11.342Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {865cda55-bfc4-4a99-88d6-7092e1cbba3b}, private_publication = {false}, abstract = {Motivated by the success of Transformers in natural language processing (NLP) tasks, there emerge some attempts (e.g., ViT and DeiT) to apply Transformers to the vision domain. However, pure Transformer architectures often require a large amount of training data or extra supervision to obtain comparable performance with convolutional neural networks (CNNs). To overcome these limitations, we analyze the potential drawbacks when directly borrowing Transformer architectures from NLP. Then we propose a new Convolution-enhanced image Transformer (CeiT) which combines the advantages of CNNs in extracting low-level features, strengthening locality, and the advantages of Transformers in establishing long-range dependencies. Three modifications are made to the original Transformer: 1) instead of the straightforward tokenization from raw input images, we design an Image-to-Tokens (I2T) module that extracts patches from generated low-level features; 2) the feed-froward network in each encoder block is replaced with a Locally-enhanced Feed-Forward (LeFF) layer that promotes the correlation among neighboring tokens in the spatial dimension; 3) a Layer-wise Class token Attention (LCA) is attached at the top of the Transformer that utilizes the multi-level representations. Experimental results on ImageNet and seven downstream tasks show the effectiveness and generalization ability of CeiT compared with previous Transformers and state-of-the-art CNNs, without requiring a large amount of training data and extra CNN teachers. Besides, CeiT models also demonstrate better convergence with 3× fewer training iterations, which can reduce the training cost significantly 1 .}, bibtype = {article}, author = {Yuan, Kun and Guo, Shaopeng and Liu, Ziwei and Zhou, Aojun and Yu, Fengwei and Wu, Wei} }
@article{ title = {An End-to-End Transformer Model for 3D Object Detection}, type = {article}, websites = {https://facebookresearch.github.io/3detr}, id = {1ab6163b-1136-34a5-a363-71473231a9e5}, created = {2021-10-14T07:00:22.874Z}, accessed = {2021-10-14}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-14T07:00:26.711Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {865cda55-bfc4-4a99-88d6-7092e1cbba3b}, private_publication = {false}, abstract = {We propose 3DETR, an end-to-end Transformer based object detection model for 3D point clouds. Compared to existing detection methods that employ a number of 3D-specific inductive biases, 3DETR requires minimal modifications to the vanilla Transformer block. Specifically, we find that a standard Transformer with non-parametric queries and Fourier positional embeddings is competitive with specialized architectures that employ libraries of 3D-specific operators with hand-tuned hyperparameters. Nevertheless, 3DETR is conceptually simple and easy to implement, enabling further improvements by incorporating 3D domain knowledge. Through extensive experiments, we show 3DETR outperforms the well-established and highly optimized VoteNet baselines on the challenging ScanNetV2 dataset by 9.5%. Furthermore, we show 3DETR is applicable to 3D tasks beyond detection, and can serve as a building block for future research.}, bibtype = {article}, author = {Misra, Ishan and Girdhar, Rohit and Joulin, Armand} }
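The abstract credits much of 3DETR's simplicity to non-parametric queries and Fourier positional embeddings. The snippet below sketches one common construction of such an embedding (random Fourier features of 3D coordinates); the frequency count, scale, and Gaussian sampling are assumptions, not the paper's exact recipe.

import numpy as np

def fourier_positional_embedding(xyz, num_freqs=32, scale=1.0, seed=0):
    rng = np.random.default_rng(seed)
    B = rng.normal(scale=scale, size=(3, num_freqs))      # random frequency matrix
    proj = 2.0 * np.pi * xyz @ B                            # (n_points, num_freqs)
    return np.concatenate([np.sin(proj), np.cos(proj)], axis=-1)   # (n_points, 2 * num_freqs)

points = np.random.rand(2048, 3)
pe = fourier_positional_embedding(points)   # fed to the Transformer alongside point features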
@article{ title = {TempNet : Online Semantic Segmentation on Large-scale Point Cloud Series}, type = {article}, id = {df49344b-d6cf-37d5-a143-e8708f4ab3d2}, created = {2021-10-14T07:07:44.820Z}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-14T07:07:52.122Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {865cda55-bfc4-4a99-88d6-7092e1cbba3b}, private_publication = {false}, bibtype = {article}, author = {Zhou, Yunsong and Zhu, Hongzi and Li, Chunqin and Cui, Tiankai and Chang, Shan and Guo, Minyi} }
@article{ title = {Pyramid Point Cloud Transformer for Large-Scale Place Recognition}, type = {article}, pages = {6098-6107}, id = {ebae946d-1983-316b-886b-e074f95ed87f}, created = {2021-10-14T07:07:44.937Z}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-14T07:23:44.333Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {865cda55-bfc4-4a99-88d6-7092e1cbba3b}, private_publication = {false}, bibtype = {article}, author = {Hui, Le and Yang, Hang and Cheng, Mingmei and Xie, Jin and Yang, Jian} }
@article{ title = {No Title}, type = {article}, websites = {https://arxiv.org/pdf/2103.17154.pdf}, id = {ab33f65f-fe28-328d-8dfe-cb0193dd5524}, created = {2021-10-14T07:09:27.771Z}, file_attached = {false}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-14T07:09:27.771Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {} }
@article{ title = {Learning Spatio-Temporal Transformer for Visual Tracking}, type = {article}, id = {3a00966d-fdc2-3507-82de-f00b969237ea}, created = {2021-10-14T07:09:43.108Z}, accessed = {2021-10-14}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-14T07:09:45.830Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {865cda55-bfc4-4a99-88d6-7092e1cbba3b}, private_publication = {false}, abstract = {In this paper, we present a new tracking architecture with an encoder-decoder transformer as the key component. The encoder models the global spatio-temporal feature dependencies between target objects and search regions, while the decoder learns a query embedding to predict the spatial positions of the target objects. Our method casts object tracking as a direct bounding box prediction problem, without using any proposals or predefined anchors. With the encoder-decoder transformer, the prediction of objects just uses a simple fully-convolutional network, which estimates the corners of objects directly. The whole method is end-to-end, does not need any postprocessing steps such as cosine window and bounding box smoothing, thus largely simplifying existing tracking pipelines. The proposed tracker achieves state-of-the-art performance on five challenging short-term and long-term benchmarks, while running at real-time speed, being 6× faster than Siam R-CNN [47]. Code and models are open-sourced at here.}, bibtype = {article}, author = {Yan, Bin and Peng, Houwen and Fu, Jianlong and Wang, Dong and Lu, Huchuan} }
@article{ title = {Cloud Transformers: A Universal Approach To Point Cloud Processing Tasks}, type = {article}, id = {72a9d97b-349f-3564-9a40-96e8bb3489fe}, created = {2021-10-14T07:12:02.671Z}, accessed = {2021-10-14}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-14T07:12:06.030Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {865cda55-bfc4-4a99-88d6-7092e1cbba3b}, private_publication = {false}, abstract = {We present a new versatile building block for deep point cloud processing architectures that is equally suited for diverse tasks. This building block combines the ideas of spatial transformers and multi-view convolutional networks with the efficiency of standard convolutional layers in two and three-dimensional dense grids. The new block operates via multiple parallel heads, whereas each head dif-ferentiably rasterizes feature representations of individual points into a low-dimensional space, and then uses dense convolution to propagate information across points. The results of the processing of individual heads are then combined together resulting in the update of point features. Using the new block, we build architectures for both dis-criminative (point cloud segmentation, point cloud classification) and generative (point cloud inpainting and image-based point cloud reconstruction) tasks. The resulting ar-chitectures achieve state-of-the-art performance for these tasks, demonstrating the versatility of the new block for point cloud processing.}, bibtype = {article}, author = {Mazur, Kirill and Lempitsky, Victor} }
@article{ title = {AutoFormer: Searching Transformers for Visual Recognition}, type = {article}, websites = {https://github.com/microsoft/Cream.}, id = {4d4eeae3-060e-3543-8283-5c626cf4e16e}, created = {2021-10-14T07:15:41.912Z}, accessed = {2021-10-14}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-14T07:15:48.426Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {865cda55-bfc4-4a99-88d6-7092e1cbba3b}, private_publication = {false}, abstract = {Recently, pure transformer-based models have shown great potentials for vision tasks such as image classification and detection. However, the design of transformer networks is challenging. It has been observed that the depth, embedding dimension, and number of heads can largely affect the performance of vision transformers. Previous models configure these dimensions based upon manual craft-ing. In this work, we propose a new one-shot architecture search framework, namely AutoFormer, dedicated to vision transformer search. AutoFormer entangles the weights of different blocks in the same layers during supernet training. Benefiting from the strategy, the trained supernet allows thousands of subnets to be very well-trained. Specifically , the performance of these subnets with weights inherited from the supernet is comparable to those retrained from scratch. Besides, the searched models, which we refer to AutoFormers, surpass the recent state-of-the-arts such as ViT and DeiT. In particular, AutoFormer-tiny/small/base achieve 74.7%/81.7%/82.4% top-1 accuracy on ImageNet with 5.7M/22.9M/53.7M parameters, respectively. Lastly, we verify the transferability of AutoFormer by providing the performance on downstream benchmarks and distillation experiments. Code and models are available at https://github.com/microsoft/Cream.}, bibtype = {article}, author = {Chen, Minghao and Peng, Houwen and Fu, Jianlong and Ling, Haibin} }
@article{ title = {Understanding Robustness of Transformers for Image Classification}, type = {article}, id = {64a7c3e0-ceb4-3569-a940-986e3b4f4e21}, created = {2021-10-14T07:26:21.352Z}, accessed = {2021-10-14}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-14T07:26:21.950Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {865cda55-bfc4-4a99-88d6-7092e1cbba3b}, private_publication = {false}, abstract = {Deep Convolutional Neural Networks (CNNs) have long been the architecture of choice for computer vision tasks. Recently, Transformer-based architectures like Vision Transformer (ViT) have matched or even surpassed ResNets for image classification. However, details of the Transformer architecture-such as the use of non-overlapping patches-lead one to wonder whether these networks are as robust. In this paper, we perform an extensive study of a variety of different measures of robustness of ViT models and compare the findings to ResNet baselines. We investigate robustness to input perturbations as well as robustness to model perturbations. We find that when pre-trained with a sufficient amount of data, ViT models are at least as robust as the ResNet counterparts on a broad range of perturbations. We also find that Transformers are robust to the removal of almost any single layer, and that while activations from later layers are highly correlated with each other, they nevertheless play an important role in classification.}, bibtype = {article}, author = {Bhojanapalli, Srinadh and Chakrabarti, Ayan and Glasner, Daniel and Li, Daliang and Unterthiner, Thomas and Veit, Andreas} }
@article{ title = {Evaluation of Latent Space Learning with Procedurally-Generated Datasets of Shapes}, type = {article}, pages = {2086-2094}, id = {e959fec0-9376-31b5-987f-6cde2428cdbe}, created = {2021-10-19T05:56:45.611Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.473Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Ali}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,a6db5ca6-7f95-48a4-bc40-9e41eea78434,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, bibtype = {article}, author = {Ali, Sharjeel and Kaick, Oliver Van} }
@article{ title = {Discriminative Regularization of the Latent Manifold of Variational Auto-encoders}, type = {article}, keywords = {knowledge,perceptual data compaction,regularization,representation,semi-supervised learning,variational auto-encoder}, id = {2cfbaa76-2939-3555-90af-b69f0062722a}, created = {2021-10-20T06:55:29.214Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.322Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Auto-encoders}, folder_uuids = {a6db5ca6-7f95-48a4-bc40-9e41eea78434}, private_publication = {false}, bibtype = {article}, author = {} }
@article{ title = {3D Semantic Label Transfer in Human-Robot Collaboration}, type = {article}, pages = {2602-2611}, id = {2a6d0c7c-ba26-36d5-bd69-805c5b8da18d}, created = {2021-10-25T06:33:57.686Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.848Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Szeier}, folder_uuids = {dbd9a6d6-88f6-4a62-9acd-402fb473145a,1853f94b-7af1-40fa-b068-4758e9a02bc4,3de7428d-3017-4875-84d1-70688a7156ea,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Szeier, Szilvia} }
@article{ title = {The multilayer perceptron as an approximation to a Bayes optimal discriminant function}, type = {article}, id = {321ec5d5-8aec-3750-97c9-067e7ae6fa31}, created = {2021-10-26T08:17:02.751Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:08.822Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {Morphology}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,cd02f564-0123-4236-a320-b339927f085a}, private_publication = {false}, bibtype = {article}, author = {Ruck, Dennis W. and Rogers, Steven K. and Kabrisky, Matthew and Oxley, Mark E. and Suter, Bruce W.} }
@article{ title = {Amplitude-Phase Recombination : Rethinking Robustness of Convolutional Neural Networks in Frequency Domain}, type = {article}, pages = {458-467}, id = {36152d17-666e-325d-8382-d940aec81075}, created = {2021-10-30T07:28:03.606Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:32:28.141Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352}, private_publication = {false}, bibtype = {article}, author = {Chen, Guangyao and Peng, Peixi and Ma, Li and Li, Jia and Du, Lin and Tian, Yonghong} }
@article{ title = {Occlude Them All : Occlusion-Aware Attention Network for Occluded Person Re-ID}, type = {article}, pages = {11833-11842}, id = {f8207d4e-593c-3e4e-933d-40c34d9590d8}, created = {2021-10-30T07:28:03.620Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:32:19.720Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352}, private_publication = {false}, bibtype = {article}, author = {Chen, Peixian and Liu, Wenfeng and Dai, Pingyang and Liu, Jianzhuang and Ye, Qixiang and Xu, Mingliang and Ji, Rongrong} }
@article{ title = {MAAS : Multi-modal Assignation for Active Speaker Detection}, type = {article}, pages = {265-274}, id = {e2985cf2-ab33-3fc8-8078-00246021b27c}, created = {2021-10-30T07:28:03.730Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:32:54.270Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352}, private_publication = {false}, bibtype = {article}, author = {Le, Juan and Heilbron, Fabian Caba and Thabet, Ali K and Ghanem, Bernard} }
@article{ title = {Aggregation with Feature Detection}, type = {article}, pages = {527-536}, id = {cb207c9e-1fb6-3779-a13f-bea1d94ca09c}, created = {2021-10-30T07:28:03.830Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:28:47.714Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352}, private_publication = {false}, bibtype = {article}, author = {Sun, Shuyang and Yue, Xiaoyu and Qi, Xiaojuan and Ouyang, Wanli and Prisacariu, Victor and Torr, Philip} }
@article{ title = {Self-supervised Geometric Features Discovery via Interpretable Attention for Vehicle Re-Identification and Beyond}, type = {article}, pages = {194-204}, id = {75c57693-4df3-3f89-a12a-f89f2b64472a}, created = {2021-10-30T07:28:04.064Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:29:14.720Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352}, private_publication = {false}, bibtype = {article}, author = {Li, Ming} }
@article{ title = {Guided Point Contrastive Learning for Semi-supervised Point Cloud Semantic Segmentation}, type = {article}, pages = {6423-6432}, id = {d09881a3-e317-3476-a82d-d4b5137757c9}, created = {2021-10-30T07:28:04.152Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:29:29.663Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352}, private_publication = {false}, bibtype = {article}, author = {Jiang, Li} }
@article{ title = {DWKS : A Local Descriptor of Deformations Between Meshes and Point Clouds : Supplementary material}, type = {article}, pages = {0-5}, id = {0c1676c3-4da6-39e4-8719-cc9d7f5ad498}, created = {2021-10-30T07:28:04.295Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-10-30T07:28:58.854Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {48c2017d-ceec-4fcb-b966-b19e6f311352}, private_publication = {false}, abstract = {We propose a novel pointwise descriptor, called DWKS, aimed at finding correspondences across two deformable shape collections. Unlike the majority of existing descriptors, rather than capturing local geometry, DWKS captures the deformation around a point within a collection in a multi-scale and informative manner. This, in turn, allows to compute inter-collection correspondences without using landmarks. To this end, we build upon the successful spectral WKS descriptors, but rather than using the Laplace-Beltrami operator, show that a similar construction can be performed on shape difference operators, that capture differences or distortion within a collection. By leveraging the collection information our descriptor facilitates difficult non-rigid shape matching tasks, even in the presence of strong partiality and significant deformations. We demonstrate the utility of our approach across a range of challenging matching problems on both meshes and point clouds. The code for this paper can be found at https://github.com/RobinMagnet/DWKS.}, bibtype = {article}, author = {Magnet, Robin} }
@article{ title = {Compressed Object Detection}, type = {article}, pages = {2-4}, id = {be7a70d4-48c6-3cbb-9e9e-5ae8b1b369af}, created = {2021-11-17T16:08:00.489Z}, file_attached = {false}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-17T16:08:08.622Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {dbd9a6d6-88f6-4a62-9acd-402fb473145a,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Muhawenayo, Gedeon and Gkioxari, Georgia} }
@article{ title = {Mesh R-CNN}, type = {article}, pages = {9785-9795}, id = {2f09c8ae-970a-3348-aee5-c1435f454879}, created = {2021-11-17T16:08:00.533Z}, file_attached = {false}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-17T16:08:07.685Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {dbd9a6d6-88f6-4a62-9acd-402fb473145a,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Gkioxari, Georgia and Malik, Jitendra and Johnson, Justin} }
@article{ title = {Kimera : an Open-Source Library for Real-Time Metric-Semantic Localization and Mapping}, type = {article}, id = {0f4b03ea-8ff9-3525-9b9f-ec7513cfe505}, created = {2021-11-17T16:08:00.549Z}, file_attached = {false}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-17T16:08:07.354Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {dbd9a6d6-88f6-4a62-9acd-402fb473145a,b09853a1-601b-4dfc-9176-c3c7d469342b}, private_publication = {false}, bibtype = {article}, author = {Rosinol, Antoni and Abate, Marcus and Chang, Yun and Carlone, Luca} }
@article{ title = {Point3D : tracking actions as moving points with 3D CNNs}, type = {article}, pages = {1-14}, id = {576d833d-cdc4-353c-bb8c-db24ff4b2e64}, created = {2021-11-23T08:03:05.319Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:04:32.582Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {d0ca493b-f12a-45f6-9df1-0ad1fe3a78ff}, private_publication = {false}, bibtype = {article}, author = {Mo, Shentong} }
@article{ title = {Local and Global Point Cloud Reconstruction for 3D Hand Pose Estimation}, type = {article}, pages = {1-15}, id = {618ff331-6bef-3ec6-858b-4246a9440756}, created = {2021-11-23T08:03:05.321Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:06:30.800Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {d0ca493b-f12a-45f6-9df1-0ad1fe3a78ff}, private_publication = {false}, bibtype = {article}, author = {Yu, Ziwei} }
@article{ title = {Rethinking Local and Global Feature Representation for Semantic Segmentation}, type = {article}, pages = {1-14}, id = {5ccc8a99-ec4c-3c94-9961-c4b1ae0008b5}, created = {2021-11-23T08:03:05.460Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-08-18T10:51:28.849Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {d0ca493b-f12a-45f6-9df1-0ad1fe3a78ff}, private_publication = {false}, bibtype = {article}, author = {Chen, Mohan} }
@article{ title = {DeepUME: Learning the Universal Manifold Embedding for Robust Point Cloud Registration}, type = {article}, pages = {1-14}, id = {56484d01-4009-3e0e-bd47-bfd2c6ceff13}, created = {2021-11-23T08:03:05.615Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:03:29.943Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {d0ca493b-f12a-45f6-9df1-0ad1fe3a78ff}, private_publication = {false}, bibtype = {article}, author = {Lang, Natalie and Francos, Joseph M} }
@article{ title = {On Automatic Data Augmentation for 3D Point Cloud Classification}, type = {article}, id = {2bd72598-cca4-3524-b051-c79fae6f8220}, created = {2021-11-23T08:03:05.622Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:11:34.979Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {d0ca493b-f12a-45f6-9df1-0ad1fe3a78ff}, private_publication = {false}, bibtype = {article}, author = {Zhang, Wanyue} }
@article{ title = {3D Object Tracking with Transformer}, type = {article}, websites = {https://github.com/3bobo/lttr}, id = {35280b7c-a9be-30a5-bb72-e1beec76ebc2}, created = {2021-11-23T08:18:33.197Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:18:36.087Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {Feature fusion and similarity computation are two core problems in 3D object tracking, especially for object tracking using sparse and disordered point clouds. Feature fusion could make similarity computing more efficient by including target object information. However, most existing LiDAR-based approaches directly use the extracted point cloud feature to compute similarity while ignoring the attention changes of object regions during tracking. In this paper, we propose a feature fusion network based on transformer architecture. Benefiting from the self-attention mechanism, the transformer encoder captures the inter- and intra-relations among different regions of the point cloud. By using cross-attention, the transformer decoder fuses features and includes more target cues into the current point cloud feature to compute the region attentions, which makes the similarity computing more efficient. Based on this feature fusion network, we propose an end-to-end point cloud object tracking framework, a simple yet effective method for 3D object tracking using point clouds. Comprehensive experimental results on the KITTI dataset show that our method achieves new state-of-the-art performance. Code is available at: https://github.com/3bobo/lttr.}, bibtype = {article}, author = {Cui, Yubo and Fang, Zheng and Shan, Jiayao and Gu, Zuoxu and Zhou, Sifan} }
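The fusion step described above, where the decoder injects target cues into the current point-cloud features via cross-attention, can be sketched with a stock attention layer as below; tensor sizes and variable names are illustrative assumptions, and this is not the released LTTR code.

import torch
import torch.nn as nn

embed_dim, num_heads = 128, 4
cross_attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)

search_feats = torch.randn(2, 512, embed_dim)    # features of the current frame's points (queries)
template_feats = torch.randn(2, 256, embed_dim)  # features of the tracked target (keys/values)

fused, attn_weights = cross_attn(query=search_feats, key=template_feats, value=template_feats)
# `fused` carries target cues into the current point features before similarity computation.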
@article{ title = {Sparse Adversarial Video Attacks with Spatial Transformations}, type = {article}, websites = {https://github.com/TrustAI/DeepSAVA}, id = {3ce5d239-9ba5-349e-a30a-ca28475da34a}, created = {2021-11-23T08:19:43.596Z}, accessed = {2021-11-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2021-11-23T08:19:46.976Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {0facbd6c-68b8-4efe-af4f-9311071d6b5c}, private_publication = {false}, abstract = {In recent years, a significant amount of research efforts concentrated on adversarial attacks on images, while adversarial video attacks have seldom been explored. We propose an adversarial attack strategy on videos, called DeepSAVA. Our model includes both additive perturbation and spatial transformation by a unified optimisation framework, where the structural similarity index measure is adopted to measure the adversarial distance. We design an effective and novel optimisation scheme which alternatively utilizes Bayesian optimisation to identify the most influential frame in a video and Stochastic gradient descent (SGD) based optimisation to produce both additive and spatial-transformed perturbations. Doing so enables DeepSAVA to perform a very sparse attack on videos for maintaining human imperceptibility while still achieving state-of-the-art performance in terms of both attack success rate and adversarial transferability. Our intensive experiments on various types of deep neural networks and video datasets confirm the superiority of DeepSAVA.}, bibtype = {article}, author = {Marcolino, Soriano} }
@article{ title = {Sparse-to-Dense Feature Matching}, type = {article}, keywords = {classification,feature matching,visual localization}, pages = {11-13}, id = {b85be501-1dae-3222-8da3-e55d3f8d72a1}, created = {2022-01-03T10:35:44.653Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-03T10:35:51.953Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {d086c635-a2ea-4edd-a71f-9534cab096cc}, private_publication = {false}, bibtype = {article}, author = {Germain, Hugo and Bourmaud, Guillaume and Lepetit, Vincent} }
@article{ title = {Self-Supervised 3D Keypoint Learning for Ego-Motion Estimation}, type = {article}, keywords = {keypoints,monocular,self-supervised-learning,visual odometry}, pages = {1-18}, id = {c772bead-39a9-371f-b2fa-2d0c5f779b21}, created = {2022-01-03T10:35:44.660Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.852Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Tang}, folder_uuids = {3de7428d-3017-4875-84d1-70688a7156ea}, private_publication = {false}, bibtype = {article}, author = {Tang, Jiexiong and Guizilini, Vitor and Pillai, Sudeep and Kim, Hanme and Jensfelt, Patric and Gaidon, Adrien} }
@article{ title = {Measuring Distance between Reeb Graphs [ Extended abstract ]}, type = {article}, pages = {464-473}, id = {cde31c9f-b975-3624-aa07-35b7207cb761}, created = {2022-01-14T16:04:11.927Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-14T16:04:23.706Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Bauer, Ulrich and Wang, Yusu} }
@article{ title = {RGB-D Scene Understanding}, type = {article}, volume = {1}, id = {98a4676d-1abc-33a7-9d1a-6622ffae3ea1}, created = {2022-01-18T11:20:44.051Z}, file_attached = {false}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-19T16:11:26.025Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, bibtype = {article}, author = {Fan, Qingnan} }
@article{ title = {RandLA-Net : Efficient Semantic Segmentation of Large-Scale Point Clouds}, type = {article}, id = {1cde5fa0-cbf5-3b5d-8c4a-bd06d0f2186d}, created = {2022-01-18T11:20:44.052Z}, file_attached = {false}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-26T08:55:41.655Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, bibtype = {article}, author = {Hu, Qingyong and Yang, Bo and Xie, Linhai and Rosa, Stefano and Guo, Yulan and Wang, Zhihua and Trigoni, Niki and Markham, Andrew} }
@article{ title = {Geometric Capsule Autoencoders for 3D Point Clouds}, type = {article}, id = {a3a9ce12-e1d0-3d35-ac7e-e805ed671227}, created = {2022-01-18T11:20:44.054Z}, file_attached = {false}, profile_id = {bfbbf840-4c42-3914-a463-19024f50b30c}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:07.636Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Srivastava}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a}, private_publication = {false}, bibtype = {article}, author = {Srivastava, Nitish and Goh, Hanlin and Salakhutdinov, Ruslan} }
@article{ title = {Context-Aware Dynamic Feature Extraction for 3D Object Detection in Point Clouds}, type = {article}, id = {01ee1aed-cd37-3949-be87-b0a29dd3a024}, created = {2022-01-18T11:44:35.176Z}, accessed = {2022-01-18}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-18T11:44:40.705Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {Varying density of point clouds increases the difficulty of 3D detection. In this paper, we present a context-aware dynamic network (CADNet) to capture the variance of density by considering both point context and semantic context. Point-level contexts are generated from original point clouds to enlarge the effective receptive field. They are extracted around the voxelized pillars based on our extended voxelization method and processed with the context encoder in parallel with the pillar features. With a large perception range, we are able to capture the variance of features for potential objects and generate attentive spatial guidance to help adjust the strengths for different regions. In the region proposal network, considering the limited representation ability of traditional convolution where the same kernels are shared among different samples and positions, we propose a decomposable dynamic convolutional layer to adapt to the variance of input features by learning from local semantic context. It adaptively generates the position-dependent coefficients for multiple fixed kernels and combines them to convolve with local feature windows. Based on our dynamic convolution, we design a dual-path convolution block to further improve the representation ability. We conduct experiments on the KITTI dataset and our proposed CADNet achieves good performance on the 3D detection task in terms of both precision and speed. Our one-stage detector outperforms SECOND and PointPillars by a large margin and runs at the speed of 30 FPS.}, bibtype = {article}, author = {Tian, Yonglin and Huang, Lichao and Li, Xuesong and Wang, Kunfeng and Wang, Zilei and Wang, Fei-Yue} }
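The decomposable dynamic convolution described in the abstract combines several fixed kernels with position-dependent coefficients predicted from local features. A rough PyTorch sketch of that idea follows; layer sizes, names, and the coefficient predictor are illustrative assumptions, not the authors' code.

import torch
import torch.nn as nn

class DynamicMixConv(nn.Module):
    def __init__(self, channels, num_kernels=4, kernel_size=3):
        super().__init__()
        # Several fixed (learned but input-independent) convolution kernels.
        self.branches = nn.ModuleList(
            nn.Conv2d(channels, channels, kernel_size, padding=kernel_size // 2)
            for _ in range(num_kernels))
        # Predicts one mixing coefficient per kernel at every spatial position.
        self.coef = nn.Conv2d(channels, num_kernels, kernel_size=1)

    def forward(self, x):
        outs = torch.stack([b(x) for b in self.branches], dim=1)   # (B, K, C, H, W)
        w = torch.softmax(self.coef(x), dim=1).unsqueeze(2)         # (B, K, 1, H, W)
        return (w * outs).sum(dim=1)                                 # (B, C, H, W)

y = DynamicMixConv(64)(torch.randn(1, 64, 100, 100))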
@article{ title = {3D Object Detection From LiDAR Data Using Distance Dependent Feature Extraction}, type = {article}, id = {bdbb058f-aae8-3560-9364-478fb702320d}, created = {2022-01-18T11:45:27.490Z}, accessed = {2022-01-18}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-18T11:45:30.428Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {This paper presents a new approach to 3D object detection that leverages the properties of the data obtained by a LiDAR sensor. State-of-the-art detectors use neural network architectures based on assumptions valid for camera images. However, point clouds obtained from LiDAR data are fundamentally different. Most detectors use shared filter kernels to extract features which do not take into account the range dependent nature of the point cloud features. To show this, different detectors are trained on two splits of the KITTI dataset: close range (points up to 25 meters from LiDAR) and long-range. Top view images are generated from point clouds as input for the networks. Combined results outperform the baseline network trained on the full dataset with a single backbone. Additional research compares the effect of using different input features when converting the point cloud to image. The results indicate that the network focuses on the shape and structure of the objects, rather than exact values of the input. This work proposes an improvement for 3D object detectors by taking into account that features change over distance in point cloud data. Results show that training separate networks for close-range and long-range objects boosts performance for all KITTI benchmark difficulties.}, bibtype = {article}, author = {Engels, Guus and Aranjuelo, Nerea and Arganda-Carreras, Ignacio and Nieto, Marcos and Otaegui, Oihana} }
@article{ title = {Fully Convolutional Geometric Features}, type = {article}, id = {3f80d16c-446e-309e-9e3c-6438afd1b270}, created = {2022-01-18T11:49:33.507Z}, accessed = {2022-01-18}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-15T12:23:58.739Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, abstract = {Extracting geometric features from 3D scans or point clouds is the first step in applications such as registration, reconstruction, and tracking. State-of-the-art methods require computing low-level features as input or extracting patch-based features with limited receptive field. In this work, we present fully-convolutional geometric features, computed in a single pass by a 3D fully-convolutional network. We also present new metric learning losses that dramatically improve performance. Fully-convolutional geometric features are compact, capture broad spatial context, and scale to large scenes. We experimentally validate our approach on both indoor and outdoor datasets. Fully-convolutional geometric features achieve state-of-the-art accuracy without requiring preprocessing, are compact (32 dimensions), and are 600 times faster than the most accurate prior method.}, bibtype = {article}, author = {Choy, Christopher and Park, Jaesik and Koltun, Vladlen} }
@article{ title = {PCPNET Learning Local Shape Properties from Raw Point Clouds}, type = {article}, keywords = {CCS Concepts •Computing methodologies → Point-based models,Shape analysis,•Computer systems organization → Neural networks}, id = {1cf7d450-fbac-349d-b5ca-33157138c231}, created = {2022-01-18T11:51:28.036Z}, accessed = {2022-01-18}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-18T11:51:32.382Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {In this paper, we propose PCPNET, a deep-learning based approach for estimating local 3D shape properties in point clouds. In contrast to the majority of prior techniques that concentrate on global or mid-level attributes, e.g., for shape classification or semantic labeling, we suggest a patch-based learning method, in which a series of local patches at multiple scales around each point is encoded in a structured manner. Our approach is especially well-adapted for estimating local shape properties such as normals (both unoriented and oriented) and curvature from raw point clouds in the presence of strong noise and multi-scale features. Our main contributions include both a novel multi-scale variant of the recently proposed PointNet architecture with emphasis on local shape information, and a series of novel applications in which we demonstrate how learning from training data arising from well-structured triangle meshes, and applying the trained model to noisy point clouds can produce superior results compared to specialized state-of-the-art techniques. Finally, we demonstrate the utility of our approach in the context of shape reconstruction, by showing how it can be used to extract normal orientation information from point clouds.}, bibtype = {article}, author = {Guerrero, Paul and Kleiman, Yanir and Ovsjanikov, Maks and Mitra, Niloy J} }
@article{ title = {Feature axes orthogonalization in semantic face editing}, type = {article}, id = {da6843c0-aa59-39e9-aa9c-7b2a80cf8a84}, created = {2022-01-19T09:08:51.298Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-19T09:09:05.175Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {10e04504-7e21-4b84-9037-5a4431df1a8a}, private_publication = {false}, bibtype = {article}, author = {} }
@article{ title = {Grid-GCN for Fast and Scalable Point Cloud Learning}, type = {article}, websites = {https://github.com/xharlie/Grid-GCN}, id = {bde86639-d3a3-3c0b-a6ff-f809470171ee}, created = {2022-01-21T07:04:01.792Z}, accessed = {2021-01-21}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-25T07:19:37.825Z}, read = {true}, starred = {true}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {Due to the sparsity and irregularity of the point cloud data, methods that directly consume points have become popular. Among all point-based models, graph convolu-tional networks (GCN) lead to notable performance by fully preserving the data granularity and exploiting point interrelation. However, point-based networks spend a significant amount of time on data structuring (e.g., Farthest Point Sampling (FPS) and neighbor points querying), which limit the speed and scalability. In this paper, we present a method, named Grid-GCN, for fast and scalable point cloud learning. Grid-GCN uses a novel data structuring strategy, Coverage-Aware Grid Query (CAGQ). By leveraging the efficiency of grid space, CAGQ improves spatial coverage while reducing the theoretical time complexity. Compared with popular sampling methods such as Farthest Point Sampling (FPS) and Ball Query, CAGQ achieves up to 50× speed-up. With a Grid Context Aggregation (GCA) module, Grid-GCN achieves state-of-the-art performance on major point cloud classification and segmentation benchmarks with significantly faster runtime than previous studies. Remarkably , Grid-GCN achieves the inference speed of 50fps on ScanNet using 81920 points as input. The supplementary 1 and the code 2 are released.}, bibtype = {article}, author = {Xu, Qiangeng and Sun, Xudong and Wu, Cho-Ying and Wang, Panqu and Neumann, Ulrich} }
@article{ title = {PointASNL: Robust Point Clouds Processing using Nonlocal Neural Networks with Adaptive Sampling}, type = {article}, id = {9ee394a0-0dbc-3ca3-87da-10580220116f}, created = {2022-01-21T07:04:33.711Z}, accessed = {2022-01-21}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-21T12:32:20.526Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {Raw point clouds data inevitably contains outliers or noise through acquisition from 3D sensors or reconstruction algorithms. In this paper, we present a novel end-to-end network for robust point clouds processing, named PointASNL, which can deal with point clouds with noise effectively. The key component in our approach is the adap-tive sampling (AS) module. It first re-weights the neighbors around the initial sampled points from farthest point sampling (FPS), and then adaptively adjusts the sampled points beyond the entire point cloud. Our AS module can not only benefit the feature learning of point clouds, but also ease the biased effect of outliers. To further capture the neighbor and long-range dependencies of the sampled point, we proposed a local-nonlocal (L-NL) module inspired by the nonlocal operation. Such L-NL module enables the learning process insensitive to noise. Extensive experiments verify the robustness and superiority of our approach in point clouds processing tasks regardless of synthesis data, indoor data, and outdoor data with or without noise. Specifically, PointASNL achieves state-of-the-art robust performance for classification and segmentation tasks on all datasets, and significantly outperforms previous methods on real-world outdoor SemanticKITTI dataset with considerate noise. Our code is released through https: //github.com/yanx27/PointASNL.}, bibtype = {article}, author = {Yan, Xu and Zheng, Chaoda and Li, Zhen and Wang, Sheng and Cui, Shuguang} }
@article{ title = {KPConv: Flexible and Deformable Convolution for Point Clouds}, type = {article}, id = {d82f29e8-2581-399b-b50d-27adb5513e96}, created = {2022-01-21T07:05:20.508Z}, accessed = {2022-01-21}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-21T07:05:24.552Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {We present Kernel Point Convolution 1 (KPConv), a new design of point convolution, i.e. that operates on point clouds without any intermediate representation. The convo-lution weights of KPConv are located in Euclidean space by kernel points, and applied to the input points close to them. Its capacity to use any number of kernel points gives KP-Conv more flexibility than fixed grid convolutions. Furthermore , these locations are continuous in space and can be learned by the network. Therefore, KPConv can be extended to deformable convolutions that learn to adapt kernel points to local geometry. Thanks to a regular subsampling strategy , KPConv is also efficient and robust to varying densities. Whether they use deformable KPConv for complex tasks, or rigid KPconv for simpler tasks, our networks outperform state-of-the-art classification and segmentation approaches on several datasets. We also offer ablation studies and visualizations to provide understanding of what has been learned by KPConv and to validate the descriptive power of deformable KPConv.}, bibtype = {article}, author = {Thomas, Hugues and Qi, Charles R and Deschaud, Jean-Emmanuel and Marcotegui, Beatriz and Goulette, François and Guibas, Leonidas J} }
@article{ title = {3D Local Features for Direct Pairwise Registration}, type = {article}, id = {14c211da-2df1-3627-9fe2-96054bf650cc}, created = {2022-01-27T08:20:55.941Z}, accessed = {2022-01-27}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-27T08:21:03.508Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {We present a novel, data driven approach for solving the problem of registration of two point cloud scans. Our approach is direct in the sense that a single pair of corresponding local patches already provides the necessary transformation cue for the global registration. To achieve that, we first endow the state of the art PPF-FoldNet [19] auto-encoder (AE) with a pose-variant sibling, where the discrepancy between the two leads to pose-specific descrip-tors. Based upon this, we introduce RelativeNet, a relative pose estimation network to assign correspondence-specific orientations to the keypoints, eliminating any local reference frame computations. Finally, we devise a simple yet effective hypothesize-and-verify algorithm to quickly use the predictions and align two point sets. Our extensive quantitative and qualitative experiments suggests that our approach outperforms the state of the art in challenging real datasets of pairwise registration and that augmenting the keypoints with local pose information leads to better generalization and a dramatic speed-up.}, bibtype = {article}, author = {Deng, Haowen and Birdal, Tolga and Ilic, Slobodan} }
@article{ title = {3D3L: Deep Learned 3D Keypoint Detection and Description for LiDARs}, type = {article}, id = {fac58eac-ee6d-3725-bf86-f0386bf85a56}, created = {2022-01-27T08:23:13.713Z}, accessed = {2022-01-27}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-27T08:23:18.543Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {With the advent of powerful, lightweight 3D LiDARs, they have become the hearth of many navigation and SLAM algorithms on various autonomous systems. Pointcloud registration methods working with unstructured pointclouds such as ICP are often computationally expensive or require a good initial guess. Furthermore, 3D feature-based registration methods have never quite reached the robustness of 2D methods in visual SLAM. With the continuously increasing resolution of LiDAR range images, these 2D methods not only become applicable but should exploit the illumination-independent modalities that come with it, such as depth and intensity. In visual SLAM, deep learned 2D features and descriptors perform exceptionally well compared to traditional methods. In this publication, we use a state-of-the-art 2D feature network as a basis for 3D3L, exploiting both intensity and depth of LiDAR range images to extract powerful 3D features. Our results show that these keypoints and descriptors extracted from LiDAR scan images outperform state-of-the-art on different benchmark metrics and allow for robust scan-to-scan alignment as well as global localization.}, bibtype = {article}, author = {Streiff, Dominic and Bernreiter, Lukas and Tschopp, Florian and Fehr, Marius and Siegwart, Roland} }
@article{ title = {Fully Convolutional Geometric Features}, type = {article}, id = {4cdf3a58-0853-31b9-a09c-61161099a403}, created = {2022-01-27T08:23:57.838Z}, accessed = {2022-01-27}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-28T07:27:10.302Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {Extracting geometric features from 3D scans or point clouds is the first step in applications such as registration, reconstruction, and tracking. State-of-the-art methods require computing low-level features as input or extracting patch-based features with limited receptive field. In this work, we present fully-convolutional geometric features, computed in a single pass by a 3D fully-convolutional network. We also present new metric learning losses that dramatically improve performance. Fully-convolutional geometric features are compact, capture broad spatial context, and scale to large scenes. We experimentally validate our approach on both indoor and outdoor datasets. Fully-convolutional geometric features achieve state-of-the-art accuracy without requiring preprocessing, are compact (32 dimensions), and are 290 times faster than the most accurate prior method.}, bibtype = {article}, author = {Choy, Christopher and Park, Jaesik and Koltun, Vladlen} }
@article{ title = {D3Feat: Joint Learning of Dense Detection and Description of 3D Local Features}, type = {article}, id = {b46fa668-efeb-37cf-ae6f-98a49a9a501c}, created = {2022-01-27T08:26:23.787Z}, accessed = {2022-01-27}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-29T06:51:19.164Z}, read = {true}, starred = {true}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {A successful point cloud registration often lies on robust establishment of sparse matches through discriminative 3D local features. Despite the fast evolution of learning-based 3D feature descriptors, little attention has been drawn to the learning of 3D feature detectors, even less for a joint learning of the two tasks. In this paper, we leverage a 3D fully convolutional network for 3D point clouds, and propose a novel and practical learning mechanism that densely predicts both a detection score and a description feature for each 3D point. In particular, we propose a keypoint selection strategy that overcomes the inherent density variations of 3D point clouds, and further propose a self-supervised detector loss guided by the on-the-fly feature matching results during training. Finally, our method achieves state-of-the-art results in both indoor and outdoor scenarios, evaluated on 3DMatch and KITTI datasets, and shows its strong generalization ability on the ETH dataset. Towards practical use, we show that by adopting a reliable feature detector , sampling a smaller number of features is sufficient to achieve accurate and fast point cloud alignment. [code release]}, bibtype = {article}, author = {Bai, Xuyang and Luo, Zixin and Zhou, Lei and Fu, Hongbo and Quan, Long and Tai, Chiew-Lan} }
@article{ title = {Efficient 3D Point Cloud Feature Learning for Large-Scale Place Recognition}, type = {article}, keywords = {Deep Learning,Global Descriptor,Index Terms-3D Point Cloud Retrieval,Place Recognition}, websites = {https://github.com/fpthink/EPC-Net.}, id = {d629e6bc-45a7-3b95-b07e-eb46f99fd83e}, created = {2022-01-27T08:29:58.813Z}, accessed = {2022-01-27}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-01-28T07:27:10.478Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {Point cloud based retrieval for place recognition is still a challenging problem due to drastic appearance and illumination changes of scenes in changing environments. Existing deep learning based global descriptors for the retrieval task usually consume a large amount of computation resources (e.g., memory), which may not be suitable for the cases of limited hardware resources. In this paper, we develop an efficient point cloud learning network (EPC-Net) to form a global descriptor for visual place recognition, which can obtain good performance and reduce computation memory and inference time. First, we propose a lightweight but effective neural network module, called ProxyConv, to aggregate the local geometric features of point clouds. We leverage the spatial adjacent matrix and proxy points to simplify the original edge convolution for lower memory consumption. Then, we design a lightweight grouped VLAD network (G-VLAD) to form global descriptors for retrieval. Compared with the original VLAD network, we propose a grouped fully connected (GFC) layer to decompose the high-dimensional vectors into a group of low-dimensional vectors, which can reduce the number of parameters of the network and maintain the discrimination of the feature vector. Finally, to further reduce the inference time, we develop a simple version of EPC-Net, called EPC-Net-L, which consists of two ProxyConv modules and one max pooling layer to aggregate global descriptors. By distilling the knowledge from EPC-Net, EPC-Net-L can obtain discrimina-tive global descriptors for retrieval. Extensive experiments on the Oxford dataset and three in-house datasets demonstrate that our proposed method can achieve state-of-the-art performance with lower parameters, FLOPs, and runtime per frame. Our code is available at https://github.com/fpthink/EPC-Net.}, bibtype = {article}, author = {Hui, Le and Cheng, Mingmei and Xie, Jin and Yang, Jian} }
@article{ title = {5 Keypoints Is All You Need}, type = {article}, id = {12520d20-e03a-363b-a645-d602ea5cc876}, created = {2022-02-09T08:28:49.712Z}, accessed = {2022-02-09}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-09T08:28:52.971Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {Pose tracking is an important problem that requires identifying unique human pose-instances and matching them temporally across different frames of a video. However, existing pose tracking methods are unable to accurately model temporal relationships and require significant computation, often computing the tracks offline. We present an efficient multi-person pose tracking method, KeyTrack, that only relies on keypoint information without using any RGB or optical flow information to track human keypoints in real-time. Keypoints are tracked using our Pose Entailment method, in which, first, a pair of pose estimates is sampled from different frames in a video and tokenized. Then, a Transformer-based network makes a binary classification as to whether one pose temporally follows another. Furthermore, we improve our top-down pose estimation method with a novel, parameter-free, keypoint refinement technique that improves the keypoint estimates used during the Pose Entailment step. We achieve state-of-the-art results on the PoseTrack'17 and the PoseTrack'18 benchmarks while using only a fraction of the computation required by most other methods for computing the tracking information.}, bibtype = {article}, author = {Snower, Michael and Kadav, Asim and Lai, Farley and Graf, Hans Peter} }
@article{ title = {Representing Shape Collections With Alignment-Aware Linear Models}, type = {article}, id = {6b4eebfd-3183-34ba-8872-66070b464972}, created = {2022-02-23T06:26:16.770Z}, file_attached = {true}, profile_id = {f3d36c73-062b-3738-9a74-d09e4e83eb1e}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-02-23T06:26:30.808Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {cb1fa476-a972-4238-b348-eb71514486bb}, private_publication = {false}, bibtype = {article}, author = {Loiseau, Romain} }
@article{ title = {Crossing Nets: Combining GANs and VAEs with a Shared Latent Space for Hand Pose Estimation}, type = {article}, id = {ea173cb4-335e-3fcd-8822-8d071f84f952}, created = {2022-03-22T06:31:37.921Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-03-28T09:45:12.259Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Wan}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, bibtype = {article}, author = {Wan, Chengde and Probst, Thomas and Van Gool, Luc and Yao, Angela} }
@article{ title = {Task-Generic Hierarchical Human Motion Prior using VAEs}, type = {article}, id = {35f66dcf-a649-38c8-960d-71495c3b5828}, created = {2022-03-24T06:46:04.756Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-01T09:16:01.981Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Kuang}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4,465973dc-f47a-4093-b987-b885254a351b}, private_publication = {false}, bibtype = {article}, author = {Li, Jiaman and Kuang, Zhengfei and Li, Hao and Zhao, Yajie} }
@article{ title = {PREDATOR: Registration of 3D Point Clouds with Low Overlap}, type = {article}, id = {ad7dff3d-414a-3ff4-96cd-07a1350d007a}, created = {2022-03-28T07:14:31.921Z}, accessed = {2022-03-28}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-12-12T12:06:14.239Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {We introduce PREDATOR, a model for pairwise point-cloud registration with deep attention to the overlap region. Different from previous work, our model is specifically designed to handle (also) point-cloud pairs with low overlap. Its key novelty is an overlap-attention block for early information exchange between the latent encodings of the two point clouds. In this way the subsequent decoding of the latent representations into per-point features is conditioned on the respective other point cloud, and thus can predict which points are not only salient, but also lie in the overlap region between the two point clouds. The ability to focus on points that are relevant for matching greatly improves performance: PREDATOR raises the rate of successful registrations by more than 15 percent points in the low-overlap scenario , and also sets a new state of the art for the 3DMatch benchmark with 90.6% registration recall. [Code release]}, bibtype = {article}, author = {Huang, Shengyu and Gojcic, Zan and Usvyatsov, Mikhail and Wieser, Andreas and Schindler, Konrad and Zurich, Eth} }
@misc{ title = {Adjacency Matrix}, type = {misc}, websites = {https://mathworld.wolfram.com/}, id = {dd4cfd94-914a-3b34-a934-6195c84fb074}, created = {2022-03-28T09:45:06.345Z}, accessed = {2022-03-27}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-01T09:16:40.118Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, citation_key = {weissteinAdjacencyMatrix}, source_type = {misc}, user_context = {Text}, notes = {Publisher: Wolfram Research, Inc.}, private_publication = {false}, abstract = {The adjacency matrix, sometimes also called the connection matrix, of a simple labeled graph is a matrix with rows and columns labeled by graph vertices, with a 1 or 0 in position (v\_i,v\_j) according to whether v\_i and v\_j are adjacent or not. For a simple graph with no self-loops, the adjacency matrix must have 0s on the diagonal. For an undirected graph, the adjacency matrix is symmetric. The illustration above shows adjacency matrices for particular labelings of the claw graph, cycle...}, bibtype = {misc}, author = {Weisstein, Eric W} }
@article{ title = {Neighborhood-aware Geometric Encoding Network for Point Cloud Registration}, type = {article}, websites = {https://github.com/zhulf0804/NgeNet.}, id = {8a3ea9de-bf68-3085-858e-43ccb10be360}, created = {2022-03-30T07:45:10.872Z}, accessed = {2022-03-30}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-04-01T07:30:02.792Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {a3e73416-fb6e-4382-ad74-08c4dd4c0e94}, private_publication = {false}, abstract = {The distinguishing geometric features determine the success of point cloud registration. However , most point clouds are partially overlapping , corrupted by noise, and comprised of indistinguishable surfaces, which makes it a challenge to extract discriminative features. Here, we propose the Neighborhood-aware Geometric Encoding Network (NgeNet) for accurate point cloud registration. NgeNet utilizes a geometric guided encoding module to take geometric characteristics into consideration, a multi-scale architecture to focus on the semantically rich regions in different scales, and a consistent voting strategy to select features with proper neighborhood size and reject the specious features. The awareness of adaptive neighborhood points is obtained through the multi-scale architecture accompanied by voting. Specifically, the proposed techniques in NgeNet are model-agnostic, which could be easily migrated to other networks. Comprehensive experiments on indoor, outdoor and object-centric synthetic datasets demonstrate that NgeNet surpasses all of the published state-of-the-art methods. The code will be available at https://github.com/zhulf0804/NgeNet.}, bibtype = {article}, author = {Zhu, Lifa and Guan, Haining and Lin, Changwei and Han, Renmin} }
@article{ title = {Position-based Hash Embeddings For Scaling Graph Neural Networks}, type = {article}, id = {779aa7fc-7f3e-30bc-bb5e-e1cecd4aff05}, created = {2022-05-02T08:14:58.379Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-05-02T08:14:58.379Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Kalantzi, Maria and Karypis, George} }
@article{ title = {Distilling Knowledge from Graph Convolutional Networks}, type = {article}, id = {7521640b-70cb-369d-bef4-af182e7f7b33}, created = {2022-05-02T08:14:58.383Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-05-02T08:15:08.669Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Yang, Yiding and Qiu, Jiayan and Song, Mingli and Tao, Dacheng and Wang, Xinchao} }
@article{ title = {Progressive Point Cloud Deconvolution Generation Network}, type = {article}, keywords = {deconvolution,deep learning,gan,point cloud generation}, id = {e86d9ffc-17cf-376f-bc84-6bbfa239d58f}, created = {2022-05-02T08:14:58.409Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-05-02T08:15:11.046Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Hui, Le and Xu, Rui and Xie, Jin and Qian, Jianjun and Yang, Jian} }
@article{ title = {PyramNet: Point Cloud Pyramid Attention Network and Graph Embedding Module for 3D Object Classification and Segmentation}, type = {article}, keywords = {classification and segmentation,graph,point cloud,pyramid network}, pages = {1-8}, id = {0a6dfb3b-30fa-3a14-b40d-a6a596a1301a}, created = {2022-05-02T08:14:58.524Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-05-02T08:15:06.827Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Zhiheng, Kang and Ning, Li} }
@article{ title = {diffConv: Analyzing Irregular Point Clouds with an Irregular View}, type = {article}, id = {44f87b99-195a-340c-81cc-4d98471defe8}, created = {2022-05-02T08:14:58.528Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-05-02T08:15:19.935Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, citation_key = {Lin}, private_publication = {false}, bibtype = {article}, author = {Lin, Manxi} }
@article{ title = {Weakly Supervised Semantic Point Cloud Segmentation: Towards 10× Fewer Labels}, type = {article}, id = {1c3beb24-bb11-3acf-91ff-a6d1e6d4ef0f}, created = {2022-05-02T08:14:58.549Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-05-02T08:15:38.324Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Xu, Xun and Lee, Gim Hee} }
@article{ title = {Generalized Graph Convolutional Networks for Skeleton-based Action Recognition}, type = {article}, id = {6b21d924-86c6-3ad0-a8e5-68e33ae1ccad}, created = {2022-05-02T08:14:58.551Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-05-02T08:15:33.018Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Gao, Xiang and Hu, Wei and Tang, Jiaxiang and Pan, Pan and Liu, Jiaying and Guo, Zongming} }
@article{ title = {Node Similarity Preserving Graph Convolutional Networks}, type = {article}, id = {a64fcce8-4d81-37c9-bdac-bee0ab395cb1}, created = {2022-05-02T08:14:58.636Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-05-02T08:15:11.917Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Derr, Tyler and Wang, Yiqi} }
@article{ title = {Towards Efficient Point Cloud Graph Neural Networks Through Architectural Simplification}, type = {article}, id = {c2cdd330-084b-3d39-abcb-a17cd90eee20}, created = {2022-05-02T08:14:58.652Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-05-02T08:15:17.594Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Tailor, Shyam A} }
@article{ title = {3D Local Features for Direct Pairwise Registration}, type = {article}, id = {a49aa3a0-416a-3b84-8a0e-ce66ffc688db}, created = {2022-06-06T05:45:37.821Z}, accessed = {2022-06-06}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-06-06T05:47:02.958Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {6075c81a-cded-4bc7-822e-6d5f3181ca0d}, private_publication = {false}, abstract = {We present a novel, data driven approach for solving the problem of registration of two point cloud scans. Our approach is direct in the sense that a single pair of corresponding local patches already provides the necessary transformation cue for the global registration. To achieve that, we first endow the state of the art PPF-FoldNet [18] auto-encoder (AE) with a pose-variant sibling, where the discrepancy between the two leads to pose-specific descrip-tors. Based upon this, we introduce RelativeNet, a relative pose estimation network to assign correspondence-specific orientations to the keypoints, eliminating any local reference frame computations. Finally, we devise a simple yet effective hypothesize-and-verify algorithm to quickly use the predictions and align two point sets. Our extensive quantitative and qualitative experiments suggests that our approach outperforms the state of the art in challenging real datasets of pairwise registration and that augmenting the keypoints with local pose information leads to better generalization and a dramatic speed-up.}, bibtype = {article}, author = {Deng, Haowen and Birdal, Tolga and Ilic, Slobodan} }
@article{ title = {Pairwise Point Cloud Registration Using Graph Matching and Rotation-invariant Features}, type = {article}, keywords = {Graph matching,Index Terms-3D descriptor,Point cloud registration,Rotation-invariance}, id = {e8641497-1ff2-3604-b4b1-53451af084f9}, created = {2022-06-06T05:46:36.452Z}, accessed = {2022-06-06}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-06-07T04:52:51.400Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {6075c81a-cded-4bc7-822e-6d5f3181ca0d}, private_publication = {false}, abstract = {Registration is a fundamental but critical task in point cloud processing, which usually depends on finding element correspondence from two point clouds. However, the finding of reliable correspondence relies on establishing a robust and discriminative description of elements and the correct matching of corresponding elements. In this letter, we develop a coarse-to-fine registration strategy, which utilizes rotation-invariant features and a new weighted graph matching method for iteratively finding correspondence. In the graph matching method, the similarity of nodes and edges in Euclidean and feature space are formulated to construct the optimization function. The proposed strategy is evaluated using two benchmark datasets and compared with several state-of-the-art methods. Regarding the experimental results, our proposed method can achieve a fine registration with rotation errors of less than 0.2 degrees and translation errors of less than 0.1 m.}, bibtype = {article}, author = {Huang, Rong and Yao, Wei and Xu, Yusheng and Ye, Zhen and Stilla, Uwe} }
@article{ title = {LiDAR-based point clouds registration for localization in indoor environments}, type = {article}, websites = {https://tel.archives-ouvertes.fr/tel-03522998}, id = {64596a40-939b-35eb-819f-f2148c931566}, created = {2022-06-06T05:47:40.619Z}, accessed = {2022-06-06}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-06-06T05:47:52.935Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {6075c81a-cded-4bc7-822e-6d5f3181ca0d}, private_publication = {false}, bibtype = {article}, author = {Favre, Ketty} }
@article{ title = {End-to-End 3D Point Cloud Learning for Registration Task Using Virtual Correspondences}, type = {article}, websites = {https://github.com/qiaozhijian/VCR-Net.git}, id = {e019da86-4722-317d-8448-e0ee73ced043}, created = {2022-06-06T05:48:07.617Z}, accessed = {2022-06-06}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-06-07T04:52:51.375Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {6075c81a-cded-4bc7-822e-6d5f3181ca0d}, private_publication = {false}, abstract = {3D Point cloud registration is still a very challenging topic due to the difficulty in finding the rigid transformation between two point clouds with partial correspondences, and it's even harder in the absence of any initial estimation information. In this paper, we present an end-to-end deep-learning based approach to resolve the point cloud registration problem. Firstly, the revised LPD-Net is introduced to extract features and aggregate them with the graph network. Secondly, the self-attention mechanism is utilized to enhance the structure information in the point cloud and the cross-attention mechanism is designed to enhance the corresponding information between the two input point clouds. Based on which, the virtual corresponding points can be generated by a soft pointer based method, and finally, the point cloud registration problem can be solved by implementing the SVD method. Comparison results in ModelNet40 dataset validate that the proposed approach reaches the state-of-the-art in point cloud registration tasks and experiment resutls in KITTI dataset validate the effectiveness of the proposed approach in real applications.Our source code is available at https://github.com/qiaozhijian/VCR-Net.git}, bibtype = {article}, author = {Qiao, Zhijian and Wei, Huanshu and Liu, Zhe and Suo, Chuanzhe and Wang, Hesheng} }
@article{ title = {OverlapNet: Loop Closing for LiDAR-based SLAM}, type = {article}, websites = {https://github.com/PRBonn/OverlapNet}, id = {db57fc0c-dbcb-3908-a091-8c9a946e3934}, created = {2022-06-06T05:49:53.595Z}, accessed = {2022-06-06}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-06-06T05:49:58.425Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {6075c81a-cded-4bc7-822e-6d5f3181ca0d}, private_publication = {false}, abstract = {Simultaneous localization and mapping (SLAM) is a fundamental capability required by most autonomous systems. In this paper, we address the problem of loop closing for SLAM based on 3D laser scans recorded by autonomous cars. Our approach utilizes a deep neural network exploiting different cues generated from LiDAR data for finding loop closures. It estimates an image overlap generalized to range images and provides a relative yaw angle estimate between pairs of scans. Based on such predictions, we tackle loop closure detection and integrate our approach into an existing SLAM system to improve its mapping results. We evaluate our approach on sequences of the KITTI odometry benchmark and the Ford campus dataset. We show that our method can effectively detect loop closures surpassing the detection performance of state-of-the-art methods. To highlight the generalization capabilities of our approach, we evaluate our model on the Ford campus dataset while using only KITTI for training. The experiments show that the learned representation is able to provide reliable loop closure candidates, also in unseen environments.}, bibtype = {article}, author = {Chen, Xieyuanli and Läbe, Thomas and Milioto, Andres and Röhling, Timo and Vysotska, Olga and Haag, Alexandre and Behley, Jens and Stachniss, Cyrill} }
@article{ title = {SLAM-Loop Closing with Visually Salient Features}, type = {article}, keywords = {Index Terms-Mobile Robotics,Loop Closing,SLAM,Saliency,Visual Features}, id = {96f45fe9-99c0-3d61-a204-712119ac390e}, created = {2022-06-23T14:30:47.751Z}, accessed = {2022-06-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-06-24T06:05:02.373Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {ed605795-f522-465c-a0b8-5f8a05f7fd5f}, private_publication = {false}, abstract = {Within the context of Simultaneous Localisation and Mapping (SLAM), "loop closing" is the task of deciding whether or not a vehicle has, after an excursion of arbitrary length, returned to a previously visited area. Reliable loop closing is both essential and hard. It is without doubt one of the greatest impediments to long term, robust SLAM. This paper illustrates how visual features, used in conjunction with scanning laser data, can be used to a great advantage. We use the notion of visual saliency to focus the selection of suitable (affine invariant) image-feature descriptors for storage in a database. When queried with a recently taken image the database returns the capture time of matching images. This time information is used to discover loop closing events. Crucially this is achieved independently of estimated map and vehicle location. We integrate the above technique into a SLAM algorithm using delayed vehicle states and scan matching to form interpose geometric constraints. We present initial results using this system to close loops (around 100m) in an indoor environment.}, bibtype = {article}, author = {Newman, Paul and Ho, Kin} }
@article{ title = {Loop Closure Detection with RGB-D Feature Pyramid Siamese Networks}, type = {article}, id = {bced8b54-4e4d-3e17-a29b-c8ef9448b74a}, created = {2022-06-23T14:31:50.936Z}, accessed = {2022-06-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-06-24T06:05:02.380Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {ed605795-f522-465c-a0b8-5f8a05f7fd5f}, private_publication = {false}, abstract = {In visual Simultaneous Localization And Mapping (SLAM), detecting loop closures has been an important but difficult task. Currently, most solutions are based on the bag-of-words approach. Yet the possibility of deep neural network application to this task has not been fully explored due to the lack of appropriate architecture design and of sufficient training data. In this paper we demonstrate the applicability of deep neural networks by addressing both issues. Specifically we show that a feature pyramid Siamese neural network can achieve state-of-the-art performance on pairwise loop closure detection. The network is trained and tested on large-scale RGB-D datasets with a novel automatic loop closure labeling algorithm. Each image pair is labelled by how much the images overlap, allowing loop closure to be computed directly rather than by labor intensive manual labeling. We present an algorithm to adopt any large-scale generic RGB-D dataset for use in training deep loop-closure networks. We show for the first time that deep neural networks are capable of detecting loop closures, and we provide a method for generating large-scale datasets for use in evaluating and training loop closure detectors.}, bibtype = {article}, author = {Qianhao, Zhang and Mai, Alexander and Menke, Joseph and Yang, Allen} }
@article{ title = {Fast and Effective Loop Closure Detection to Improve SLAM Performance}, type = {article}, keywords = {Histogram,Keypoint matching,Loop closure,Place recognition,SLAM}, websites = {https://doi.org/10.1007/s10846-017-0718-z}, id = {6ed51489-b907-34fe-b517-1e15c468aa92}, created = {2022-06-23T14:32:50.369Z}, accessed = {2022-06-23}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-07-26T06:52:35.909Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {ed605795-f522-465c-a0b8-5f8a05f7fd5f}, private_publication = {false}, abstract = {A fundamental component of simultaneous localization and mapping systems is loop closure detection. For consistent mapping, accurate loop closure detection is crucial to reduce the drift of the estimated trajectory. As the map size increases, loop closure detection performance becomes more critical, but it gets harder and needs more computational time to find correct loop closure candidates. This paper presents an extension to a state-of-the-art RGB-D SLAM system to increase accuracy of large-scale mapping in real-time. The proposed extension uses a straightforward visual place recognition method to determine loop closure candidates. The method combines global and local image features through employing image histograms and keypoint matching. Four different place recognition techniques composed of complementary steps of the method are studied: histogram only, brute-force keypoint matching, hierarchical clustering, and adaptive thresholding. The extended RGB-D SLAM system is assessed on a popular dataset in terms of accuracy and speed. The quantitative results show that the proposed method improves accuracy up to ∼42% and works fast enough to meet real-time requirements. (A conference version of this paper is presented at ICARSC 2016.) The method enables real-time large-scale indoor mapping to be performed effectively on CPU.}, bibtype = {article}, author = {Guclu, Oguzhan and Can, Ahmet Burak}, doi = {10.1007/s10846-017-0718-z} }
@article{ title = {A Benchmark for the Evaluation of RGB-D SLAM Systems}, type = {article}, websites = {http://vision.in.tum.de/data/datasets/}, id = {9bc6ff8a-47d3-3f8e-a7f8-e622ede7af2d}, created = {2022-07-05T15:07:09.924Z}, accessed = {2022-07-05}, file_attached = {true}, profile_id = {48fc0258-023d-3602-860e-824092d62c56}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-07-05T15:07:12.650Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {ed605795-f522-465c-a0b8-5f8a05f7fd5f}, private_publication = {false}, abstract = {In this paper, we present a novel benchmark for the evaluation of RGB-D SLAM systems. We recorded a large set of image sequences from a Microsoft Kinect with highly accurate and time-synchronized ground truth camera poses from a motion capture system. The sequences contain both the color and depth images in full sensor resolution (640 × 480) at video frame rate (30 Hz). The ground-truth trajectory was obtained from a motion-capture system with eight high-speed tracking cameras (100 Hz). The dataset consists of 39 sequences that were recorded in an office environment and an industrial hall. The dataset covers a large variety of scenes and camera motions. We provide sequences for debugging with slow motions as well as longer trajectories with and without loop closures. Most sequences were recorded from a handheld Kinect with unconstrained 6-DOF motions but we also provide sequences from a Kinect mounted on a Pioneer 3 robot that was manually navigated through a cluttered indoor environment. To stimulate the comparison of different approaches, we provide automatic evaluation tools both for the evaluation of drift of visual odometry systems and the global pose error of SLAM systems. The benchmark website [1] contains all data, detailed descriptions of the scenes, specifications of the data formats, sample code, and evaluation tools.}, bibtype = {article}, author = {Sturm, Jürgen and Engelhard, Nikolas and Endres, Felix and Burgard, Wolfram and Cremers, Daniel} }
@article{ title = {VG-VAE: A Venatus Geometry Point-Cloud Variational Auto-Encoder}, type = {article}, pages = {2978-2985}, id = {86f76899-c824-3285-bfa4-a26b2eb22a4f}, created = {2022-07-22T12:20:03.047Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-07-22T14:19:35.013Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, abstract = {In this paper, we propose VG-VAE: Venatus Geometric Variational Auto-Encoder for capturing unsupervised hierarchical local and global geometric signatures in point clouds. Recent research emphasises the significance of the underlying intrinsic geometry for pointcloud processing. Our contribution is to extract and analyse the morphology of the pointcloud using the proposed Geometric Proximity Correlator (GPC) and variational sampling of the latent. The extraction of local geometric signatures is facilitated by the GPC, whereas the extraction of global geometry is facilitated by variational sampling. Furthermore, we apply a naive mix of vector algebra and 3D geometry to extract the basic per-point geometric signature, which assists the unsupervised hypothesis. We provide statistical analyses of local and global geometric signatures. The impacts of our geometric features are demonstrated on pointcloud classification as downstream task using the classic pointcloud feature extractor PointNet. We demonstrate our analysis on ModelNet40, a benchmark dataset, and compare with state-of-the-art techniques.}, bibtype = {article}, author = {Anvekar, Tejas and Tabib, Ramesh Ashok and Hegde, Dikshit} }
@article{ title = {Point-BERT: Pre-training 3D Point Cloud Transformers with Masked Point Modeling}, type = {article}, pages = {19313-19322}, id = {3d2fda0e-a3ed-3000-b410-5258a80a3db8}, created = {2022-07-22T14:19:34.778Z}, file_attached = {true}, profile_id = {235249c2-3ed4-314a-b309-b1ea0330f5d9}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-07-22T15:59:26.332Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {1853f94b-7af1-40fa-b068-4758e9a02bc4}, private_publication = {false}, bibtype = {article}, author = {Yu, Xumin and Tang, Lulu and Rao, Yongming and Huang, Tiejun and Zhou, Jie and Lu, Jiwen} }
@article{ title = {The Devil is in the Pose: Ambiguity-free 3D Rotation-invariant Learning via Pose-aware Convolution}, type = {article}, pages = {7472-7481}, id = {8f1bc82f-5bc5-3db8-93f6-022a40cc1467}, created = {2022-07-28T12:39:24.533Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-07-28T12:39:34.212Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {true}, hidden = {false}, folder_uuids = {353ce2e2-5e70-48e5-951f-78dc31fa40d2}, private_publication = {false}, abstract = {Recent progress in introducing rotation invariance (RI) to 3D deep learning methods is mainly made by designing RI features to replace 3D coordinates as input. The key to this strategy lies in how to restore the global information that is lost by the input RI features. Most state-of-the-art methods achieve this by incurring additional blocks or complex global representations, which is time-consuming and ineffective. In this paper, we reveal that the global information loss stems from an unexplored pose information loss problem, i.e., common convolution layers cannot capture the relative poses between RI features, thus hindering the global information from being hierarchically aggregated in deep networks. To address this problem, we develop a Pose-aware Rotation Invariant Convolution (i.e., PaRI-Conv), which dynamically adapts its kernels based on the relative poses. Specifically, in each PaRI-Conv layer, a lightweight Augmented Point Pair Feature (APPF) is designed to fully encode the RI relative pose information. Then, we propose to synthesize a factorized dynamic kernel, which reduces the computational cost and memory burden by decomposing it into a shared basis matrix and a pose-aware diagonal matrix that can be learned from the APPF. Extensive experiments on shape classification and part segmentation tasks show that our PaRI-Conv surpasses the state-of-the-art RI methods while being more compact and efficient.}, bibtype = {article}, author = {Chen, Ronghan and Cong, Yang} }
@article{ title = {Exploring the Devil in Graph Spectral Domain for 3D Point Cloud Attacks}, type = {article}, keywords = {adversarial attack,graph spectral domain,point cloud}, id = {004fdfe4-ec4e-3b86-a63e-b210edff7608}, created = {2022-09-01T14:14:15.543Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-10-03T13:31:10.764Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {8f8f5505-1a28-42dd-a82c-92b5738465f1,244f8db2-6bd4-47d9-8abf-425a263fd4d1,8c0c38c1-fb54-417d-a224-97fc3b1afba9}, private_publication = {false}, bibtype = {article}, author = {Hu, Qianjiang and Liu, Daizong and Hu, Wei}, number = {128} }
@article{ title = {Mastering Chess and Shogi by Self-Play with a General Reinforcement Learning Algorithm}, type = {article}, year = {2017}, pages = {1-19}, websites = {http://arxiv.org/abs/1712.01815}, id = {c40a5f1a-0eb8-39dc-86ae-606c63df645a}, created = {2022-09-06T11:37:21.116Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-06T11:37:27.024Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, private_publication = {false}, bibtype = {article}, author = {Silver, David and Hubert, Thomas and Schrittwieser, Julian and Antonoglou, Ioannis and Lai, Matthew and Guez, Arthur and Lanctot, Marc and Sifre, Laurent and Kumaran, Dharshan and Graepel, Thore and Lillicrap, Timothy and Simonyan, Karen and Hassabis, Demis} }
@misc{ title = {Large Scale GAN Training for High Fidelity Natural Image Synthesis}, type = {misc}, id = {737c1387-1a58-3daa-9eda-d7db5f72257e}, created = {2022-09-08T10:49:09.775Z}, file_attached = {false}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2022-09-08T10:49:10.282Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {034ae31f-a548-45de-8507-3cbbc9e326ad}, private_publication = {false}, bibtype = {misc}, author = {Brock, Andrew and Donahue, Jeff and Simonyan, Karen} }
@article{ title = {Learning Accurate 3D Shape Based on Stereo Polarimetric Imaging}, type = {article}, pages = {17287-17296}, id = {f3ad76a8-1e4f-3549-875e-bd30ee761e53}, created = {2023-06-22T10:06:22.556Z}, file_attached = {true}, profile_id = {ad172e55-c0e8-3aa4-8465-09fac4d5f5c8}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-06-22T10:06:58.010Z}, read = {false}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {cbcad584-0c50-48fe-a0d7-5b4c781cba83}, private_publication = {false}, bibtype = {article}, author = {Huang, Tianyu and Li, Haoang and He, Kejing and Sui, Congying and Li, Bin} }
@book{ title = {Proceedings of the 2018 19th International Carpathian Control Conference (ICCC): La Contessa Castle Hotel, Szilvásvárad, Hungary, May 28-31, 2018}, type = {book}, id = {a50dbb73-5020-3f97-a5cd-9ccbc9bc29f2}, created = {2023-11-07T09:47:16.795Z}, file_attached = {true}, profile_id = {78e67dcc-28e6-3300-a4ed-85434b13f01f}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2024-01-09T14:24:05.015Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {1bffc8fa-4e6e-48c8-b694-323af7fbd0e8}, private_publication = {false}, abstract = {"IEEE Part Number: CFP1842L-ART."}, bibtype = {book}, author = {Drótos, Dániel and Miskolci Egyetem (Hungary). Institute of Automation and Infocommunication and IEEE Industry Applications Society and Institute of Electrical and Electronics Engineers} }
@techreport{ title = {Why is FPGA-GPU Heterogeneity the Best Option for Embedded Deep Neural Networks?}, type = {techreport}, id = {45075a41-e32b-367e-aee0-1e351ea07fb2}, created = {2023-11-07T10:04:14.557Z}, file_attached = {true}, profile_id = {78e67dcc-28e6-3300-a4ed-85434b13f01f}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2023-12-06T13:14:54.748Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, notes = {Direct Hardware Mapping (DHM) for CNNs on FPGAs. Advantage of the FPGA-GPU hybrid: it outperforms the GPU alone with roughly 20% better energy consumption and latency. DHM uses a fixed-point (8-bit) computation approach. The hardware architecture is particularly relevant for embedded systems: the paper shows that FPGAs are a viable alternative for small DL accelerator layers, compares the FPGA with an embedded GPU, and demonstrates the efficiency of the hybrid approach. DHM requires a large share of FPGA resources and is therefore only used for small designs; the GPU can be useful for reducing memory accesses. FPGAs limit the depth of convolution filters that can be directly mapped. For 224x224x3 images, the FPGA significantly outperforms the GPU for small numbers and sizes of filters. Method and materials: Nvidia Jetson TX2 CPU-GPU board; Intel Cyclone10GX FPGA; CNNs: SqueezeNet, MobileNetv2, and ShuffleNetv2; ImageNet pre-trained mobile CNN models obtained from PyTorch; Power Estimation tool from Intel Quartus Pro Edition.}, folder_uuids = {1bffc8fa-4e6e-48c8-b694-323af7fbd0e8}, private_publication = {false}, abstract = {Graphics Processing Units (GPUs) are currently the dominating programmable architecture for Deep Learning (DL) accelerators. The adoption of Field Programmable Gate Arrays (FPGAs) in DL accelerators is however getting momentum. In this paper, we demonstrate that Direct Hardware Mapping (DHM) of a Convolutional Neural Network (CNN) on an embedded FPGA substantially outperforms a GPU implementation in terms of energy efficiency and execution time. However, DHM is highly resource intensive and cannot fully substitute the GPU when implementing a state-of-the-art CNN. We thus propose a hybrid FPGA-GPU DL acceleration method and demonstrate that heterogeneous acceleration outperforms GPU acceleration even including communication overheads. Experimental results are conducted on a heterogeneous multi-platform setup embedding an Nvidia® Jetson TX2 CPU-GPU board and an Intel® Cyclone10GX FPGA board. The SqueezeNet, MobileNetv2, and ShuffleNetv2 mobile-oriented CNNs are experimented. We show that heterogeneous FPGA-GPU acceleration outperforms GPU acceleration for classification inference task over MobileNetv2 (12%-30% energy reduction, 4% to 26% latency reduction), SqueezeNet (21%-28% energy reduction, same latency), and ShuffleNetv2 (25% energy reduction, 21% latency reduction).}, bibtype = {techreport}, author = {Carballo-Hernández, Walther and Pelcat, Maxime and Berry, François} }
@book{ title = {2019 IEEE International Conference on Embedded Software and Systems (ICESS)}, type = {book}, id = {189aff2f-f856-310e-9bf6-f450be9808d2}, created = {2023-11-22T14:21:15.592Z}, file_attached = {true}, profile_id = {78e67dcc-28e6-3300-a4ed-85434b13f01f}, group_id = {1ff583c0-be37-34fa-9c04-73c69437d354}, last_modified = {2024-01-09T14:39:28.909Z}, read = {true}, starred = {false}, authored = {false}, confirmed = {false}, hidden = {false}, folder_uuids = {1bffc8fa-4e6e-48c8-b694-323af7fbd0e8}, private_publication = {false}, abstract = {"Proceedings of a meeting held 2-3 June 2019, Las Vegas, Nevada, USA"--Proceedings.com website}, bibtype = {book}, author = {Institute of Electrical and Electronics Engineers and IEEE Computer Society} }