% Conference paper entry: ScanRefer, in the ECCV 2020 proceedings (LNCS series).
% Case-sensitive words in title/booktitle are brace-protected so sentence-casing
% styles keep them; the publisher is wrapped in an extra brace pair so its
% internal " and " is not read as a list separator by biblatex.
% NOTE(review): the entry has a series but no volume number -- confirm the LNCS
% volume against the publisher record and add a volume field.
@inproceedings{063dd99742e74b8bad85c1869dd74f1e,
title = "{ScanRefer}: {3D} Object Localization in {RGB-D} Scans Using Natural Language",
abstract = "We introduce the task of 3D object localization in RGB-D scans using natural language descriptions. As input, we assume a point cloud of a scanned 3D scene along with a free-form description of a specified target object. To address this task, we propose ScanRefer, learning a fused descriptor from 3D object proposals and encoded sentence embeddings. This fused descriptor correlates language expressions with geometric features, enabling regression of the 3D bounding box of a target object. We also introduce the ScanRefer dataset, containing 51,583 descriptions of 11,046 objects from 800 ScanNet scenes. ScanRefer is the first large-scale effort to perform object localization via natural language expression directly in 3D (Code: https://daveredrum.github.io/ScanRefer/).",
author = "Chen, Dave Zhenyu and Chang, Angel X. and Nie{\ss}ner, Matthias",
note = "Publisher Copyright: {\textcopyright} 2020, Springer Nature Switzerland AG.; 16th European Conference on Computer Vision, ECCV 2020 ; Conference date: 23-08-2020 Through 28-08-2020",
year = "2020",
doi = "10.1007/978-3-030-58565-5_13",
language = "English",
isbn = "9783030585648",
series = "Lecture Notes in Computer Science",
publisher = "{Springer Science and Business Media Deutschland GmbH}",
pages = "202--221",
editor = "Vedaldi, Andrea and Bischof, Horst and Brox, Thomas and Frahm, Jan-Michael",
booktitle = "Computer Vision -- {ECCV} 2020 - 16th European Conference 2020, Proceedings",
}