% Conference paper from ICONIP 2024, published in the Communications in Computer and Information Science series.
% Title acronyms are brace-protected so sentence-casing styles keep their capitalisation; the DOI is stored bare (no LaTeX escapes).
@inproceedings{690e60c59c654d3b92a2c0df5f49cc85,
title = "{MonoTCM}: Semantic-Depth Fusion Transformer for Monocular {3D} Object Detection with Token Clustering and Merging",
abstract = "Monocular 3D object detection presents significant challenges due to the inherent absence of depth and geometric information, rendering it more complex than 2D detection. This paper introduces MonoTCM, a Semantic-Depth Fusion Transformer that leverages a Token Clustering and Merging (TCM) module to enhance the efficiency and accuracy of monocular 3D object detection. The TCM module aggregates multi-scale grid-based tokens into clustering-based tokens, dynamically adjusting their shapes and sizes based on local density and distance metrics. This allows for finer granularity in critical areas while consolidating less informative regions. The aggregated tokens are subsequently decomposed into semantic and depth features, processed through dedicated transformer-based encoders, and integrated using a semantic-depth fusion decoder modeled after DETR. This approach enhances the model{\textquoteright}s ability to capture implicit global geometric information and provides a cost-effective solution for real-time intelligent driving applications. Experimental results demonstrate the superiority of MonoTCM in enhancing detection performance compared to other advanced methods, highlighting its potential to advance the field of monocular 3D object detection.",
keywords = "Computer Vision, Depth Estimation, Monocular 3D Object Detection",
author = "Changyu Zeng and Zimu Wang and Jimin Xiao and Anh Nguyen and Kaizhu Huang and Wei Wang and Yutao Yue",
note = "Publisher Copyright: {\textcopyright} The Author(s), under exclusive license to Springer Nature Singapore Pte Ltd. 2026.; 31st International Conference on Neural Information Processing, ICONIP 2024 ; Conference date: 02-12-2024 Through 06-12-2024",
year = "2026",
doi = "10.1007/978-981-96-7036-9_22",
language = "English",
isbn = "9789819670352",
series = "Communications in Computer and Information Science",
publisher = "{Springer Science and Business Media Deutschland GmbH}",
pages = "332--346",
editor = "Mufti Mahmud and Maryam Doborjeh and Zohreh Doborjeh and Kevin Wong and Leung, {Andrew Chi Sing} and M. Tanveer",
booktitle = "Neural Information Processing - 31st International Conference, ICONIP 2024, Proceedings",
}