% Conference paper in the ICCVW 2023 proceedings (pp. 2296--2307), identified by its DOI.
% Acronyms in title and booktitle are brace-protected so that styles applying
% sentence casing keep their capitalisation.
% There is deliberately no series field: the proceedings title lives in booktitle
% only, so styles that print both fields do not show it twice.
@inproceedings{e7526ca0917245638353269b0f367b3b,
  author    = {Yeganeh, Yousef and Farshad, Azade and Weinberger, Peter and Ahmadi, {Seyed Ahmad} and Adeli, Ehsan and Navab, Nassir},
  title     = {Transformers Pay Attention to Convolutions Leveraging Emerging Properties of {ViTs} by Dual Attention-Image Network},
  booktitle = {Proceedings - 2023 {IEEE/CVF} International Conference on Computer Vision Workshops, {ICCVW} 2023},
  year      = {2023},
  pages     = {2296--2307},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  doi       = {10.1109/ICCVW60793.2023.00244},
  language  = {English},
  keywords  = {Attention Map, Medical Imaging, Segmentation, Transformers},
  abstract  = {Although purely transformer-based architectures pretrained on large datasets are introduced as foundation models for general computer vision tasks, hybrid models that incorporate combinations of convolution and transformer blocks showed state-of-the-art performance in more specialized tasks. Nevertheless, despite the performance gain of both pure and hybrid transformer-based architectures compared to convolutional networks, their high training cost and complexity make it challenging to use them in real scenarios. In this work, we propose a novel and simple architecture based on only convolutional layers and show that by just taking advantage of the attention map visualizations obtained from a self-supervised pretrained vision transformer network, complex transformer-based networks, and even 3D architectures are outperformed with much fewer computation costs. The proposed architecture is composed of two encoder branches with the original image as input in one branch and the attention map visualizations of the same image from multiple self-attention heads from a pre-trained DINO model in the other branch. The results of our experiments on medical imaging datasets show that the extracted attention map visualizations from the attention heads of a pre-trained transformer architecture combined with the image provide strong prior knowledge for a pure CNN architecture to outperform CNN-based and transformer-based architectures. Project Page: dai-net.github.io},
  note      = {Publisher Copyright: {\textcopyright} 2023 IEEE.; 2023 IEEE/CVF International Conference on Computer Vision Workshops, ICCVW 2023 ; Conference date: 02-10-2023 Through 06-10-2023},
}