@inproceedings{8b3d860bc53749fdb583d6efae181617,
  title     = {{Ventriloquist-Net}: Leveraging Speech Cues for Emotive Talking Head Generation},
  abstract  = {In this paper, we propose Ventriloquist-Net: A Talking Head Generation model that uses only a speech segment and a single source face image. It places emphasis on emotive expressions. Cues for generating these expressions are implicitly inferred from the speech clip only. We formulate our framework to comprise of independently trained modules to expedite convergence. This not only allows extension to datasets in a semi-supervised manner but also facilitates handling in-the-wild source images. Quantitative and qualitative evaluations on generated videos demonstrate state-of-the-art performance even on unseen input data. Implementation and supplementary videos are available at https://github.com/dipnds/VentriloquistNet.},
  keywords  = {Speech Emotion, Talking Head Generation},
  author    = {Das, Deepan and Khan, Qadeer and Cremers, Daniel},
  note      = {Publisher Copyright: {\textcopyright} 2022 IEEE.; 29th IEEE International Conference on Image Processing, ICIP 2022 ; Conference date: 16-10-2022 Through 19-10-2022},
  year      = {2022},
  doi       = {10.1109/ICIP46576.2022.9897657},
  language  = {English},
  series    = {Proceedings - International Conference on Image Processing, ICIP},
  publisher = {IEEE Computer Society},
  pages     = {1716--1720},
  booktitle = {2022 IEEE International Conference on Image Processing, ICIP 2022 - Proceedings},
}