% Conference paper in the INTERSPEECH 2020 proceedings (see booktitle/series).
% The citation key looks like an exporter-generated identifier; it is kept
% unchanged so existing citations still resolve.
% Percent signs in the abstract are written as \% so that LaTeX does not treat
% them as comment starts when a style prints the abstract field.
% Author names use the "Last, First" form so name parsing never depends on case.
@inproceedings{412711df70064173a6cb895c7e5d522b,
  author    = {K{\"u}rzinger, Ludwig and Lindae, Nicolas and Klewitz, Palle and Rigoll, Gerhard},
  title     = {Lightweight end-to-end speech recognition from raw audio data using sinc-convolutions},
  booktitle = {Interspeech 2020},
  series    = {Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH},
  publisher = {International Speech Communication Association},
  year      = {2020},
  pages     = {1659--1663},
  doi       = {10.21437/Interspeech.2020-1392},
  isbn      = {9781713820697},
  language  = {English},
  keywords  = {Attention-based neural networks, Sinc convolutions, Speech recognition},
  abstract  = {Many end-to-end Automatic Speech Recognition (ASR) systems still rely on pre-processed frequency-domain features that are handcrafted to emulate the human hearing. Our work is motivated by recent advances in integrated learnable feature extraction. For this, we propose Lightweight Sinc-Convolutions (LSC) that integrate Sinc-convolutions with depthwise convolutions as a low-parameter machine-learnable feature extraction for end-to-end ASR systems. We integrated LSC into the hybrid CTC/attention architecture for evaluation. The resulting end-to-end model shows smooth convergence behaviour that is further improved by applying SpecAugment in the time domain. We also discuss filter-level improvements, such as using log-compression as activation function. Our model achieves a word error rate of 10.7\% on the TEDlium v2 test dataset, surpassing the corresponding architecture with log-mel filterbank features by an absolute 1.9\%, but only has 21\% of its model size.},
  note      = {Publisher Copyright: Copyright {\textcopyright} 2020 ISCA; 21st Annual Conference of the International Speech Communication Association, INTERSPEECH 2020 ; Conference date: 25-10-2020 Through 29-10-2020},
}