% StableTTS workshop paper, ICASSPW 2025 (IEEE).
% The citation key is kept exactly as exported so existing \cite commands still resolve.
% eventtitle/eventdate are biblatex fields; classic BibTeX styles silently ignore them,
% as they do the non-standard `copyright` field. They hold the conference and copyright
% information in structured, non-printing form instead of a free-text `note`.
% NOTE(review): the exported `keywords` field contained only what look like IEEE template
% placeholder terms, so it was dropped; add the paper's real index terms if they are needed.
@inproceedings{584a4019c83c42fcbb67b33f66a172e7,
  author     = {Chen, Zhiyong and Li, Xinnuo and Wu, Shuhang and Yang, Zhi and Ai, Zhiqi and Xu, Shugong},
  title      = {{StableTTS}: Towards Efficient Denoising Acoustic Decoder for Text to Speech Synthesis with Consistency Flow Matching},
  booktitle  = {2025 {IEEE} International Conference on Acoustics, Speech, and Signal Processing Workshops, {ICASSPW} 2025 - Workshop Proceedings},
  eventtitle = {2025 {IEEE} International Conference on Acoustics, Speech, and Signal Processing Workshops, {ICASSPW} 2025},
  eventdate  = {2025-04-06/2025-04-11},
  publisher  = {Institute of Electrical and Electronics Engineers Inc.},
  year       = {2025},
  doi        = {10.1109/ICASSPW65056.2025.11011026},
  language   = {English},
  copyright  = {{\textcopyright} 2025 IEEE},
  abstract   = {Current state-of-the-art text-to-speech (TTS) systems predominantly utilize denoising-based acoustic decoders with language models (LLMs) or with non-autoregressive front-ends, known for their superior performance in generating high-fidelity spectrum. In this study, we introduce an efficient TTS system that incorporates Consistency Flow Matching denoising training. This training approach significantly enhances the training efficiency and operational performance of denoising-based acoustic decoders in existing TTS or voice conversion systems, with no additional cost in the training process - a free lunch. To efficiently compare with other denoising strategies, we align with the latest advancements in the implementation of non-autoregressive-based TTS systems and build an efficient DiT-based TTS architecture. Our comprehensive evaluations against various denoising-based methods affirm the efficiency of our proposed system.},
}