% Conference paper: StyleFusion TTS, in the PRCV 2024 proceedings (Lecture Notes in Computer Science).
% The citation key is the exporter's opaque identifier; it is kept unchanged so existing cites still resolve.
% Capitalised names/acronyms in the title and booktitle are brace-protected against style recasing.
% Export residue formerly in the note field (copyright line, event name, event dates) now lives in
% copyright / eventtitle / eventdate; classic BibTeX styles ignore those fields, biblatex uses the event ones.
% NOTE(review): the LNCS volume number is not recorded in this entry -- add a volume field once confirmed.
@inproceedings{6162fa07734b42a785a58b691214e479,
  author     = {Chen, Zhiyong and Li, Xinnuo and Ai, Zhiqi and Xu, Shugong},
  title      = {{StyleFusion} {TTS}: Multimodal Style-Control and Enhanced Feature Fusion for Zero-Shot Text-to-Speech Synthesis},
  booktitle  = {Pattern Recognition and Computer Vision -- 7th {Chinese} Conference, {PRCV} 2024, Proceedings},
  editor     = {Lin, Zhouchen and Zha, Hongbin and Cheng, Ming-Ming and He, Ran and Liu, Cheng-Lin and Ubul, Kurban and Silamu, Wushouer and Zhou, Jie},
  series     = {Lecture Notes in Computer Science},
  publisher  = {Springer Science and Business Media Deutschland GmbH},
  year       = {2025},
  pages      = {263--277},
  doi        = {10.1007/978-981-97-8795-1_18},
  isbn       = {9789819787944},
  language   = {English},
  eventtitle = {7th Chinese Conference on Pattern Recognition and Computer Vision, PRCV 2024},
  eventdate  = {2024-10-18/2024-10-20},
  copyright  = {{\textcopyright} The Author(s), under exclusive license to Springer Nature Singapore Pte Ltd. 2025.},
  keywords   = {Multimodal learning, Text-to-speech synthesis, Voice cloning, Zero-shot learning},
  abstract   = {We introduce StyleFusion-TTS, a prompt and/or audio referenced, style- and speaker-controllable, zero-shot text-to-speech (TTS) synthesis system designed to enhance the editability and naturalness of current research literature. We propose a general front-end encoder as a compact and effective module to utilize multimodal inputs---including text prompts, audio references, and speaker timbre references---in a fully zero-shot manner and produce disentangled style and speaker control embeddings. Our novel approach also leverages a hierarchical conformer structure for the fusion of style and speaker control embeddings, aiming to achieve optimal feature fusion within the current advanced TTS architecture. StyleFusion-TTS is evaluated through multiple metrics, both subjectively and objectively. The system shows promising performance across our evaluations, suggesting its potential to contribute to the advancement of the field of zero-shot text-to-speech synthesis. A project website provides detailed information for demonstration and reproduction.},
}