
@article{cmc.2024.048703,
  author   = {Mao, Jingyi and Zhou, Yuchen and Wang, Yifan and Li, Junyu and Liu, Ziqing and Bu, Fanliang},
  title    = {Attention-Enhanced Voice Portrait Model Using {Generative Adversarial Network}},
  journal  = {Computers, Materials \& Continua},
  year     = {2024},
  volume   = {79},
  number   = {1},
  pages    = {837--855},
  issn     = {1546-2226},
  doi      = {10.32604/cmc.2024.048703},
  url      = {http://www.techscience.com/cmc/v79n1/56320},
  abstract = {Voice portrait technology has explored and established the relationship between speakers' voices and their facial features, aiming to generate corresponding facial characteristics by providing the voice of an unknown speaker. Due to its powerful advantages in image generation, Generative Adversarial Networks (GANs) have now been widely applied across various fields. The existing Voice2Face methods for voice portraits are primarily based on GANs trained on voice-face paired datasets. However, voice portrait models solely constructed on GANs face limitations in image generation quality and struggle to maintain facial similarity. Additionally, the training process is relatively unstable, thereby affecting the overall generative performance of the model. To overcome the above challenges, we propose a novel deep Generative Adversarial Network model for audio-visual synthesis, named AVP-GAN (Attention-enhanced Voice Portrait Model using Generative Adversarial Network). This model is based on a convolutional attention mechanism and is capable of generating corresponding facial images from the voice of an unknown speaker. Firstly, to address the issue of training instability, we integrate convolutional neural networks with deep GANs. In the network architecture, we apply spectral normalization to constrain the variation of the discriminator, preventing issues such as mode collapse. Secondly, to enhance the model's ability to extract relevant features between the two modalities, we propose a voice portrait model based on convolutional attention. This model learns the mapping relationship between voice and facial features in a common space from both channel and spatial dimensions independently. Thirdly, to enhance the quality of generated faces, we have incorporated a degradation removal module and utilized pretrained facial GANs as facial priors to repair and enhance the clarity of the generated facial images. Experimental results demonstrate that our AVP-GAN achieved a cosine similarity of 0.511, outperforming the performance of our comparison model, and effectively achieved the generation of high-quality facial images corresponding to a speaker's voice.},
}



