
% Journal article on CAFE-GAN (text-to-image GAN); citation key mirrors the DOI suffix.
% Authors are separated by " and " in "Last, First" form; acronyms in the title are
% brace-protected so sentence-casing styles keep their capitalisation.
@article{cmc.2025.069482,
  author   = {Wang, Xuanhong and Guo, Hongyu and Li, Jiazhen and Wang, Mingchen and Wang, Xian and Zhang, Yijun},
  title    = {{CAFE-GAN}: {CLIP}-Projected {GAN} with Attention-Aware Generation and Multi-Scale Discrimination},
  journal  = {Computers, Materials \& Continua},
  volume   = {86},
  number   = {1},
  pages    = {1--19},
  year     = {2026},
  issn     = {1546-2226},
  doi      = {10.32604/cmc.2025.069482},
  url      = {http://www.techscience.com/cmc/v86n1/64467},
  abstract = {Over the past decade, large-scale pre-trained autoregressive and diffusion models rejuvenated the field of text-guided image generation. However, these models require enormous datasets and parameters, and their multi-step generation processes are often inefficient and difficult to control. To address these challenges, we propose CAFE-GAN, a CLIP-Projected GAN with Attention-Aware Generation and Multi-Scale Discrimination, which incorporates a pre-trained CLIP model along with several key architectural innovations. First, we embed a coordinate attention mechanism into the generator to capture long-range dependencies and enhance feature representation. Second, we introduce a trainable linear projection layer after the CLIP text encoder, which aligns textual embeddings with the generator's semantic space. Third, we design a multi-scale discriminator that leverages pre-trained visual features and integrates a feature regularization strategy, thereby improving training stability and discrimination performance. Experiments on the CUB and COCO datasets demonstrate that CAFE-GAN outperforms existing text-to-image generation methods, achieving lower Fr{\'e}chet Inception Distance (FID) scores and generating images with superior visual quality and semantic fidelity, with FID scores of 9.84 and 5.62 on the CUB and COCO datasets, respectively, surpassing current state-of-the-art text-to-image models by varying degrees. These findings offer valuable insights for future research on efficient, controllable text-to-image synthesis.},
}



