% Narejo et al. (2025), Computers, Materials & Continua 84(2): Sentiment-Driven Caption Generator (SDCG).
@Article{cmc.2025.065872,
AUTHOR = {Narejo, Komal Rani and Zan, Hongying and Dharmani, Kheem Parkash and Mamyrbayev, Orken and Akhmediyarova, Ainur and Alibiyeva, Zhibek and Alimkulova, Janna},
TITLE = {Optimizing Sentiment Integration in Image Captioning Using Transformer-Based Fusion Strategies},
JOURNAL = {Computers, Materials \& Continua},
VOLUME = {84},
YEAR = {2025},
NUMBER = {2},
PAGES = {3407--3429},
URL = {http://www.techscience.com/cmc/v84n2/62924},
ISSN = {1546-2226},
ABSTRACT = {While automatic image captioning systems have made notable progress in the past few years, generating captions that fully convey sentiment remains a considerable challenge. Although existing models achieve strong performance in visual recognition and factual description, they often fail to account for the emotional context that is naturally present in human-generated captions. To address this gap, we propose the Sentiment-Driven Caption Generator (SDCG), which combines transformer-based visual and textual processing with multi-level fusion. RoBERTa is used for extracting sentiment from textual input, while visual features are handled by the Vision Transformer (ViT). These features are fused using several fusion approaches, including Concatenation, Attention, Visual-Sentiment Co-Attention (VSCA), and Cross-Attention. Our experiments demonstrate that SDCG significantly outperforms baseline models such as the Generalized Image Transformer (GIT), which achieves 82.01\%, and Bootstrapping Language-Image Pre-training (BLIP), which achieves 83.07\%, in sentiment accuracy. While SDCG achieves 94.52\% sentiment accuracy and improves scores in BLEU and ROUGE-L, the model demonstrates clear advantages. More importantly, the captions are more natural, as they incorporate emotional cues and contextual awareness, making them resemble those written by a human.},
DOI = {10.32604/cmc.2025.065872}
}