
@Article{cmc.2025.072286,
AUTHOR = {Zhang, Zilin and Liu, Yan and Liu, Jia and Hou, Senbao and Zhang, Yuping and Wang, Chenyuan},
TITLE = {A Multimodal Sentiment Analysis Method Based on Multi-Granularity Guided Fusion},
JOURNAL = {Computers, Materials \& Continua},
VOLUME = {86},
YEAR = {2026},
NUMBER = {2},
PAGES = {1--14},
URL = {http://www.techscience.com/cmc/v86n2/64796},
ISSN = {1546-2226},
ABSTRACT = {With the growing demand for more comprehensive and nuanced sentiment understanding, Multimodal Sentiment Analysis (MSA) has gained significant traction in recent years and continues to attract widespread attention in the academic community. Despite notable advances, existing approaches still face critical challenges in both information modeling and modality fusion. On one hand, many current methods rely heavily on encoders to extract global features from each modality, which limits their ability to capture latent fine-grained emotional cues within modalities. On the other hand, prevailing fusion strategies often lack mechanisms to model semantic discrepancies across modalities and to adaptively regulate modality interactions. To address these limitations, we propose a novel framework for MSA, termed Multi-Granularity Guided Fusion (MGGF). The proposed framework consists of three core components: (i) Multi-Granularity Feature Extraction Module, which simultaneously captures both global and local emotional features within each modality, and integrates them to construct richer intra-modal representations; (ii) Cross-Modal Guidance Learning Module (CMGL), which introduces a cross-modal scoring mechanism to quantify the divergence and complementarity between modalities. These scores are then used as guiding signals to enable the fusion strategy to adaptively respond to scenarios of modality agreement or conflict; (iii) Cross-Modal Fusion Module (CMF), which learns the semantic dependencies among modalities and facilitates deep-level emotional feature interaction, thereby enhancing sentiment prediction with complementary information. We evaluate MGGF on two benchmark datasets: MVSA-Single and MVSA-Multiple. Experimental results demonstrate that MGGF outperforms the current state-of-the-art model CLMLF on MVSA-Single by achieving a 2.32\% improvement in F1 score. On MVSA-Multiple, it surpasses MGNNS with a 0.26\% increase in accuracy. 
These results substantiate the effectiveness of MGGF in addressing two major limitations of existing methods---insufficient intra-modal fine-grained sentiment modeling and inadequate cross-modal semantic fusion.},
DOI = {10.32604/cmc.2025.072286}
}



