
@Article{cmc.2026.077982,
AUTHOR = {Shuqiu Tan and Chunsheng Tan and Yahui Liu},
TITLE = {Hierarchical Joint Cross-Modal Attention and Gating Mechanism for Multimodal Sentiment Analysis},
JOURNAL = {Computers, Materials \& Continua},
VOLUME = {},
YEAR = {2026},
NUMBER = {},
PAGES = {},
URL = {http://www.techscience.com/cmc/online/detail/26623},
ISSN = {1546-2226},
ABSTRACT = {Multimodal sentiment analysis aims to accurately identify emotional states by comprehensively utilizing information from multiple sources such as text, audio, and visual data. However, semantic heterogeneity and temporal differences between modalities limit the effectiveness of feature fusion. To address this issue, this paper proposes a hierarchical joint cross-modal attention and gating mechanism (HJCAG) for multimodal sentiment analysis. The method introduces a hierarchical structure that divides modal interactions into bimodal and trimodal layers to progressively model the semantic relevance between modalities. First, deep features are extracted from the text, audio, and visual modalities using pre-trained models to obtain high-dimensional representations of semantics, speech, and facial expressions, which are then aligned to a unified feature space. Second, a joint cross-modal attention module is designed at the bimodal and trimodal levels, computing cross-attention weights from the correlation between the joint feature representation and the individual modal representations. By explicitly modeling multimodal interaction relationships and semantic alignment, the module fully leverages the complementary information of the different modalities. Furthermore, a gating mechanism adaptively controls the contribution weight of each modal feature, reducing interference from redundant information and improving the discriminative power of the fused representation. Finally, the fused global features are fed into an emotion classifier to identify emotional states. The proposed method achieves accuracies of 75.47 ± 0.22% and 69.25 ± 0.37% and weighted F1 scores of 76.84 ± 0.45% and 68.97 ± 0.41% on the Interactive Emotional Dyadic Motion Capture (IEMOCAP) database and the Multimodal Emotion Lines Dataset (MELD), respectively, outperforming mainstream multimodal baseline methods and verifying the effectiveness and robustness of the proposed method in multimodal feature fusion and emotion recognition.},
DOI = {10.32604/cmc.2026.077982}
}
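
The abstract describes computing cross-attention weights between a joint (multimodal) representation and each individual modality, then fusing the attended streams through a gate. The sketch below is a minimal, hypothetical PyTorch illustration of that idea at the bimodal level; it is not the authors' released code, and the layer choices, dimensions, and module names (JointCrossModalAttention, GatedFusion) are assumptions made only for demonstration.

```python
# Hypothetical sketch of bimodal joint cross-modal attention + gated fusion,
# following the mechanism described in the abstract (not the paper's code).
import torch
import torch.nn as nn
import torch.nn.functional as F


class JointCrossModalAttention(nn.Module):
    """Attend from a joint (concatenated) representation back to one modality."""

    def __init__(self, dim: int):
        super().__init__()
        self.query = nn.Linear(2 * dim, dim)   # joint representation -> query
        self.key = nn.Linear(dim, dim)         # single modality -> key
        self.value = nn.Linear(dim, dim)       # single modality -> value
        self.scale = dim ** -0.5

    def forward(self, joint: torch.Tensor, modality: torch.Tensor) -> torch.Tensor:
        # joint: (batch, seq, 2*dim), modality: (batch, seq, dim)
        q = self.query(joint)
        k = self.key(modality)
        v = self.value(modality)
        attn = F.softmax(q @ k.transpose(-2, -1) * self.scale, dim=-1)
        return attn @ v                        # modality features reweighted by joint context


class GatedFusion(nn.Module):
    """Adaptively weight two attended feature streams with a learned gate."""

    def __init__(self, dim: int):
        super().__init__()
        self.gate = nn.Linear(2 * dim, dim)

    def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        g = torch.sigmoid(self.gate(torch.cat([a, b], dim=-1)))
        return g * a + (1 - g) * b             # element-wise convex combination


if __name__ == "__main__":
    dim, batch, seq = 128, 4, 20
    text = torch.randn(batch, seq, dim)        # text features aligned to a shared space
    audio = torch.randn(batch, seq, dim)       # audio features aligned to the same space

    joint = torch.cat([text, audio], dim=-1)   # bimodal joint representation
    attend = JointCrossModalAttention(dim)
    fuse = GatedFusion(dim)

    fused = fuse(attend(joint, text), attend(joint, audio))
    print(fused.shape)                         # (batch, seq, dim) fused features
```

In the paper's hierarchy this bimodal step would presumably be repeated across modality pairs and extended to a trimodal layer before classification; the sketch only shows the core attention-then-gate pattern.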



