
@Article{cmc.2026.078074,
AUTHOR = {Dong Zhang and Lianhe Shao and Weijie Xu and Xihan Wang and Quanli Gao},
TITLE = {Quantum-Inspired Complex-Valued Fusion Framework: Optimizing Intra-Modal Semantics and Inter-Modal Fusion in Multimodal Sarcasm Detection},
JOURNAL = {Computers, Materials \& Continua},
VOLUME = {},
YEAR = {2026},
NUMBER = {},
PAGES = {},
URL = {http://www.techscience.com/cmc/online/detail/26557},
ISSN = {1546-2226},
ABSTRACT = {As multimodal content proliferates on social media, accurately identifying sarcastic intent is essential for understanding public attitudes and tracking trends in public opinion. However, sarcastic expressions depend on context, exhibit inconsistencies across modalities, and carry implicitly contradictory semantics, which challenges traditional text-only methods. Existing multimodal methods yield limited detection performance because they assume symmetric modal interactions by default and struggle to capture the subtlety of sarcasm and the contradictions between modalities. This paper therefore proposes a quantum-inspired complex-valued fusion framework that optimizes intra-modal semantics and inter-modal fusion in multimodal sarcasm detection. First, the framework constructs a quantum-inspired complex-valued multimodal feature representation: the text, visual, and audio modalities are embedded into a complex-valued Hilbert space, and feature intensity and directional information are modeled through the amplitude and phase dimensions, respectively, providing highly expressive base features for fusion. Second, an asymmetric quantum interference fusion mechanism is designed: building on the principle of quantum interference, it introduces a directional interference term and trainable parameters to capture the asymmetric interaction between modalities, in which ``text dominates semantic interpretation and vision supplements detailed evidence'', thereby mining the modal contradictions on which sarcasm depends. Experimental results show that the F1-score of the proposed model improves by 3.71% and 2.74% over M2Seq2Seq and SRLM, respectively, on the MUStARD dataset, and by 0.28% and 0.83% over the same baselines on the Memotion dataset. Ablation experiments further verify the effectiveness of the key modules.},
DOI = {10.32604/cmc.2026.078074}
}
