% Journal article record for Computers, Materials and Continua.
% NOTE(review): year is inferred from the citation key / DOI suffix
% (cmc.2026.081460) -- confirm against the publisher page.
% NOTE(review): the exported record had empty volume/number fields and an
% unfilled "pages" template placeholder; the URL path (/cmc/online/) suggests
% an online-first listing. Add volume, number and pages once they are assigned.
@article{cmc.2026.081460,
  author   = {Pan, Shing-Tai and Huang, Yi-Zhen and Chen, Zhi-Qing},
  title    = {Improvement of Emotion Detection by Fusing Speech and Image Based on {CNN} with Temporal Models},
  journal  = {Computers, Materials \& Continua},
  year     = {2026},
  issn     = {1546-2226},
  doi      = {10.32604/cmc.2026.081460},
  url      = {http://www.techscience.com/cmc/online/detail/27376},
  abstract = {This paper proposes a multimodal fusion framework that integrates speech and visual features to enhance the accuracy of emotion recognition. The principal contribution lies in extending the visual component from single-image to multi-image emotion recognition. Specifically, the proposed framework employs an InceptionV3 Convolutional Neural Network (CNN)-based architecture to extract features from multiple facial images representing the speaker's expressions throughout an utterance. These features are concatenated into a single vector and subsequently processed by Long Short-Term Memory (LSTM) or Hidden Markov Model (HMM) for temporal modeling. For the speech modality, Mel-Frequency Cepstral Coefficients (MFCC) or filter bank features are extracted from processed audio signals and fed into a hybrid CNN--time-series model. The two modalities are then integrated through model-level and decision-level fusion strategies. Since recognition accuracy tends to degrade as the number of utterances and speakers increases, the Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS), which contains a moderate number of sentences and speakers, is adopted in this study. Experimental results demonstrate that the proposed multi-image approach improves recognition accuracy from 91\% to 96\% compared with the single-image baseline, and that the multimodal fusion framework consistently outperforms its single-modal counterpart.},
}