@Article{cmc.2023.031177,
AUTHOR = {Mohammad Amaz Uddin, Mohammad Salah Uddin Chowdury, Mayeen Uddin Khandaker, Nissren Tamam, Abdelmoneim Sulieman},
TITLE = {The Efficacy of Deep Learning-Based Mixed Model for Speech Emotion Recognition},
JOURNAL = {Computers, Materials \& Continua},
VOLUME = {74},
YEAR = {2023},
NUMBER = {1},
PAGES = {1709--1722},
URL = {http://www.techscience.com/cmc/v74n1/49808},
ISSN = {1546-2226},
ABSTRACT = {Human speech indirectly represents the mental state or emotion of others. The use of Artificial Intelligence (AI)-based techniques may bring revolution in this modern era by recognizing emotion from speech. In this study, we introduced a robust method for emotion recognition from human speech using a well-performed preprocessing technique together with the deep learning-based mixed model consisting of Long Short-Term Memory (LSTM) and Convolutional Neural Network (CNN). About 2800 audio files were extracted from the Toronto emotional speech set (TESS) database for this study. A high pass and Savitzky Golay Filter have been used to obtain noise-free as well as smooth audio data. A total of seven types of emotions; Angry, Disgust, Fear, Happy, Neutral, Pleasant-surprise, and Sad were used in this study. Energy, Fundamental frequency, and Mel Frequency Cepstral Coefficient (MFCC) have been used to extract the emotion features, and these features resulted in 97.5% accuracy in the mixed LSTM + CNN model. This mixed model is found to be performed better than the usual state-of-the-art models in emotion recognition from speech. It also indicates that this mixed model could be effectively utilized in advanced research dealing with sound processing.},
DOI = {10.32604/cmc.2023.031177}
}