% Journal article record for DOI 10.32604/cmc.2026.080025.
% Cleaned from a publisher export: author names are now joined with " and ",
% and the empty volume/number fields and the literal "{pages}" template
% placeholder have been dropped.
% NOTE(review): year is inferred from the DOI suffix / citation key, not stated
% in the exported record -- confirm, and add volume/number/pages once assigned.
@article{cmc.2026.080025,
  author        = {Jahangir, Rashid and Nauman, Muhammad Asif and Saidani, Oumaima and Ramzan, Faisal},
  title         = {A Hybrid {CNN--BiLSTM} Framework for Speech Emotion Recognition with {TimeGAN}-Augmented Data and Contrastive Learning},
  journal       = {Computers, Materials \& Continua},
  year          = {2026},
  issn          = {1546-2226},
  doi           = {10.32604/cmc.2026.080025},
  url           = {http://www.techscience.com/cmc/online/detail/27243},
  abstract      = {Speech Emotion Recognition (SER) is a critical component of affective computing with broad applications in human–computer interaction, mental health monitoring, and intelligent multimedia systems. However, SER remains challenging due to the emotional ambiguity, lack of labeled data, class imbalance, and speaker variability. This study presents an effective SER framework that integrates contrastive representation learning, optimized spectrogram-based data augmentation, and selective synthetic data generation by using TimeGAN to enhance emotion classification performance. Contrastive learning enables the model to better discriminate acoustically similar emotions while Optuna automatically tunes augmentation strategies such as noise injection, time shifting, and time-frequency masking. Unlike existing approaches that apply synthetic generation uniformly across all classes, the proposed method targets only confusing or under-represented emotion classes to preserve the inter-class separability. A CNN-BiLSTM architecture is used to extract spectral and temporal information of the speech. The framework is evaluated with benchmark SER datasets—EMO-DB and RAVDESS—under speaker independent protocols. Experimental results demonstrate improved accuracy, robustness, and generalization under limited and imbalanced data conditions, supported by confusion matrices, UMAP, and t-SNE visualizations.},
  internal-note = {year inferred from DOI suffix -- TODO confirm; volume, number and pages were empty or placeholder in the exported record},
}