@article{cmc.2026.074115,
  author        = {Alnusayri, Mohammed and Xue, Tingting and Kamal, Saleha and Almujally, Nouf Abdullah and Alnowaiser, Khaled and Jalal, Ahmad and Liu, Hui},
  title         = {Group Activity Recognition in Crowded Scenes Using Multi-Stage Feature Optimization and {ST-GCN-LSTM} Networks},
  journal       = {Computers, Materials \& Continua},
  year          = {2026},
  url           = {http://www.techscience.com/cmc/online/detail/26611},
  issn          = {1546-2226},
  abstract      = {Group activity recognition in public environments is challenging due to dynamic formations, complex inter-person interactions, and frequent occlusions. Existing methods often emphasize individual actions, overlooking collective behavioral patterns. This work introduces a multi-modal framework integrating silhouette-based appearance and skeleton-based pose information for robust recognition in surveillance scenarios. You Only Look Once v11 (YOLOv11) detects persons, Segmenting Objects by LOcations version 2 (SOLOv2) segments instances, and AlphaPose extracts skeletons, followed by hierarchical grouping to form spatially coherent clusters. A hybrid feature extraction strategy combines handcrafted descriptors (Extended GIST (ExGIST), Distance Transform, Binary Robust Independent Elementary Features (BRIEF), Ridge) with deep representations, fused via multi-head attention. Feature selection is refined through a three-stage pipeline of Kernel Principal Component Analysis (K-PCA), mutual information ranking, and genetic algorithm-based optimization. Spatio-Temporal Graph Convolution Networks (ST-GCN) models spatio-temporal dependencies, while Long Short-Term Memory (LSTM) captures long-term dynamics for activity classification. On the Collective Activity Dataset (CAD), the framework achieves 96.80\% accuracy, surpassing state-of-the-art approaches. Its modular design ensures scalability and adaptability for intelligent surveillance and smart city applications.},
  internal-note = {NOTE(review): year was empty in the publisher export and is inferred from the DOI and citation key (cmc.2026) -- confirm. The empty volume and number fields and the unfilled pages placeholder were removed; add volume, number and pages once they are assigned.},
  doi           = {10.32604/cmc.2026.074115},
}