% Journal article entry. Author names are separated by " and " (BibTeX's only
% valid name separator) and given in "Last, First" form; literal percent signs
% in the abstract are escaped so they cannot start a LaTeX comment.
@article{cmes.2025.068726,
  author   = {Alkhrijah, Yazeed and Khalid, Shehzad and Usman, Syed Muhammad and Jameel, Amina and Hamid, Danish},
  title    = {Fusing Geometric and Temporal Deep Features for High-Precision {Arabic} {Sign} {Language} Recognition},
  journal  = {Computer Modeling in Engineering \& Sciences},
  volume   = {144},
  number   = {1},
  pages    = {1113--1141},
  year     = {2025},
  doi      = {10.32604/cmes.2025.068726},
  url      = {http://www.techscience.com/CMES/v144n1/63309},
  issn     = {1526-1506},
  abstract = {Arabic Sign Language (ArSL) recognition plays a vital role in enhancing the communication for the Deaf and Hard of Hearing (DHH) community. Researchers have proposed multiple methods for automated recognition of ArSL; however, these methods face multiple challenges that include high gesture variability, occlusions, limited signer diversity, and the scarcity of large annotated datasets. Existing methods, often relying solely on either skeletal data or video-based features, struggle with generalization and robustness, especially in dynamic and real-world conditions. This paper proposes a novel multimodal ensemble classification framework that integrates geometric features derived from 3D skeletal joint distances and angles with temporal features extracted from RGB videos using the Inflated 3D ConvNet (I3D). By fusing these complementary modalities at the feature level and applying a majority-voting ensemble of XGBoost, Random Forest, and Support Vector Machine classifiers, the framework robustly captures both spatial configurations and motion dynamics of sign gestures. Feature selection using the Pearson Correlation Coefficient further enhances efficiency by reducing redundancy. Extensive experiments on the ArabSign dataset, which includes RGB videos and corresponding skeletal data, demonstrate that the proposed approach significantly outperforms state-of-the-art methods, achieving an average F1-score of 97\% using a majority-voting ensemble of XGBoost, Random Forest, and SVM classifiers, and improving recognition accuracy by more than 7\% over previous best methods. This work not only advances the technical state-of-the-art in ArSL recognition but also provides a scalable, real-time solution for practical deployment in educational, social, and assistive communication technologies. Even though this study is about Arabic Sign Language, the framework proposed here can be extended to different sign languages, creating possibilities for potentially worldwide applicability in sign language recognition tasks.},
}