% Journal article record for DOI 10.32604/cmes.2026.075080.
% Author names are joined with the " and " keyword, which is the only name
% separator BibTeX recognises; commas are reserved for the "Last, First" form.
% NOTE(review): names are kept in "First Last" order as exported; confirm the
% surname split for the multi-word names (e.g. "Mendez Mezquita",
% "Fabian Gongora") and switch them to "Last, First" form if they are compound.
% NOTE(review): pages 0--0 looks like an export placeholder; confirm the real
% page range or article number with the publisher.
@article{cmes.2026.075080,
  author   = {Sheng Luo and Rashid Abbasi and Hao Wang and Jinghua Xu and Dongyang Lyu and Aaron Zhang and Farhan Amin and Isabel de la Torre and Gerardo Mendez Mezquita and Henry Fabian Gongora},
  title    = {Robust Human Pose Estimation and Action Recognition Utilizing Feature Extraction},
  journal  = {Computer Modeling in Engineering \& Sciences},
  volume   = {146},
  number   = {3},
  pages    = {0--0},
  year     = {2026},
  issn     = {1526-1506},
  doi      = {10.32604/cmes.2026.075080},
  url      = {http://www.techscience.com/CMES/v146n3/66790},
  abstract = {Human pose estimation is crucial across diverse applications, from healthcare to human--computer interaction. Integrating inertial measurement units (IMUs) with monocular vision methods holds great potential for leveraging complementary modalities; however, existing approaches are often limited by IMU drift, noise, and underutilization of visual information. To address these limitations, we propose a novel dual-stream feature extraction framework that effectively combines temporal IMU data and single-view image features for improved pose estimation. Short-term dependencies in IMU sequences are captured with convolutional layers, while a Transformer-based architecture models long-range temporal dynamics. To mitigate IMU drift and inter-sensor inconsistencies, a complementary filtering module is introduced alongside a cross-channel interaction mechanism. Features from the IMU and image streams are then fused via a dedicated fusion module and further refined utilizing a high-precision regression head for accurate pose prediction. Experimental results on benchmark datasets demonstrate that our method significantly outperforms existing techniques in terms of estimation, accuracy, and robustness, validating the effectiveness of our dual-stream architecture.},
}