
@article{cmc.2024.057392,
  author   = {Wang, Wenju and Gu, Zehua and Tang, Bang and Wang, Sen and Hao, Jianfei},
  title    = {{ACSF-ED}: Adaptive Cross-Scale Fusion Encoder-Decoder for Spatio-Temporal Action Detection},
  journal  = {Computers, Materials \& Continua},
  volume   = {82},
  year     = {2025},
  number   = {2},
  pages    = {2389--2414},
  url      = {http://www.techscience.com/cmc/v82n2/59445},
  issn     = {1546-2226},
  abstract = {Current spatio-temporal action detection methods lack sufficient capabilities in extracting and comprehending spatio-temporal information. This paper introduces an end-to-end Adaptive Cross-Scale Fusion Encoder-Decoder (ACSF-ED) network to predict the action and locate the object efficiently. In the Adaptive Cross-Scale Fusion Spatio-Temporal Encoder (ACSF ST-Encoder), the Asymptotic Cross-scale Feature-fusion Module (ACCFM) is designed to address the issue of information degradation caused by the propagation of high-level semantic information, thereby extracting high-quality multi-scale features to provide superior features for subsequent spatio-temporal information modeling. Within the Shared-Head Decoder structure, a shared classification and regression detection head is constructed. A multi-constraint loss function composed of one-to-one, one-to-many, and contrastive denoising losses is designed to address the problem of insufficient constraint force in predicting results with traditional methods. This loss function enhances the accuracy of model classification predictions and improves the proximity of regression position predictions to ground truth objects. The proposed method model is evaluated on the popular dataset UCF101-24 and JHMDB-21. Experimental results demonstrate that the proposed method achieves an accuracy of 81.52\% on the Frame-mAP metric, surpassing current existing methods.},
  doi      = {10.32604/cmc.2024.057392},
}



