
@article{cmc.2025.059102,
  author   = {Zhong, Naikang and Lin, Xiao and Du, Wen and Shi, Jin},
  title    = {Multi-Scale Feature Fusion and Advanced Representation Learning for Multi Label Image Classification},
  journal  = {Computers, Materials \& Continua},
  volume   = {82},
  number   = {3},
  pages    = {5285--5306},
  year     = {2025},
  issn     = {1546-2226},
  doi      = {10.32604/cmc.2025.059102},
  url      = {http://www.techscience.com/cmc/v82n3/59887},
  abstract = {Multi-label image classification is a challenging task due to the diverse sizes and complex backgrounds of objects in images. Obtaining class-specific precise representations at different scales is a key aspect of feature representation. However, existing methods often rely on the single-scale deep feature, neglecting shallow and deeper layer features, which poses challenges when predicting objects of varying scales within the same image. Although some studies have explored multi-scale features, they rarely address the flow of information between scales or efficiently obtain class-specific precise representations for features at different scales. To address these issues, we propose a two-stage, three-branch Transformer-based framework. The first stage incorporates multi-scale image feature extraction and hierarchical scale attention. This design enables the model to consider objects at various scales while enhancing the flow of information across different feature scales, improving the model's generalization to diverse object scales. The second stage includes a global feature enhancement module and a region selection module. The global feature enhancement module strengthens interconnections between different image regions, mitigating the issue of incomplete representations, while the region selection module models the cross-modal relationships between image features and labels. Together, these components enable the efficient acquisition of class-specific precise feature representations. Extensive experiments on public datasets, including COCO2014, VOC2007, and VOC2012, demonstrate the effectiveness of our proposed method. Our approach achieves consistent performance gains of 0.3\%, 0.4\%, and 0.2\% over state-of-the-art methods on the three datasets, respectively. These results validate the reliability and superiority of our approach for multi-label image classification.},
}



