% Journal article: Computers, Materials & Continua, vol. 83, no. 2 (2025), pp. 2221--2244.
% Authors are separated with " and " (BibTeX's only name separator); a
% comma-separated list is read as one malformed name.
% NOTE(review): names are kept in "First Last" order as supplied. In that form
% "Omer Bin Hussain" is parsed with surname "Hussain"; if the family name is
% "Bin Hussain", rewrite that author as "Bin Hussain, Omer" -- confirm with the paper.
% Percent signs in the abstract are escaped so styles that print it do not
% comment out the rest of the line.
@article{cmc.2025.061977,
  author   = {Omar Alqahtani and Mohamed Ghouse and Asfia Sabahath and Omer Bin Hussain and Arshiya Begum},
  title    = {Multi-Scale Vision Transformer with Dynamic Multi-Loss Function for Medical Image Retrieval and Classification},
  journal  = {Computers, Materials \& Continua},
  volume   = {83},
  number   = {2},
  pages    = {2221--2244},
  year     = {2025},
  issn     = {1546-2226},
  doi      = {10.32604/cmc.2025.061977},
  url      = {http://www.techscience.com/cmc/v83n2/60568},
  abstract = {This paper introduces a novel method for medical image retrieval and classification by integrating a multi-scale encoding mechanism with Vision Transformer (ViT) architectures and a dynamic multi-loss function. The multi-scale encoding significantly enhances the model's ability to capture both fine-grained and global features, while the dynamic loss function adapts during training to optimize classification accuracy and retrieval performance. Our approach was evaluated on the ISIC-2018 and ChestX-ray14 datasets, yielding notable improvements. Specifically, on the ISIC-2018 dataset, our method achieves an F1-Score improvement of +4.84\% compared to the standard ViT, with a precision increase of +5.46\% for melanoma (MEL). On the ChestX-ray14 dataset, the method delivers an F1-Score improvement of 5.3\% over the conventional ViT, with precision gains of +5.0\% for pneumonia (PNEU) and +5.4\% for fibrosis (FIB). Experimental results demonstrate that our approach outperforms traditional CNN-based models and existing ViT variants, particularly in retrieving relevant medical cases and enhancing diagnostic accuracy. These findings highlight the potential of the proposed method for large-scale medical image analysis, offering improved tools for clinical decision-making through superior classification and case comparison.},
}