% Journal article exported from the Tech Science Press site (CMES 147(1), 2026).
% The export carried a placeholder page range of 0--0, which is omitted here
% because it would print as a bogus range; add a pages (or article number)
% field once the publisher assigns one.
% The abstract is the published text; only LaTeX specials were escaped
% (percent signs, and the p-value comparisons set in math mode).
@article{cmes.2026.080335,
  author   = {Alamri, Faten S. and Ayesha, Noor and Zafar, Afia and Saleem, Adil Ali and Khan, Amjad R.},
  title    = {Multimodal Graph-Enhanced Vision Transformer for Interpretable Skin Lesion Classification},
  journal  = {Computer Modeling in Engineering \& Sciences},
  volume   = {147},
  number   = {1},
  year     = {2026},
  issn     = {1526-1506},
  doi      = {10.32604/cmes.2026.080335},
  url      = {http://www.techscience.com/CMES/v147n1/67156},
  abstract = {The use of automated skin lesion classification is still a disadvantage, since there is a great visual similarity between benign and malignant lesions. The majority of deep learning methods utilize dermoscopic images only, without taking into account clinical metadata employed by dermatologists on a regular basis. The following paper proposes a vision-graph multimodal framework that links Image encoding to graph neural networks based on metadata representation through the fusion of learnable attention. The framework focuses on three limitations, which are underutilization of clinical context, absence of interpretability, and suboptimal incorporation of modalities. Gradient-weighted Class Activation Mapping++ (Grad-CAM++) is used to obtain dual explainability of visual attention, and SHapley Additive exPlanations (SHAP) to obtain feature importance. Examining the HAM10000 and Derm7pt datasets, statistically significant advances ($p < 0.001$) of 89.3\% and 92.1\% accuracy are obtained, which is 4.1\% and 2.7\% higher than baselines that can only use images. Focusing on weight analysis will provide metadata with 37.7\% averaged variance with an error of 8.4\%, which confirms the clinical importance of multimodal modeling. The study of ablation shows that graph-based metadata encoding is 1.4\% better than standard multilayer perceptron encoding ($p = 0.003$).},
}