% Journal article record (Computers, Materials & Continua, vol. 84, no. 3, 2025).
% Authors are separated with the keyword "and", which is the only name-list
% delimiter BibTeX recognises; a comma inside a name means "Last, First".
% NOTE(review): names are kept in "First Last" order exactly as exported.
% For names ending in initials (e.g. "Meghana K H") BibTeX will take the final
% token as the surname -- confirm the intended family names and switch to the
% explicit "Last, First" form if that split is wrong.
@Article{cmc.2025.063726,
AUTHOR = {Sameera V Mohd Sagheer and Meghana K H and P M Ameer and Muneer Parayangat and Mohamed Abbas},
TITLE = {Transformers for Multi-Modal Image Analysis in Healthcare},
JOURNAL = {Computers, Materials \& Continua},
VOLUME = {84},
YEAR = {2025},
NUMBER = {3},
PAGES = {4259--4297},
URL = {http://www.techscience.com/cmc/v84n3/63129},
ISSN = {1546-2226},
ABSTRACT = {Integrating multiple medical imaging techniques, including Magnetic Resonance Imaging (MRI), Computed Tomography, Positron Emission Tomography (PET), and ultrasound, provides a comprehensive view of the patient health status. Each of these methods contributes unique diagnostic insights, enhancing the overall assessment of patient condition. Nevertheless, the amalgamation of data from multiple modalities presents difficulties due to disparities in resolution, data collection methods, and noise levels. While traditional models like Convolutional Neural Networks (CNNs) excel in single-modality tasks, they struggle to handle multi-modal complexities, lacking the capacity to model global relationships. This research presents a novel approach for examining multi-modal medical imagery using a transformer-based system. The framework employs self-attention and cross-attention mechanisms to synchronize and integrate features across various modalities. Additionally, it shows resilience to variations in noise and image quality, making it adaptable for real-time clinical use. To address the computational hurdles linked to transformer models, particularly in real-time clinical applications in resource-constrained environments, several optimization techniques have been integrated to boost scalability and efficiency. Initially, a streamlined transformer architecture was adopted to minimize the computational load while maintaining model effectiveness. Methods such as model pruning, quantization, and knowledge distillation have been applied to reduce the parameter count and enhance the inference speed. Furthermore, efficient attention mechanisms such as linear or sparse attention were employed to alleviate the substantial memory and processing requirements of traditional self-attention operations. 
For further deployment optimization, researchers have implemented hardware-aware acceleration strategies, including the use of TensorRT and ONNX-based model compression, to ensure efficient execution on edge devices. These optimizations allow the approach to function effectively in real-time clinical settings, ensuring viability even in environments with limited resources. Future research directions include integrating non-imaging data to facilitate personalized treatment and enhancing computational efficiency for implementation in resource-limited environments. This study highlights the transformative potential of transformer models in multi-modal medical imaging, offering improvements in diagnostic accuracy and patient care outcomes.},
DOI = {10.32604/cmc.2025.063726}
}