
@article{cmc.2025.069058,
  author   = {Khan, Noman and {Afnan} and Lee, Mi Young and Min, Jakyoung},
  title    = {Head-Body Guided Deep Learning Framework for Dog Breed Recognition},
  journal  = {Computers, Materials \& Continua},
  volume   = {85},
  number   = {2},
  pages    = {2935--2958},
  year     = {2025},
  issn     = {1546-2226},
  doi      = {10.32604/cmc.2025.069058},
  url      = {http://www.techscience.com/cmc/v85n2/63849},
  abstract = {Fine-grained dog breed classification presents significant challenges due to subtle inter-class differences, pose variations, and intra-class diversity. To address these complexities and limitations of traditional handcrafted approaches, a novel and efficient two-stage Deep Learning (DL) framework tailored for robust fine-grained classification is proposed. In the first stage, a lightweight object detector, YOLO v8N (You Only Look Once Version 8 Nano), is fine-tuned to localize both the head and full body of the dog from each image. In the second stage, a dual-stream Vision Transformer (ViT) architecture independently processes the detected head and body regions, enabling the extraction of region-specific, complementary features. This dual-path approach improves feature discriminability by capturing localized cues that are vital for distinguishing visually similar breeds. The proposed framework introduces several key innovations: (1) a modular and lightweight head--body detection pipeline that balances accuracy with computational efficiency, (2) a region-aware ViT model that leverages spatial attention for enhanced fine-grained recognition, and (3) a training scheme incorporating advanced augmentations and structured supervision to maximize generalization. These contributions collectively enhance model performance while maintaining deployment efficiency. Extensive experiments conducted on the Tsinghua Dogs dataset validate the effectiveness of the approach. The model achieves an accuracy of 90.04\%, outperforming existing State-of-the-Art (SOTA) methods across all key evaluation metrics. Furthermore, statistical significance testing confirms the robustness of the observed improvements over multiple baselines. The proposed method presents an effective solution for breed recognition tasks and shows strong potential for broader applications, including pet surveillance, veterinary diagnostics, and cross-species classification. Notably, it achieved an accuracy of 96.85\% on the Oxford-IIIT Pet dataset, demonstrating its robustness across different species and breeds.},
}



