@Article{cmc.2025.063723,
AUTHOR = {Go-Eun Woo, Sang-Bo Park, Gi-Tae Park, Muhammad Junaid, Hyung-Won Kim},
TITLE = {Low-Complexity Hardware Architecture for Batch Normalization of CNN Training Accelerator},
JOURNAL = {Computers, Materials \& Continua},
VOLUME = {84},
YEAR = {2025},
NUMBER = {2},
PAGES = {3241--3257},
URL = {http://www.techscience.com/cmc/v84n2/62876},
ISSN = {1546-2226},
ABSTRACT = {On-device Artificial Intelligence (AI) accelerators capable of not only inference but also training neural network models are in increasing demand in the industrial AI field, where frequent retraining is crucial due to frequent production changes. Batch normalization (BN) is fundamental to training convolutional neural networks (CNNs), but its implementation in compact accelerator chips remains challenging due to computational complexity, particularly in calculating statistical parameters and gradients across mini-batches. Existing accelerator architectures either compromise the training accuracy of CNNs through approximations or require substantial computational resources, limiting their practical deployment. We present a hardware-optimized BN accelerator that maintains training accuracy while significantly reducing computational overhead through three novel techniques: (1) resource-sharing for efficient resource utilization across forward and backward passes, (2) interleaved buffering for reduced dynamic random-access memory (DRAM) access latencies, and (3) zero-skipping for minimal gradient computation. Implemented on a VCU118 Field Programmable Gate Array (FPGA) on 100 MHz and validated using You Only Look Once version 2-tiny (YOLOv2-tiny) on the PASCAL Visual Object Classes (VOC) dataset, our normalization accelerator achieves a 72% reduction in processing time and 83% lower power consumption compared to a 2.4 GHz Intel Central Processing Unit (CPU) software normalization implementation, while maintaining accuracy (0.51% mean Average Precision (mAP) drop at floating-point 32 bits (FP32), 1.35% at brain floating-point 16 bits (bfloat16)). When integrated into a neural processing unit (NPU), the design demonstrates 63% and 97% performance improvements over AMD CPU and Reduced Instruction Set Computing-V (RISC-V) implementations, respectively. These results confirm that our proposed BN hardware design enables efficient, high-accuracy, and power-saving on-device training for modern CNNs. Our results demonstrate that efficient hardware implementation of standard batch normalization is achievable without sacrificing accuracy, enabling practical on-device CNN training with significantly reduced computational and power requirements.},
DOI = {10.32604/cmc.2025.063723}
}