% Journal article: SW-DDFT, Computers, Materials and Continua, vol. 84, no. 1 (2025).
% Authors are stored as "Last, First" joined by " and " so BibTeX can parse each name.
% Acronyms and proper nouns in the title are brace-protected against style recasing.
% The percent sign in the abstract is escaped so the field is safe if a style prints it.
@article{cmc.2025.063852,
  author   = {Lv, Xiaoguang and Liu, Tao and Qin, Han and Guo, Ying and Pan, Jingshan and Zhao, Dawei and Wu, Xiaoming and Yang, Meihong},
  title    = {{SW-DDFT}: Parallel Optimization of the Dynamical Density Functional Theory Algorithm Based on {Sunway Bluelight II} Supercomputer},
  journal  = {Computers, Materials \& Continua},
  volume   = {84},
  number   = {1},
  pages    = {1417--1436},
  year     = {2025},
  issn     = {1546-2226},
  doi      = {10.32604/cmc.2025.063852},
  url      = {http://www.techscience.com/cmc/v84n1/61749},
  abstract = {The Dynamical Density Functional Theory (DDFT) algorithm, derived by associating classical Density Functional Theory (DFT) with the fundamental Smoluchowski dynamical equation, describes the evolution of inhomogeneous fluid density distributions over time. It plays a significant role in studying the evolution of density distributions over time in inhomogeneous systems. The Sunway Bluelight II supercomputer, as a new generation of China's developed supercomputer, possesses powerful computational capabilities. Porting and optimizing industrial software on this platform holds significant importance. For the optimization of the DDFT algorithm, based on the Sunway Bluelight II supercomputer and the unique hardware architecture of the SW39000 processor, this work proposes three acceleration strategies to enhance computational efficiency and performance, including direct parallel optimization, local-memory constrained optimization for CPEs, and multi-core groups collaboration and communication optimization. This method combines the characteristics of the program's algorithm with the unique hardware architecture of the Sunway Bluelight II supercomputer, optimizing the storage and transmission structures to achieve a closer integration of software and hardware. For the first time, this paper presents Sunway-Dynamical Density Functional Theory (SW-DDFT). Experimental results show that SW-DDFT achieves a speedup of 6.67 times within a single-core group compared to the original DDFT implementation, with six core groups (a total of 384 CPEs), the maximum speedup can reach 28.64 times, and parallel efficiency can reach 71\%, demonstrating excellent acceleration performance.},
}