% Matsufuru and Sumiyoshi, ICCES vol. 21, no. 4 (2019), single-page abstract.
% Authors are separated with " and " (a comma would merge them into one name);
% acronyms in the title are braced so sentence-casing styles keep their case.
@Article{icces.2019.05390,
AUTHOR = {Matsufuru, Hideo and Sumiyoshi, Kohsuke},
TITLE = {Simulations of Core Collapse Supernova Explosion on {PEZY-SC} Processors and {GPUs}},
JOURNAL = {The International Conference on Computational \& Experimental Engineering and Sciences},
VOLUME = {21},
YEAR = {2019},
NUMBER = {4},
PAGES = {90},
URL = {http://www.techscience.com/icces/v21n4/32532},
ISSN = {1933-2815},
ABSTRACT = {The core collapse supernovae are one of key phenomena to understand the history of the Universe and the origin of heavy elements. To understand their explosion mechanism, large scale numerical simulations are essential that require to solve a multi-physics system described by coupled equations of hydrodynamics and neutrino-radiation transfer in multidimensions. Since the neutrino transfer is governed by the Boltzmann equation in six-dimensional space, necessary computational resource rapidly increases as the number of grids in simulations grows. So far numerical studies have been performed mostly on massively parallel computers and only a few studies have been made using accelerator architectures, such as GPUs, despite their large potential. The PEZY-SC processors are novel many-core architecture that have tremendous potential for scientific high-performance computing. While they share typical features with GPUs, one needs to port and optimize an application considering the difference in their multi-level structure of cores and caches to fully make use of their computational performance. In this work, we apply the PEZY-SC processors to the numerical computations of neutrino radiation hydrodynamics under spherically symmetry as a prototype of the multi-dimensional supernova simulations. Adopting the implicit scheme for discretized time evolution, the bottlenecks of the simulations are as follows: (a) An iterative linear equation solver for the coefficient matrix in the evolution equation, (b) Computation of collision term in the Boltzmann equation, and (c) Inversion of a block diagonal matrix used for preconditioning. The most time-consuming part is typically (a), while (b) and (c) are non-negligible. The steps (a) and (c) concern a block tridiagonal matrix composed of O(1000) block matrices each having O(500) ranks. 
We offload these hot spots to PEZY-SC processors employing PEZY-CL (a variant of OpenCL) in our simulation code based on our previous studies to exploit the GPUs using NVIDIA CUDA framework. We measure the performance on Suiren Blue (wth PEZY-SC) and Suiren2 (PEZY-SC2) systems at KEK and compared with the result on a system with NVIDIA P100 processors. The achieved performance is sufficient for simulations with improved resolutions than the previous model size, which are not sufficient for recent observational progress. We also discuss the similarity and difference of these architectures and how to ease the porting effort as a prospect for application to the multi-dimension simulations.},
DOI = {10.32604/icces.2019.05390}
}