
@article{cmc.2026.079959,
  author   = {Li, Chengjing and Wang, Li and Zhao, Xiaoyan},
  title    = {Safe Robot Control through Multi-Task Offline Reinforcement Learning with Multi-Scale Distribution Debiasing},
  journal  = {Computers, Materials \& Continua},
  year     = {2026},
  note     = {Online first; volume, number, and pages not yet assigned},
  url      = {http://www.techscience.com/cmc/online/detail/26521},
  issn     = {1546-2226},
  abstract = {Robots perform diverse tasks in real-world scenarios. In safety-critical applications, robot control must prioritize satisfying safety constraints in addition to achieving high performance. Offline safe reinforcement learning avoids risky online exploration by training from a given dataset. However, most existing methods overlook two issues in offline data. First, non-zero cost signals are typically sparse, which leads to inaccurate cost value estimates and makes it difficult to impose effective safety constraints on the policy. Second, an imbalanced dataset biases policy learning toward unsafe behaviors. To address these challenges, we propose an actor-critic method ARMOR (multi-scAle Reweighting with Multi-task Offline cRitic). The multi-task critic treats reward, long-term cost, and short-term cost as multiple tasks, learns shared representations to capture common state information, and leverages dense reward signals to stabilize learning under sparse cost signals. To mitigate dataset imbalance, ARMOR performs counterfactual reasoning with the short-term cost to upweight critical safe transitions near the risk boundary and assigns higher weights to low-cost trajectories. It then performs multi-scale reweighting by combining transition-level and trajectory-level weights to debias data distribution and emphasize safe demonstrations. The actor is parameterized by a conditional diffusion policy and trained via weighted behavior cloning. ARMOR additionally incorporates a reward-guided objective and a long-term cost constraint to improve the reward-cost trade-off. Extensive experiments on continuous-control robot tasks show that ARMOR achieves competitive performance under safety constraints, with clear advantages in several challenging environments. Furthermore, ARMOR exhibits zero-shot adaptation capability, making it suitable for practical deployment.},
  doi      = {10.32604/cmc.2026.079959},
}



