
@Article{cmc.2026.077916,
AUTHOR = {Jinjiang Lin and Yuan Lu and Han Li and Xiaolong Cai and Enyi Chen and Jiansheng Guan},
TITLE = {Secondary Realignment: An Embodied Intelligent Operational Framework Integrating Vision-Language and Action Two-Stage Models},
JOURNAL = {Computers, Materials \& Continua},
VOLUME = {},
YEAR = {2026},
NUMBER = {},
PAGES = {},
URL = {http://www.techscience.com/cmc/online/detail/26705},
ISSN = {1546-2226},
ABSTRACT = {Manipulating objects from verbal commands in cluttered environments remains a critical challenge in robotic arm research. Verbal commands are semantically abstract, whereas precise grasping and placement actions depend on fine-grained geometric perception; the gap between these two domains is a primary cause of operational errors. In cluttered scenes in particular, visual-spatial noise and background redundancy further disrupt attention distribution, significantly degrading the generalization of existing methods in unseen environments. To address these issues, this paper proposes the Secondary Realignment (SR) framework, which decouples vision-language alignment and vision-action alignment into two stages, mitigating semantic-geometric discrepancies hierarchically and substantially reducing errors in cross-modal mapping. To address noise and redundancy in visual-language features, we further design a Deep Sparse Self-Attention (DSSA) module that dynamically fuses sparse and dense attention mechanisms through self-learned parameters, adaptively enhancing relevant features while suppressing irrelevant noise. Extensive simulation experiments demonstrate that, compared with the state-of-the-art method A2, our approach achieves 9.7%, 9.9%, and 17.6% higher task success rates in grasping, placing, and pick-and-place tasks, respectively, further validating its effectiveness.},
DOI = {10.32604/cmc.2026.077916}
}
