
@article{cmc.2025.072903,
  author   = {Sun, Zhongfan and Guo, Kan and Hu, Yongli and Zhang, Yong},
  title    = {Metacognition Inspired Reflective Chain-of-Thought for Knowledge-Based {VQA}},
  journal  = {Computers, Materials \& Continua},
  volume   = {87},
  number   = {1},
  year     = {2026},
  issn     = {1546-2226},
  url      = {http://www.techscience.com/cmc/v87n1/66044},
  doi      = {10.32604/cmc.2025.072903},
  abstract = {Knowledge-based Visual Question Answering (VQA) requires the integration of visual information with external knowledge reasoning. Existing approaches typically retrieve information from external corpora and rely on pretrained language models for reasoning. However, their performance is often hindered by the limited capabilities of retrievers and the constrained size of knowledge bases. Moreover, relying on image captions to bridge the modal gap between visual and language modalities can lead to the omission of critical visual details. To address these limitations, we propose the \textbf{Re}flective \textbf{C}hain-\textbf{o}f-\textbf{T}hought (\textbf{ReCoT}) method, a simple yet effective framework inspired by metacognition theory. ReCoT effectively activates the reasoning capabilities of Multimodal Large Language Models (MLLMs), providing essential visual and knowledge cues required to solve complex visual questions. It simulates a metacognitive reasoning process that encompasses monitoring, reflection, and correction. Specifically, in the initial generation stage, an MLLM produces a preliminary answer that serves as the model's initial cognitive output. During the reflective reasoning stage, this answer is critically examined to generate a reflective rationale that integrates key visual evidence and relevant knowledge. In the final refinement stage, a smaller language model leverages this rationale to revise the initial prediction, resulting in a more accurate final answer. By harnessing the strengths of MLLMs in visual and knowledge grounding, ReCoT enables smaller language models to reason effectively without dependence on image captions or external knowledge bases. Experimental results demonstrate that ReCoT achieves substantial performance improvements, outperforming state-of-the-art methods by 2.26\% on OK-VQA and 5.8\% on A-OKVQA.},
}



