% Journal article exported from the publisher's site and cleaned up by hand.
% NOTE(review): the export left volume, number and pages unassigned (the URL
% path is .../online/detail/..., so this is presumably an online-first
% article); add those fields once the publisher assigns them.
% NOTE(review): year is taken from the DOI suffix (cmc.2026.*); confirm
% against the published version.
@article{cmc.2026.083337,
  author   = {Tseng, Chien-Hao and Chen, Min-Yu and Lin, Meng-Wei and Wu, Jyh-Horng and Huang, Chung-I},
  title    = {A Grounded Multi-Agent Multimodal Large Language Model Framework for Interpretable Risk Assessment in Driving Scenes},
  journal  = {Computers, Materials \& Continua},
  year     = {2026},
  issn     = {1546-2226},
  doi      = {10.32604/cmc.2026.083337},
  url      = {http://www.techscience.com/cmc/online/detail/27296},
  abstract = {Context-aware driving assistance must do more than detect objects: it has to identify the cues that materially affect risk, separate observable evidence from inference, and produce recommendations that humans can audit. This paper presents a grounded multi-agent multimodal large language model (MLLM) framework for interpretable risk assessment in driving scenes. The framework decomposes reasoning into four stages---context relevance evaluation, visual interpretation, factual verification with anomaly extraction, and risk assessment with action recommendation---so that the final advisory is generated only from a verified intermediate representation rather than directly from a free-form scene description. We evaluate the framework on a manually labeled benchmark derived from BDD100K covering traffic-sign interpretation, traffic-density assessment, and pedestrian--vehicle interaction risk. The benchmark contains 600 frames with three-rater annotation and majority-vote labels (Fleiss' $\kappa = 0.79$ on risk levels); we explicitly discuss the implications of this scale for generalization and complement it with a multi-backbone stress test. Across five independent runs, the proposed framework improves risk accuracy from $74.3 \pm 0.9\%$ to $84.8 \pm 0.6\%$ and macro-F1 from $72.8 \pm 1.1\%$ to $83.1 \pm 0.7\%$ over a single-agent MLLM baseline. The hallucination rate---defined as the fraction of outputs containing at least one entity, attribute, or relation that has no visual support in the source frame---drops from $18.7\%$ to $8.9\%$, and the actionability score---a five-point human rating averaged over usefulness, specificity, and visual consistency---rises from $3.62$ to $4.28$. McNemar tests confirm that the gain in risk accuracy is statistically significant ($p < 0.001$). The framework is intended as a semantic decision-support layer for explainable advanced driver-assistance systems and human-centered autonomous-driving interfaces.},
}