% Journal article: HalluBench (Computers, Materials and Continua).
% Review notes (text outside an entry is ignored by BibTeX/Biber):
%  - author: names are separated with " and " and written "Last, First";
%    the export used a comma, which BibTeX parses as a single malformed name.
%  - title: HalluBench and LLM are brace-protected so sentence-casing styles
%    keep their capitalisation.
%  - year: taken from the DOI suffix / citation key (cmc.2026....).
%    TODO confirm against the publisher page once an issue is assigned.
%  - volume / number / pages: the export left these empty (pages held an
%    unfilled template placeholder), so they are omitted here. The URL path
%    suggests an online-first listing -- TODO add them when assigned.
%  - abstract: MathML residue replaced by LaTeX math, percent signs escaped.
@Article{cmc.2026.081260,
  author   = {Şenyayla, Betül and Onan, Aytuğ},
  title    = {{HalluBench}: A Multi-{LLM} Benchmark for Hallucination Evaluation and Reliability Analysis},
  journal  = {Computers, Materials \& Continua},
  year     = {2026},
  issn     = {1546-2226},
  doi      = {10.32604/cmc.2026.081260},
  url      = {http://www.techscience.com/cmc/online/detail/27175},
  abstract = {Large Language Models (LLMs) have become a cornerstone of modern natural language processing, achieving strong performance across diverse tasks. Despite these advances, their tendency to generate hallucinated or factually unsupported content remains a critical challenge for reliable deployment. Existing evaluation approaches predominantly rely on single-task settings and aggregate performance metrics, implicitly assuming that hallucination behavior is uniform across tasks. However, this assumption is fundamentally flawed, as hallucination characteristics vary significantly depending on task formulation, linguistic context, and evaluation criteria. To address these limitations, this paper proposes HalluBench, a task-aware multi-LLM benchmarking framework designed for systematic hallucination analysis and metric-task alignment. The framework evaluates ten language models across four representative task formulations—open-domain question answering, cross-lingual question answering, scientific claim verification, and LLM-as-a-judge assessment—using four benchmark datasets (five evaluation splits) and nine complementary evaluation metrics. Unlike conventional approaches, HalluBench introduces a metric–task alignment strategy that selects evaluation metrics based on their suitability for each task. Experimental results reveal that hallucination behavior is strongly task-dependent, with substantial variations observed across models and evaluation settings. Specifically, the proposed framework demonstrates that model reliability is highly sensitive to task formulation; for instance, in adversarial open-domain settings, performance differences of up to 15\% in Exact Match (EM) and 20\% in F1 scores are observed between top-tier and compact ($\sim$1B parameter) models.
By integrating lexical, semantic, and reference-based metrics within a pipeline, HalluBench provides a more robust and diagnostically informative evaluation framework compared to traditional single-task and single-metric benchmarks.},
}