
% Journal article record. Author names use the "Last, First" form joined by
% " and " so BibTeX splits them into six people and keeps compound given names
% and hyphenated surnames intact. No PAGES field: the exported record carried
% only an empty "--" range.
@Article{cmc.2025.073629,
AUTHOR = {Bernal-Beltrán, Tomás and Pan, Ronghao and García-Díaz, José Antonio and Salas-Zárate, María del Pilar and Paredes-Valverde, Mario Andrés and Valencia-García, Rafael},
TITLE = {Detection of Maliciously Disseminated Hate Speech in {Spanish} Using Fine-Tuning and In-Context Learning Techniques with Large Language Models},
JOURNAL = {Computers, Materials \& Continua},
VOLUME = {87},
YEAR = {2026},
NUMBER = {1},
URL = {http://www.techscience.com/cmc/v87n1/66072},
ISSN = {1546-2226},
ABSTRACT = {The malicious dissemination of hate speech via compromised accounts, automated bot networks and malware-driven social media campaigns has become a growing cybersecurity concern. Automatically detecting such content in Spanish is challenging due to linguistic complexity and the scarcity of annotated resources. In this paper, we compare two predominant AI-based approaches for the forensic detection of malicious hate speech: (1) fine-tuning encoder-only models that have been trained in Spanish and (2) In-Context Learning techniques (Zero- and Few-Shot Learning) with large-scale language models. Our approach goes beyond binary classification, proposing a comprehensive, multidimensional evaluation that labels each text by: (1) type of speech, (2) recipient, (3) level of intensity (ordinal) and (4) targeted group (multi-label). Performance is evaluated using an annotated Spanish corpus, standard metrics such as precision, recall and F1-score and stability-oriented metrics to evaluate the stability of the transition from zero-shot to few-shot prompting (Zero-to-Few Shot Retention and Zero-to-Few Shot Gain) are applied. The results indicate that fine-tuned encoder-only models (notably MarIA and BETO variants) consistently deliver the strongest and most reliable performance: in our experiments their macro F1-scores lie roughly in the range of approximately 46%–66% depending on the task. Zero-shot approaches are much less stable and typically yield substantially lower performance (observed F1-scores range approximately 0%–39%), often producing invalid outputs in practice. Few-shot prompting (e.g., Qwen 3 8B, Mistral 7B) generally improves stability and recall relative to pure zero-shot, bringing F1-scores into a moderate range of approximately 20%–51% but still falling short of fully fine-tuned models. These findings highlight the importance of supervised adaptation and discuss the potential of both paradigms as components in AI-powered cybersecurity and malware forensics systems designed to identify and mitigate coordinated online hate campaigns.},
DOI = {10.32604/cmc.2025.073629}
}



