% Journal article record for DOI 10.32604/cmes.2026.080403.
% NOTE(review): volume, number and pages were empty/placeholder values in the
% exported record (presumably an online-first article); they are omitted here
% and should be added once assigned. YEAR is taken from the year embedded in
% the DOI and citation key -- confirm against the publisher page.
@Article{cmes.2026.080403,
AUTHOR = {Aravapalli Rama Satish and Sai Babu Veesam and Shonak Bansal and Krishna Prakash and Mohammad Rashed Iqbal Faruque},
TITLE = {Causal Cross-Modal Context Fusion for Real-Time Video Summarization with Predictive Tracking and Validated Adaptive Evaluation},
JOURNAL = {Computer Modeling in Engineering \& Sciences},
YEAR = {2026},
URL = {http://www.techscience.com/CMES/online/detail/27127},
ISSN = {1526-1506},
ABSTRACT = {Real-time video streams now flood everything from security cameras to social media, yet current summarization systems still stumble when audio, visual, and semantic cues unfold with tangled cause-and-effect patterns. Most cross-modal transformers treat correlations as if time were a flat canvas, ignoring how an early sound might trigger a later visual event in the process. They also lack mechanisms to predict tracking uncertainty, adapt to narrative shifts, or evolve their own evaluation criteria, leaving summaries brittle and often incoherent in process. To address these gaps, we propose a Cross-Modal Context Fusion framework built from five tightly linked components. A Temporal-Causal Graph Memory Network captures directional cause-and-effect edges across audio, video, and semantic signals, improving the logical flow of detected key segments. Additionally, a Predictive Entropy Reinforcement Engine learns camera focus and keyframe decisions that minimize future uncertainty, thereby stabilizing tracking under rapid motion or noise. The Cross-Modal Residual Synergy Transformer explicitly models discrepancies, such as off-screen speech, and feeds those residuals back to refine the fusion. For long-form narrative coherence, a Dynamic Hierarchical Context Predictor alternates between micro-actions and macro-story arcs, balancing fine detail with global structure. Finally, a Self-Evolving Evaluation Loop meta-learns to adjust loss weights as deployment contexts shift, sustaining performance without costly full retraining sets. Experiments on SumMe, TVSum, and long-form documentaries indicate up to 15\% F1 and 12\% ROUGE-L gains, with human studies reporting 18\% higher perceived coherence and $>$90\% sustained approval in process. The result is a video summarizer that reasons causally, anticipates uncertainty, adapts its own metrics, and delivers concise yet narratively faithful summaries suited for demanding real-time applications.},
DOI = {10.32604/cmes.2026.080403}
}