@comment{Journal article in Computers, Materials and Continua, vol. 85, no. 1 (2025). Author names are separated by the keyword "and" and written in "Last, First" form so BibTeX parses four distinct authors.}
@Article{cmc.2025.065529,
AUTHOR = {Yu, Xian and Zhang, Jianxun and Tian, Siran and He, Xiaobao},
TITLE = {Optimizing Semantic and Texture Consistency in Video Generation},
JOURNAL = {Computers, Materials \& Continua},
VOLUME = {85},
YEAR = {2025},
NUMBER = {1},
PAGES = {1883--1897},
URL = {http://www.techscience.com/cmc/v85n1/63519},
ISSN = {1546-2226},
ABSTRACT = {In recent years, diffusion models have achieved remarkable progress in image generation. However, extending them to text-to-video (T2V) generation remains challenging, particularly in maintaining semantic consistency and visual quality across frames. Existing approaches often overlook the synergy between high-level semantics and low-level texture information, resulting in blurry or temporally inconsistent outputs. To address these issues, we propose Dual Consistency Training (DCT), a novel framework designed to jointly optimize semantic and texture consistency in video generation. Specifically, we introduce a multi-scale spatial adapter to enhance spatial feature extraction, and leverage the complementary strengths of CLIP and VGG—where CLIP focuses on high-level semantics and VGG captures fine-grained texture and detail. During training, a stepwise strategy is adopted to impose semantic and texture losses, constraining discrepancies between generated and ground-truth frames. Furthermore, we propose CLWS, which dynamically adjusts the balance between semantic and texture losses to facilitate more stable and effective optimization. Remarkably, DCT achieves high-quality video generation using only a single training video on a single NVIDIA A6000 GPU. Extensive experiments demonstrate that our method significantly improves temporal coherence and visual fidelity across various video generation tasks, verifying its effectiveness and generalizability.},
DOI = {10.32604/cmc.2025.065529}
}