@comment{Journal article record. Authors are separated with " and " and given
in "Last, First" form so BibTeX parses three distinct names; acronyms in the
title are brace-protected so sentence-casing styles keep their capitals.}
@article{jbd.2021.017169,
  author   = {Li, Yue and Liu, Jin and Shang, Shengjie},
  title    = {{WMA}: A Multi-Scale Self-Attention Feature Extraction Network Based on Weight Sharing for {VQA}},
  journal  = {Journal on Big Data},
  volume   = {3},
  number   = {3},
  pages    = {111--118},
  year     = {2021},
  issn     = {2579-0056},
  doi      = {10.32604/jbd.2021.017169},
  url      = {http://www.techscience.com/jbd/v3n3/45672},
  abstract = {Visual Question Answering (VQA) has attracted extensive research
focus and has become a hot topic in deep learning recently. The development of
computer vision and natural language processing technology has contributed to
the advancement of this research area. Key solutions to improve the performance
of VQA system exist in feature extraction, multimodal fusion, and answer
prediction modules. There exists an unsolved issue in the popular VQA image
feature extraction module that extracts the fine-grained features from objects of
different scale difficultly. In this paper, a novel feature extraction network that
combines multi-scale convolution and self-attention branches to solve the above
problem is designed. Our approach achieves the state-of-the-art performance of a
single model on Pascal VOC 2012, VQA 1.0, and VQA 2.0 datasets.},
}