
@Article{cmc.2024.057453,
AUTHOR = {Christine Dewi and Hanna Prillysca Chernovita and Stephen Abednego Philemon and Christian Adi Ananta and Abbott Po Shun Chen},
TITLE = {Adjusted Reasoning Module for Deep Visual Question Answering Using Vision Transformer},
JOURNAL = {Computers, Materials \& Continua},
VOLUME = {81},
YEAR = {2024},
NUMBER = {3},
PAGES = {4195--4216},
URL = {http://www.techscience.com/cmc/v81n3/59043},
ISSN = {1546-2226},
ABSTRACT = {Visual Question Answering (VQA) is an interdisciplinary artificial intelligence (AI) task that integrates computer vision and natural language processing. Its purpose is to enable machines to answer questions using visual information. A VQA system typically takes an image and a natural language question as input and produces a textual answer as output. One major obstacle in VQA is finding an effective method to extract and merge textual and visual data. We examine “Fusion” models that use information from both the text encoder and the image encoder to perform the visual question-answering task efficiently. For the text encoder, we use BERT and RoBERTa, which process the textual data. For the image encoder, we use ViT (Vision Transformer), DeiT (Data-efficient image Transformer), and BEiT (Bidirectional Encoder representation from Image Transformers). We updated the reasoning module of the VQA model and incorporated layer normalization to improve its performance. Compared with the results of previous research, our proposed method yields a substantial improvement in efficacy: our experiments achieve 60.4% accuracy on the PathVQA dataset and 69.2% accuracy on the VizWiz dataset.},
DOI = {10.32604/cmc.2024.057453}
}
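
Annotation (ignored by BibTeX): the abstract describes fusing a transformer text encoder (BERT/RoBERTa) with a transformer image encoder (ViT/DeiT/BEiT) through a reasoning module that adds layer normalization. Below is a minimal, hypothetical PyTorch sketch of that general pattern, assuming simple concatenation fusion of pooled encoder outputs; the class name FusionVQA, the MLP head, and the checkpoint choices are illustrative assumptions, not the authors' actual adjusted reasoning module.

import torch
import torch.nn as nn
from transformers import BertModel, ViTModel

class FusionVQA(nn.Module):
    """Illustrative concatenation-fusion VQA model (assumed, not the paper's exact design)."""

    def __init__(self, num_answers: int, hidden: int = 768):
        super().__init__()
        # Text and image encoders named in the abstract (one choice each).
        self.text_encoder = BertModel.from_pretrained("bert-base-uncased")
        self.image_encoder = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
        # Layer normalization over the fused features, as the abstract describes.
        self.norm = nn.LayerNorm(2 * hidden)
        # A small classifier standing in for the paper's adjusted reasoning module.
        self.reasoner = nn.Sequential(
            nn.Linear(2 * hidden, hidden),
            nn.GELU(),
            nn.Linear(hidden, num_answers),
        )

    def forward(self, input_ids, attention_mask, pixel_values):
        # Pooled [CLS]-style representations from each modality.
        txt = self.text_encoder(input_ids=input_ids,
                                attention_mask=attention_mask).pooler_output
        img = self.image_encoder(pixel_values=pixel_values).pooler_output
        # Concatenate, normalize, and classify over the answer vocabulary.
        fused = self.norm(torch.cat([txt, img], dim=-1))
        return self.reasoner(fused)  # logits, shape (batch, num_answers)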
