
@Article{jbd.2021.016674,
AUTHOR = {Mingyang Duan and Jin Liu and Shiqi Lv},
TITLE = {Encoder-Decoder Based Multi-Feature Fusion Model for Image Caption Generation},
JOURNAL = {Journal on Big Data},
VOLUME = {3},
YEAR = {2021},
NUMBER = {2},
PAGES = {77--83},
URL = {http://www.techscience.com/jbd/v3n2/42218},
ISSN = {2579-0056},
ABSTRACT = {Image caption generation is an essential task in computer vision and 
image understanding. Contemporary image caption generation models usually 
use the encoder-decoder model as the underlying network structure. However, in 
traditional Encoder-Decoder architectures, only the global features of the 
images are extracted, while the local information of the images is not well 
utilized. This paper proposes an Encoder-Decoder model based on fused features 
and a novel mechanism for correcting the generated caption text. We first use VGG16 
and Faster R-CNN to extract global and local features in the encoder. Then, 
we train a bidirectional LSTM network with the fused features in the decoder. 
Finally, the extracted local features are used to correct the caption text. The 
experimental results demonstrate the effectiveness of the proposed method.},
DOI = {10.32604/jbd.2021.016674}
}



