
@article{cmc.2020.010265,
  author   = {Li, Zijian and Chi, Chengying and Zhan, Yunyun},
  title    = {Corpus Augmentation for Improving Neural Machine Translation},
  journal  = {Computers, Materials \& Continua},
  year     = {2020},
  volume   = {64},
  number   = {1},
  pages    = {637--650},
  issn     = {1546-2226},
  doi      = {10.32604/cmc.2020.010265},
  url      = {http://www.techscience.com/cmc/v64n1/39164},
  abstract = {The translation quality of neural machine translation (NMT) systems depends
largely on the quality of large-scale bilingual parallel corpora available. Research shows
that under the condition of limited resources, the performance of NMT is greatly reduced,
and a large amount of high-quality bilingual parallel data is needed to train a competitive
translation model. However, not all languages have large-scale and high-quality bilingual
corpus resources available. In these cases, improving the quality of the corpora has
become the main focus to increase the accuracy of the NMT results. This paper proposes
a new method to improve the quality of data by using data cleaning, data expansion, and
other measures to expand the data at the word and sentence-level, thus improving the
richness of the bilingual data. The long short-term memory (LSTM) language model is
also used to ensure the smoothness of sentence construction in the process of sentence
construction. At the same time, it uses a variety of processing methods to improve the
quality of the bilingual data. Experiments using three standard test sets are conducted to
validate the proposed method; the most advanced fairseq-transformer NMT system is
used in the training. The results show that the proposed method has worked well on
improving the translation results. Compared with the state-of-the-art methods, the BLEU
value of our method is increased by 2.34 compared with that of the baseline.},
}



