@Article{cmc.2020.010265,
  AUTHOR   = {Zijian Li and Chengying Chi and Yunyun Zhan},
  TITLE    = {Corpus Augmentation for Improving Neural Machine Translation},
  JOURNAL  = {Computers, Materials \& Continua},
  VOLUME   = {64},
  NUMBER   = {1},
  YEAR     = {2020},
  PAGES    = {637--650},
  URL      = {http://www.techscience.com/cmc/v64n1/39164},
  ISSN     = {1546-2226},
  ABSTRACT = {The translation quality of neural machine translation (NMT) systems depends largely on the quality and scale of the available bilingual parallel corpora. Research shows that NMT performance degrades sharply under low-resource conditions, and that a large amount of high-quality bilingual parallel data is needed to train a competitive translation model. However, not all languages have large-scale, high-quality bilingual corpora available. In such cases, improving corpus quality becomes the main means of increasing the accuracy of NMT results. This paper proposes a new method that improves data quality through data cleaning, data expansion, and other measures, augmenting the data at the word and sentence levels and thus enriching the bilingual corpus. A long short-term memory (LSTM) language model is used to ensure the fluency of the constructed sentences, and a variety of processing methods are applied to improve the quality of the bilingual data. Experiments on three standard test sets validate the proposed method, with training performed on the state-of-the-art fairseq Transformer NMT system. The results show that the proposed method improves translation quality, increasing the BLEU score by 2.34 over the baseline.},
  DOI      = {10.32604/cmc.2020.010265}
}