% Conference paper in the IAEAC 2019 proceedings (IEEE), pages 2006--2011.
% Maintenance notes:
%   - The proceedings title is stored once, in booktitle; no series field is
%     given because its value was identical to booktitle.
%   - Conference name and dates are held in eventtitle/eventdate (used by
%     biblatex, ignored by classic BibTeX styles).
%   - funding and copyright are informational fields that standard styles
%     do not print.
@inproceedings{d5c48d5a9822429ab76ee3531a7a73fe,
  author     = {Tu, Shiying and Hu, Haojin and Sun, Ronglyu and Jing, Yanmei and He, Wenxue},
  title      = {Research on the Construction Method of {Chinese} - {Vietnamese} Parallel Corpus},
  booktitle  = {Proceedings of 2019 {IEEE} 4th Advanced Information Technology, Electronic and Automation Control Conference, {IAEAC} 2019},
  editor     = {Xu, Bing and Mou, Kefen},
  pages      = {2006--2011},
  publisher  = {Institute of Electrical and Electronics Engineers Inc.},
  year       = {2019},
  month      = dec,
  doi        = {10.1109/IAEAC47372.2019.8998000},
  language   = {English},
  keywords   = {Chinese-Vietnamese parallel corpus, corpus cleaning, parallel web crawling, sentence alignment},
  abstract   = {The Chinese-Vietnamese parallel corpus is the basic research problem in the fields of natural language processing. The traditional methods use the DOM tree or element anchors in HTML extract parallel sentences with low accuracy and slow alignment speed. Therefore, this paper proposes a new Web-based Chinese-Vietnamese parallel corpus construction scheme. The scheme will determine the parallel web page through the LDA (Latent Dirichlet Allocation) and Gibbs Sampling. And the BeautifulSoup and regular expression will be used to crawl the webpage text and clean the corpus. The DOM tree and the element anchors in HTML are used to optimize the extraction of parallel sentence pairs. Combined with the sentence length and Champollion algorithm, the dynamic programming algorithm is adopted to improve the correct rate and recall rate of sentence alignment. The program successfully established a million-level Chinese-Vietnamese parallel corpus.},
  eventtitle = {4th {IEEE} Advanced Information Technology, Electronic and Automation Control Conference, {IAEAC} 2019},
  eventdate  = {2019-12-20/2019-12-22},
  funding    = {This work was supported by Yunnan philosophy and social science planning project (2019QN048).},
  copyright  = {{\textcopyright} 2019 IEEE.},
}