% Conference paper published in the LREC 2016 proceedings.
% Funding acknowledgements are stored in the non-standard `funding` field
% (ignored by bibliography styles) so they are not printed with the reference;
% the conference dates are stored in ISO form in `eventdate`.
@inproceedings{bcafd8fb98dd43728b62143042eae94a,
  author    = {Wang, Longyue and Zhang, Xiaojun and Tu, Zhaopeng and Way, Andy and Liu, Qun},
  title     = {Automatic construction of discourse corpora for dialogue translation},
  booktitle = {Proceedings of the 10th International Conference on Language Resources and Evaluation, {LREC} 2016},
  editor    = {Calzolari, Nicoletta and Choukri, Khalid and Mazo, Helene and Moreno, Asuncion and Declerck, Thierry and Goggi, Sara and Grobelnik, Marko and Odijk, Jan and Piperidis, Stelios and Maegaard, Bente and Mariani, Joseph},
  publisher = {European Language Resources Association (ELRA)},
  year      = {2016},
  pages     = {2748--2754},
  eventdate = {2016-05-23/2016-05-28},
  language  = {English},
  keywords  = {Dialogue, Discourse corpus, Information retrieval, Machine translation, Movie script, Movie subtitle},
  abstract  = {In this paper, a novel approach is proposed to automatically construct parallel discourse corpus for dialogue machine translation. Firstly, the parallel subtitle data and its corresponding monolingual movie script data are crawled and collected from Internet. Then tags such as speaker and discourse boundary from the script data are projected to its subtitle data via an information retrieval approach in order to map monolingual discourse to bilingual texts. We not only evaluate the mapping results, but also integrate speaker information into the translation. Experiments show our proposed method can achieve 81.79\% and 98.64\% accuracy on speaker and dialogue boundary annotation, and speaker-based language model adaptation can obtain around 0.5 BLEU points improvement in translation qualities. Finally, we publicly release around 100K parallel discourse data with manual speaker and dialogue boundary annotation.},
  funding   = {This work is supported by the Science Foundation of Ireland (SFI) ADAPT project (Grant No.:13/RC/2106), and partly supported by the DCU-Huawei Joint Project (Grant No.:201504032-A, YB2015090061). It is partly supported by the Open Projects Program of National Laboratory of Pattern Recognition (Grant 201407353) and the Open Projects Program of Centre of Translation of GDUFS (Grant CTS201501).},
}