% Conference paper by Dong and Herbert in the CloudCom 2015 proceedings.
% The proceedings title is stored in booktitle only (no series field), so it
% is printed once. Fields such as day, language, keywords and abstract are
% ignored by the standard BibTeX styles and kept as metadata.
@inproceedings{3fc1d5a58d5c495e94cc6e057d0ecb55,
  author    = {Dong, Dapeng and Herbert, John},
  title     = {Record-aware two-level compression for big textual data analysis acceleration},
  booktitle = {Proceedings - {IEEE} 7th International Conference on Cloud Computing Technology and Science, {CloudCom} 2015},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  pages     = {9--16},
  year      = {2016},
  month     = feb,
  day       = {1},
  doi       = {10.1109/CloudCom.2015.32},
  language  = {English},
  keywords  = {Big Data, Compression, Content-aware, Hadoop, MapReduce, Record-aware},
  abstract  = {An increasing volume of data puts MapReduce data analytic platforms such as Hadoop under constant resource pressure. A new two-phase text compression scheme has been specially designed to accelerate data analysis and reduce cluster resource usage, and this has been implemented for Hadoop. The scheme consists of two levels of compression. The first level compression allows a Hadoop program to consume the compressed data directly, thus reducing the data transmission cost within a cluster during analysis. The second level packages data into fixed-size blocks that respect the logical data records. This further reduces the data size to a size similar to that achieved by a higher-order entropy encoder while also making the compressed data splittable for the HDFS. The use of the compression scheme is made transparent to Hadoop developers by the provided utility functions. The compression scheme is evaluated using a set of standard MapReduce jobs for a selection of real-world datasets. The experimental results show an improvement on analysis performance of up to 72\% and compression ratios close to that achieved by a standard compressor such as Bzip.},
  note      = {Publisher Copyright: {\textcopyright} 2015 IEEE.; 7th IEEE International Conference on Cloud Computing Technology and Science, CloudCom 2015 ; Conference date: 30-11-2015 Through 03-12-2015},
}