% Conference paper: "Record-aware compression for big textual data analysis
% acceleration" (Dong and Herbert), in the IEEE Big Data 2015 proceedings.
% Notes for maintainers:
%   - The citation key is kept as-is so existing \cite commands keep resolving.
%   - There is deliberately no `series` field: the proceedings title belongs in
%     `booktitle` only; repeating it in `series` makes standard styles print it
%     twice for @inproceedings entries.
%   - Percent signs in the abstract are written as \% so that styles which
%     typeset the abstract do not treat them as LaTeX comment starts.
%   - `day`, `language`, `keywords` and `abstract` are not used by the classic
%     styles (unknown fields are ignored) but are retained as metadata.
@inproceedings{5094c1d42c754792b26c10e3eb4aaf44,
  author    = {Dong, Dapeng and Herbert, John},
  title     = {Record-aware compression for big textual data analysis acceleration},
  booktitle = {Proceedings - 2015 {IEEE} International Conference on {Big Data}, {IEEE} {Big Data} 2015},
  editor    = {Luo, Feng and Ogan, Kemafor and Zaki, Mohammed J. and Haas, Laura and Ooi, Beng Chin and Kumar, Vipin and Rachuri, Sudarsan and Pyne, Saumyadipta and Ho, Howard and Hu, Xiaohua and Yu, Shipeng and Hsiao, Morris Hui-I and Li, Jian},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  year      = {2015},
  month     = dec,
  day       = {22},
  pages     = {1183--1190},
  doi       = {10.1109/BigData.2015.7363872},
  language  = {English},
  keywords  = {Big Data, Compression, Hadoop, MapReduce, Record-aware},
  abstract  = {Big data analysis technologies are becoming more widely used in industry. The ever-increasing data volume, however, puts data analytic platforms such as Hadoop under constant pressure. Several compression methods have been made available on the Hadoop platform to effectively reduce data size and efficiently deliver data between cluster nodes. In the Hadoop context, compressed data can be categorized as splittable or non-splittable. Working with non-splittable data conflicts with the goal of parallelism. In addition, the current realization of splittable data by indexing is potentially harmful to the data locality property. To this end, we introduce the Record-aware Compression (RaC) scheme that makes the compressed contents splittable, uses a lightweight Hadoop Record Reader, and preserves the parallelism and data locality properties as much as possible. We evaluate RaC using a set of classical MapReduce jobs with a collection of well-known datasets from companies such as Google, Yahoo!, and Amazon. The experimental results show an average 24\% improvement on analysis performance and up to 75\% data size reduction.},
  note      = {Publisher Copyright: {\textcopyright} 2015 IEEE.; 3rd IEEE International Conference on Big Data, IEEE Big Data 2015 ; Conference date: 29-10-2015 Through 01-11-2015},
}