@comment{
  Conference paper in the joint ISPA/IUCC/BDCloud/SocialCom/SustainCom 2018 proceedings.
  NOTE(review): month/day below (2 Jul) differ from the conference dates recorded in
  the note field (11-13 Dec 2018); presumably an early publication date taken from the
  exporting database -- confirm against the publisher record before relying on it.
}
@inproceedings{d52ca38c95304f46baf033210be2d5ba,
  author    = {Lu, Hongli and Xu, Guangping and Tang, Bo and Li, Shengli and Zhou, Mian},
  title     = {Efficient online stream deduplication for network block storage},
  editor    = {Chen, Jinjun and Yang, Laurence T.},
  booktitle = {Proceedings - 16th IEEE International Symposium on Parallel and Distributed Processing with Applications, 17th IEEE International Conference on Ubiquitous Computing and Communications, 8th IEEE International Conference on Big Data and Cloud Computing, 11th IEEE International Conference on Social Computing and Networking and 8th IEEE International Conference on Sustainable Computing and Communications, ISPA/IUCC/BDCloud/SocialCom/SustainCom 2018},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  pages     = {111--119},
  year      = {2018},
  month     = jul,
  day       = {2},
  doi       = {10.1109/BDCloud.2018.00029},
  language  = {English},
  keywords  = {B+ trees, CDC, Deduplication, Hash map, Metadata management},
  abstract  = {Deduplication is an effective technique to optimize storage utilization in data centers and cloud storage systems. It splits data into chunks and then identifies whether chunks are unique or not. Fixed-size chunking (FSC) is widely used in deduplication, which defines the chunk boundary with a fixed interval of bytes. Although it is simple and efficient, FSC may cause boundary shift issue, which usually decreases deduplication rate. Content-defined chunking (CDC) has been proposed to solve this problem. However, there are two challenges to apply CDC in deduplication for network block storage. One challenge is how to establish a mapping scheme between the stream offsets of a deduplicated chunk and its block address; the other challenge is to design an efficient index structure to organize metadata of data chunks on the disk. In this paper, we design two structures to solve the mapping problem and implement two backends to store metadata on network block storage devices, which are based on B+ trees and hash table, respectively. In order to achieve a better search performance on the disk, we reduce the size of the hash table and shrink the lookup range. We evaluate our schemes by real-world workloads. The experimental results show that our schemes have an excellent search performance at an acceptable cost of spatial sacrifice.},
  note      = {Publisher Copyright: {\textcopyright} 2018 IEEE.; 16th IEEE International Symposium on Parallel and Distributed Processing with Applications, 17th IEEE International Conference on Ubiquitous Computing and Communications, 8th IEEE International Conference on Big Data and Cloud Computing, 11th IEEE International Conference on Social Computing and Networking and 8th IEEE International Conference on Sustainable Computing and Communications, ISPA/IUCC/BDCloud/SocialCom/SustainCom 2018 ; Conference date: 11-12-2018 Through 13-12-2018},
}