% Conference paper "Handling data skew in MapReduce" (CLOSER 2011 proceedings).
% The proceedings name is stored once, in booktitle; it is not a book series,
% and a duplicate series field makes common styles print the name twice.
% Braced words in title/booktitle keep their capitalisation under styles that
% recase titles.
% NOTE(review): the dates inside the note field look like DD-MM-YYYY -- confirm
% against the conference record before reformatting them.
@inproceedings{b9058c8da3464a198f84af34bb19b239,
  author    = {Gufler, Benjamin and Augsten, Nikolaus and Reiser, Angelika and Kemper, Alfons},
  title     = {Handling data skew in {MapReduce}},
  booktitle = {{CLOSER} 2011 - Proceedings of the 1st International Conference on Cloud Computing and Services Science},
  pages     = {574--583},
  year      = {2011},
  isbn      = {9789898425522},
  language  = {English},
  keywords  = {Data skew, Load balancing, MapReduce},
  abstract  = {MapReduce systems have become popular for processing large data sets and are increasingly being used in e-science applications. In contrast to simple application scenarios like word count, e-science applications involve complex computations which pose new challenges to MapReduce systems. In particular, (a) the runtime complexity of the reducer task is typically high, and (b) scientific data is often skewed. This leads to highly varying execution times for the reducers. Varying execution times result in low resource utilisation and high overall execution time since the next MapReduce cycle can only start after all reducers are done. In this paper we address the problem of efficiently processing MapReduce jobs with complex reducer tasks over skewed data. We define a new cost model that takes into account non-linear reducer tasks and we provide an algorithm to estimate the cost in a distributed environment. We propose two load balancing approaches, fine partitioning and dynamic fragmentation, that are based on our cost model and can deal with both skewed data and complex reduce tasks. Fine partitioning produces a fixed number of data partitions, dynamic fragmentation dynamically splits large partitions into smaller portions and replicates data if necessary. Our approaches can be seamlessly integrated into existing MapReduce systems like Hadoop. We empirically evaluate our solution on both synthetic data and real data from an e-science application.},
  note      = {1st International Conference on Cloud Computing and Services Science, CLOSER 2011 ; Conference date: 07-05-2011 Through 09-05-2011},
}