# NOT RUN {
# Input data on HDFS.
d <- ddf(hdfsConn("/path/to/big/data/on/hdfs"))

# Set RHIPE / Hadoop parameters.
# The buffer sizes control how many key/value pairs are sent to map / reduce
# tasks at a time; mapred.reduce.tasks is a Hadoop config parameter that
# controls the number of reduce tasks.
rhctl <- rhipeControl(
  mapred = list(
    rhipe_map_buff_size = 10000,
    mapred.reduce.tasks = 72,
    rhipe_reduce_buff_size = 1
  )
)

# Divide the input data by "var" using these control parameters.
divide(d, by = "var", output = hdfsConn("/path/to/output"), control = rhctl)
# }
# Run the code above in your browser using DataLab