# NOT RUN {
# Generate a big data.table with a relatively many columns
set.seed(1L)
dt = lapply(1:20, function(x) sample(c(-100:100), 5e6L, TRUE))
setDT(dt)[, id := sample(1e5, 5e6, TRUE)]
print(object.size(dt), units="Mb") # 400MB, not huge, but will do
# 'order' optimisation
options(datatable.optimize = 1L) # optimisation 'on'
system.time(ans1 <- dt[order(id)])
options(datatable.optimize = 0L) # optimisation 'off'
system.time(ans2 <- dt[order(id)])
identical(ans1, ans2)
# optimisation of 'lapply(.SD, fun)'
options(datatable.optimize = 1L) # optimisation 'on'
system.time(ans1 <- dt[, lapply(.SD, min), by=id])
options(datatable.optimize = 0L) # optimisation 'off'
system.time(ans2 <- dt[, lapply(.SD, min), by=id])
identical(ans1, ans2)
# optimisation of 'mean'
options(datatable.optimize = 1L) # optimisation 'on'
system.time(ans1 <- dt[, lapply(.SD, mean), by=id])
system.time(ans2 <- dt[, lapply(.SD, base::mean), by=id])
identical(ans1, ans2)
# optimisation of 'c(.N, lapply(.SD, ))'
options(datatable.optimize = 1L) # optimisation 'on'
system.time(ans1 <- dt[, c(.N, lapply(.SD, min)), by=id])
options(datatable.optimize = 0L) # optimisation 'off'
system.time(ans2 <- dt[, c(N=.N, lapply(.SD, min)), by=id])
identical(ans1, ans2)
# GForce
options(datatable.optimize = 2L) # optimisation 'on'
system.time(ans1 <- dt[, lapply(.SD, median), by=id])
system.time(ans2 <- dt[, lapply(.SD, function(x) as.numeric(stats::median(x))), by=id])
identical(ans1, ans2)
# restore optimization
options(datatable.optimize = Inf)
# auto indexing
options(datatable.auto.index = FALSE)
system.time(ans1 <- dt[id == 100L]) # vector scan
system.time(ans2 <- dt[id == 100L]) # vector scan
system.time(dt[id <!-- %in% 100:500]) # vector scan -->
options(datatable.auto.index = TRUE)
system.time(ans1 <- dt[id == 100L]) # index + binary search subset
system.time(ans2 <- dt[id == 100L]) # only binary search subset
system.time(dt[id <!-- %in% 100:500]) # only binary search subset again -->
# }
Run the code above in your browser using DataLab