library(data.table)
# In-place NA fill on a data.table vs. base-R replacement ----
# Compares the memory footprint of DTfillNA() against `x[is.na(x)] <- value`.
# The sizes quoted in the comments are the original author's observations.
# NOTE(review): DTfillNA() is not defined in this file and does not appear to
# be a data.table function; it must already be available (presumably from the
# package these examples document) -- confirm before running standalone.

# Two 50,000 x 1,000 tables; matrix() without data is filled with NA.
df1 <- data.frame(matrix(nrow = 50000, ncol = 1000))
df2 <- data.frame(matrix(nrow = 50000, ncol = 1000))
# Convert both to data.table by reference (no copy is made).
setDT(df1)
setDT(df2)
gc() # baseline memory usage with both tables alive... 380MB?
# The result is not assigned: the author relies on df2 being modified in
# place when low_mem = TRUE.
# `collect` presumably controls how often gc() runs during the fill -- TODO
# confirm against the DTfillNA documentation.
DTfillNA(df2, value = 0, low_mem = TRUE, collect = 20, silent = TRUE)
gc() # check memory usage peak... 600MB?
rm(df2)
gc() # 200MB only, let's try the base-R way with only 1 frame left...
df1[is.na(df1)] <- 0
gc() # with 1 data.table less, memory still peaked to 850MB (200MB->850MB)
# i.e. it took at least 3.5X more memory than the object alone
# Rebuild df2 and fill it with DTfillNA() again so both results can be compared.
df2 <- data.frame(matrix(nrow = 50000, ncol = 1000))
setDT(df2)
DTfillNA(df2, value = 0, low_mem = TRUE, collect = 20, silent = TRUE)
gc() # all good
identical(df1, df2) # TRUE => both approaches produce the same table
# Clean up and reset the "max used" statistics before the next experiment.
rm(df1, df2)
gc(reset = TRUE)
# Filling into a copy ----
# Let's try to make a copy: here the input is a plain data.frame (no setDT())
# and the result of DTfillNA() is assigned to a new object.
df1 <- data.frame(matrix(nrow = 50000, ncol = 1000))
# low_mem = FALSE: the filled result is taken from the return value, in
# contrast to the unassigned low_mem = TRUE calls elsewhere in this script.
df2 <- DTfillNA(df1, value = 99, low_mem = FALSE, collect = 50, silent = TRUE)
gc() # only 650MB, much better than doing df2 <- df1; df2[is.na(df2)] <- 99
# Clean up and reset the "max used" statistics before the next experiment.
rm(df1, df2)
gc(reset = TRUE)
# Per-column fill values ----
# This can't be done "easily" in base R without hacks: fill each of the 1000
# columns with its own value (1 to 1000, one value per column).
df1 <- data.frame(matrix(nrow = 50000, ncol = 1000))
# `value` is a vector with one entry per column (1000 values, 1000 columns).
df2 <- DTfillNA(df1, value = 1:1000, low_mem = FALSE, collect = 50, silent = TRUE)
gc() # only 650MB
# In-place fill on a plain data.frame ----
# You can do this on a data.frame too...
# It will NOT coerce the object to data.table.
# Just remember that the object does not update in real time in RStudio.
df2 <- data.frame(matrix(nrow = 50000, ncol = 1000))
# Result is not assigned: df2 itself is expected to hold the filled values.
DTfillNA(df2, value = 1:1000, low_mem = TRUE, collect = 50, silent = TRUE)
head(df2) # inspect the first rows to see the filled values
is.data.table(df2) # FALSE: replaced in place, without parent.env tricks
# Run the code above in your browser using DataLab