# NOT RUN {
# Toy training data: two factor columns and one character column.
(traindata <- data.frame(
  var1 = as.factor(c("a", "b", "b", "c")),
  var2 = as.factor(c(1, 1, 2, 3)),
  var3 = c("val1", "val2", "val3", "val3"),
  stringsAsFactors = FALSE
))

# Toy new data: same columns, but with categories absent from the training set.
(newdata <- data.frame(
  var1 = as.factor(c("a", "b", "b", "c", "d", "d")),
  var2 = as.factor(c(1, 1, 2, 3, 4, 5)),
  var3 = c("val1", "val2", "val3", "val3", "val4", "val4"),
  stringsAsFactors = FALSE
))

# Dummies of the training set.
(dummies_train <- dummy(x = traindata))

# Dummies of the new set, built from the new set's own categories.
(dummies_new <- dummy(x = newdata))

# How many dummy variables were created that should not have been?
sum(!colnames(dummies_new) %in% colnames(dummies_train))

# Dummies of the new set, restricted to the categories found in the training set.
(dummies_new <- dummy(
  x = newdata,
  object = categories(traindata, p = "all")
))

# How many dummy variables were created that should not have been?
sum(!colnames(dummies_new) %in% colnames(dummies_train))

# Dummies of the training set, using the top 2 categories of every variable
# found in the training data.
dummy(x = traindata, p = 2)

# Dummies of the training set, using respectively the top 2, 3 and 1
# categories of the three variables found in the training data.
dummy(x = traindata, p = c(2, 3, 1))

# All dummies of the training data.
dummy(x = traindata)
# }
# NOT RUN {
#######################
# Examples for the ref parameter

# ref=TRUE, example 1
(DT <- data.table(a = c("a", "b"), b = c("c", "c")))
dummy(DT, ref = TRUE)
DT[] # DT has changed

# ref=TRUE, example 2
# uses exactly the same amount of memory as example 1
(DT <- data.table(a = c("a", "b"), b = c("c", "c")))
d1 <- dummy(DT, ref = TRUE)
DT[] # DT has changed
d1[] # d1 is a reference (not a copy) to DT

# ref=FALSE, example 3
# examples 1 and 2 are more memory efficient than example 3
(DT <- data.table(a = c("a", "b"), b = c("c", "c")))
d2 <- dummy(DT, ref = FALSE)
DT[] # DT has not changed
d2[] # d2 holds the result returned by dummy
# Deleting DT after dummy finishes would result in the same final
# memory footprint as examples 1 and 2, except that in example 3
# memory usage is higher while dummy is being executed, and this may be
# problematic when DT is large.
# }
# Run the code above in your browser using DataLab