# Build a small demo data.frame with one column per type exercised below:
# logical, integer, double, factor, ordered factor, POSIXct and Date.
message("create some csv data on disk")
x <- data.frame(
  log = rep(c(FALSE, TRUE), length.out = 26),
  int = seq_len(26),
  dbl = seq_len(26) + 0.1,
  fac = factor(letters),
  ord = ordered(LETTERS),
  dct = Sys.time() + seq_len(26),
  dat = seq(as.Date("1910/1/1"), length.out = 26, by = 1),
  stringsAsFactors = TRUE
)
# Keep rows 13..1 and repeat them once, so the file holds 26 rows in which
# only the first 13 factor levels occur, in descending order.
x <- x[rep(13:1, times = 2), ]
# Write the demo data to a csv file located under the directory given by
# the "fftempdir" option.
csvfile <- tempPathFile(path = getOption("fftempdir"), extension = "csv")
write.csv(x, file = csvfile, row.names = FALSE)

# Baseline: read the file into RAM with base R and print it.
cat("Simply read csv with header\n")
y <- read.csv(file = csvfile, header = TRUE)
y

# Read the same file into an ffdf, print it, and show the class of each
# column after pulling all rows into RAM with ffy[, ].
cat("Read csv with header\n")
ffy <- read.csv.ffdf(file = csvfile, header = TRUE)
ffy
sapply(ffy[, ], class)
message("reading with colClasses (an ordered factor wont'work in read.csv)")
# Per the message above, this base read.csv() call is expected to fail for
# the "ordered" colClass; try() keeps the script running.
try(
  read.csv(
    file = csvfile,
    header = TRUE,
    colClasses = c(ord = "ordered"),
    stringsAsFactors = TRUE
  )
)
# TODO: this could be fixed with the following two commands
# (Gabor Grothendieck), but it is unknown what bad side effects they
# might have.
#setOldClass("ordered")
#setAs("character", "ordered", function(from) ordered(from))

# RAM version: only the date/time columns get explicit classes.
y <- read.csv(
  file = csvfile,
  header = TRUE,
  colClasses = c(dct = "POSIXct", dat = "Date"),
  stringsAsFactors = TRUE
)
# ff version: additionally requests an ordered factor for 'ord'.
ffy <- read.csv.ffdf(
  file = csvfile,
  header = TRUE,
  colClasses = c(ord = "ordered", dct = "POSIXct", dat = "Date")
)
# Compare column classes (RAM vs. ff) and the ff storage modes side by side.
# vapply() guarantees exactly one collapsed class string per column.
rbind(
  ram_class = vapply(
    y,
    function(x) paste(class(x), collapse = ","),
    character(1)
  ),
  ff_class = vapply(
    ffy[, ],
    function(x) paste(class(x), collapse = ","),
    character(1)
  ),
  ff_vmode = vmode(ffy)
)
message("NOTE that reading in chunks can change the sequence of levels and thus the coding")
message("(Sorting levels during chunked reading can be too expensive)")
# Level order of 'fac' from the previous read, for comparison.
levels(ffy$fac[])

# Re-read the file in chunks: 6 rows first, then 10 rows per further chunk.
ffy <- read.csv.ffdf(
  file = csvfile,
  header = TRUE,
  colClasses = c(ord = "ordered", dct = "POSIXct", dat = "Date"),
  first.rows = 6,
  next.rows = 10,
  VERBOSE = TRUE
)
levels(ffy$fac[])

message("If we don't know the levels we can sort then after reading")
message("(Will rewrite all factor codes)")
message("NOTE that you MUST assign the return value of sortLevels()")
# The result is assigned back, as the message above demands.
ffy <- sortLevels(ffy)
levels(ffy$fac[])
message("If we KNOW the levels we can fix levels upfront")
# Same chunked read as before, but with the full level sets for 'fac' and
# 'ord' supplied through the 'levels' argument.
ffy <- read.csv.ffdf(
  file = csvfile,
  header = TRUE,
  colClasses = c(ord = "ordered", dct = "POSIXct", dat = "Date"),
  first.rows = 6,
  next.rows = 10,
  levels = list(fac = letters, ord = LETTERS)
)
levels(ffy$fac[])
message("Or we inspect a sufficiently large chunk of data and use those")
# Frequency table of 'fac' (NA included) before re-reading.
table(ffy$fac[], exclude = NULL)

# Step 1: read only the first 13 data rows.
ffy <- read.csv.ffdf(
  file = csvfile,
  header = TRUE,
  colClasses = c(ord = "ordered", dct = "POSIXct", dat = "Date"),
  nrows = 13,
  VERBOSE = TRUE
)

message("append the rest to ffy")
# Step 2: pass the existing ffdf as 'x' and skip the header line plus the
# rows that were already read.
ffy <- read.csv.ffdf(
  x = ffy,
  file = csvfile,
  header = FALSE,
  skip = 1 + nrow(ffy),
  VERBOSE = TRUE
)
table(ffy$fac[], exclude = NULL)
message("We can turn unexpected factor levels to NA, say we only allowed a:l")
# Restrict both factors to their first 12 levels and forbid appending new
# levels while reading (appendLevels = FALSE).
ffy <- read.csv.ffdf(
  file = csvfile,
  header = TRUE,
  colClasses = c(ord = "ordered", dct = "POSIXct", dat = "Date"),
  levels = list(fac = letters[1:12], ord = LETTERS[1:12]),
  appendLevels = FALSE
)
# Count the NAs per column; vapply() pins the result to one integer each.
vapply(
  colnames(ffy),
  function(i) sum(is.na(ffy[[i]][])),
  integer(1)
)
message("let's store some columns more efficient")
# Sum of per-column .ffbytes for the current vmodes, before the change.
sum(.ffbytes[vmode(ffy)])

# Replace three columns by clones stored with smaller vmodes.
ffy$log <- clone(ffy$log, vmode = "boolean")
ffy$fac <- clone(ffy$fac, vmode = "byte")
ffy$ord <- clone(ffy$ord, vmode = "byte")

# Same sum after the change.
sum(.ffbytes[vmode(ffy)])
message("let's make a template with zero rows")
# Clone the current ffdf and truncate the clone to zero rows, so it can be
# passed as the 'x' template for the reads below.
ffx <- clone(ffy)
nrow(ffx) <- 0

message("reading with template and colClasses")
ffy <- read.csv.ffdf(
  x = ffx,
  file = csvfile,
  header = TRUE,
  colClasses = c(ord = "ordered", dct = "POSIXct", dat = "Date"),
  next.rows = 12,
  VERBOSE = TRUE
)
# Show column classes and ff storage modes of the result.
# vapply() guarantees exactly one collapsed class string per column.
rbind(
  ff_class = vapply(
    ffy[, ],
    function(x) paste(class(x), collapse = ","),
    character(1)
  ),
  ff_vmode = vmode(ffy)
)
# Compare the factor levels of the template with those of the result.
levels(ffx$fac[])
levels(ffy$fac[])
message("reading with template without colClasses")
# Same read as the preceding template example, but without colClasses.
ffy <- read.csv.ffdf(
  x = ffx,
  file = csvfile,
  header = TRUE,
  next.rows = 12,
  VERBOSE = TRUE
)
# Show column classes and ff storage modes of the result.
# vapply() guarantees exactly one collapsed class string per column.
rbind(
  ff_class = vapply(
    ffy[, ],
    function(x) paste(class(x), collapse = ","),
    character(1)
  ),
  ff_vmode = vmode(ffy)
)
# Compare the factor levels of the template with those of the result.
levels(ffx$fac[])
levels(ffy$fac[])
message("We can fine-tune the creation of the ffdf")
message("- let's create the ff files outside of fftempdir")
message("- let's reduce required disk space and thus file.system cache RAM")
message("By default we had record size 36.25")
# asffdf_args carries a per-column vmode and, via col_args, a file name
# pattern for the ff files that get created.
ffy <- read.csv.ffdf(
  file = csvfile,
  header = TRUE,
  colClasses = c(ord = "ordered", dct = "POSIXct", dat = "Date"),
  asffdf_args = list(
    vmode = c(
      log = "boolean",
      int = "byte",
      dbl = "single",
      fac = "nibble", # no NAs
      ord = "nibble", # no NAs
      dct = "single",
      dat = "single"
    ),
    col_args = list(pattern = "./csv") # create in getwd() with prefix csv
  )
)
vmode(ffy)

message("This recordsize is more than 50% reduced")
# Ratio of the new record size to the default size quoted above (36.25).
sum(.ffbytes[vmode(ffy)]) / 36.25

message("Don't forget to wrap-up files that are not in fftempdir")
delete(ffy)
rm(ffy)

message("It's a good habit to also wrap-up temporary stuff (or at least know how this is done)")
rm(ffx)
gc()
# Fixed-width files: read.table.ffdf() with FUN = "read.fwf" must return
# the same data as read.fwf() itself, for several 'widths' specifications.
fwffile <- tempfile()

# Case 1: three fields of widths 1, 2 and 3.
cat(file = fwffile, "123456", "987654", sep = "\n")
x <- read.fwf(
  fwffile,
  widths = c(1, 2, 3),
  stringsAsFactors = TRUE
) #> 1 23 456 \ 9 87 654
y <- read.table.ffdf(file = fwffile, FUN = "read.fwf", widths = c(1, 2, 3))
stopifnot(identical(x, y[, ]))

# Case 2: a negative width in the middle, same file.
x <- read.fwf(
  fwffile,
  widths = c(1, -2, 3),
  stringsAsFactors = TRUE
) #> 1 456 \ 9 654
y <- read.table.ffdf(file = fwffile, FUN = "read.fwf", widths = c(1, -2, 3))
stopifnot(identical(x, y[, ]))
unlink(fwffile)

# Case 3: a zero-width field and a short first line.
cat(file = fwffile, "123", "987654", sep = "\n")
x <- read.fwf(
  fwffile,
  widths = c(1, 0, 2, 3),
  stringsAsFactors = TRUE
) #> 1 NA 23 NA \ 9 NA 87 654
y <- read.table.ffdf(file = fwffile, FUN = "read.fwf", widths = c(1, 0, 2, 3))
stopifnot(identical(x, y[, ]))
unlink(fwffile)

# Case 4: 'widths' given as a list of two width vectors.
cat(file = fwffile, "123456", "987654", sep = "\n")
x <- read.fwf(
  fwffile,
  widths = list(c(1, 0, 2, 3), c(2, 2, 2)),
  stringsAsFactors = TRUE
) #> 1 NA 23 456 98 76 54
y <- read.table.ffdf(
  file = fwffile,
  FUN = "read.fwf",
  widths = list(c(1, 0, 2, 3), c(2, 2, 2))
)
stopifnot(identical(x, y[, ]))
unlink(fwffile)
# \dontshow{
# Regression checks comparing read.csv.ffdf() against base read.csv().
x <- read.csv(file = csvfile, header = TRUE, stringsAsFactors = TRUE)

# Default read: identical to the RAM data.frame.
y <- read.csv.ffdf(file = csvfile, header = TRUE)
stopifnot(identical(x, y[, ]))

# Limiting nrows yields the matching leading rows.
y <- read.csv.ffdf(file = csvfile, header = TRUE, nrows = 13)
stopifnot(identical(x[seq_len(13), ], y[, ]))

# Chunked reads are identical once the levels have been sorted.
y <- read.csv.ffdf(file = csvfile, header = TRUE, first.rows = 12)
y <- sortLevels(y)
stopifnot(identical(x, y[, ]))

y <- read.csv.ffdf(file = csvfile, header = TRUE, nrows = 13, first.rows = 12)
y <- sortLevels(y)
stopifnot(identical(x[seq_len(13), ], y[, ]))

# With only 12 or 11 rows read, the data frames are not identical, yet
# the cell contents agree when compared as character values.
y <- read.csv.ffdf(file = csvfile, header = TRUE, nrows = 12, first.rows = 12)
y <- sortLevels(y)
stopifnot(!identical(x[seq_len(12), ], y[, ]))
stopifnot(
  identical(
    as.character(as.matrix(x[seq_len(12), ])),
    as.character(as.matrix(y[, ]))
  )
)

y <- read.csv.ffdf(file = csvfile, header = TRUE, nrows = 11, first.rows = 12)
y <- sortLevels(y)
stopifnot(!identical(x[seq_len(11), ], y[, ]))
stopifnot(
  identical(
    as.character(as.matrix(x[seq_len(11), ])),
    as.character(as.matrix(y[, ]))
  )
)

# first.rows = -1: result is identical to the RAM data.frame.
y <- read.csv.ffdf(file = csvfile, header = TRUE, first.rows = -1)
stopifnot(identical(x, y[, ]))

# appendLevels switched off for 'ord' only: row 13 must be NA in 'ord'
# but not in 'fac'.
y <- read.csv.ffdf(
  file = csvfile,
  header = TRUE,
  nrows = 13,
  first.rows = 12,
  appendLevels = c(ord = FALSE)
)
stopifnot(is.na(y$ord[13]) && !is.na(y$fac[13]))
# }

# Remove the csv file created at the start of the examples.
unlink(csvfile)
# Run the code above in your browser using DataLab