## Not run:
#
# # Demo speedup
# n=1e6
# DT = data.table( a=sample(1:1000,n,replace=TRUE),
# b=sample(1:1000,n,replace=TRUE),
# c=rnorm(n),
# d=sample(c("foo","bar","baz","qux","quux"),n,replace=TRUE),
# e=rnorm(n),
# f=sample(1:1000,n,replace=TRUE) )
# DT[2,b:=NA_integer_]
# DT[4,c:=NA_real_]
# DT[3,d:=NA_character_]
# DT[5,d:=""]
# DT[2,e:=+Inf]
# DT[3,e:=-Inf]
#
# write.table(DT,"test.csv",sep=",",row.names=FALSE,quote=FALSE)
# cat("File size (MB):", round(file.info("test.csv")$size/1024^2),"\n")
# # 50 MB (1e6 rows x 6 columns)
#
# system.time(DF1 <-read.csv("test.csv",stringsAsFactors=FALSE))
# # 60 sec (first time in fresh R session)
#
# system.time(DF1 <- read.csv("test.csv",stringsAsFactors=FALSE))
# # 30 sec (immediate repeat is faster, varies)
#
# system.time(DF2 <- read.table("test.csv",header=TRUE,sep=",",quote="",
# stringsAsFactors=FALSE,comment.char="",nrows=n,
# colClasses=c("integer","integer","numeric",
# "character","numeric","integer")))
# # 10 sec (consistently). All known tricks and known nrows, see references.
#
# require(data.table)
# system.time(DT <- fread("test.csv"))
# # 3 sec (faster and friendlier)
#
# require(sqldf)
# system.time(SQLDF <- read.csv.sql("test.csv",dbname=NULL))
# # 20 sec (friendly too, good defaults)
#
# require(ff)
# system.time(FFDF <- read.csv.ffdf(file="test.csv",nrows=n))
# # 20 sec (friendly too, good defaults)
#
# identical(DF1,DF2)
# all.equal(as.data.table(DF1), DT)
# identical(DF1,within(SQLDF,{b<-as.integer(b);c<-as.numeric(c)}))
# identical(DF1,within(as.data.frame(FFDF),d<-as.character(d)))
#
# # Scaling up ...
# l = vector("list",10)
# for (i in 1:10) l[[i]] = DT
# DTbig = rbindlist(l)
# tables()
# write.table(DTbig,"testbig.csv",sep=",",row.names=FALSE,quote=FALSE)
# # 500MB (10 million rows x 6 columns)
#
# system.time(DF <- read.table("testbig.csv",header=TRUE,sep=",",
# quote="",stringsAsFactors=FALSE,comment.char="",nrows=1e7,
# colClasses=c("integer","integer","numeric",
# "character","numeric","integer")))
# # 100-200 sec (varies)
#
# system.time(DT <- fread("testbig.csv"))
# # 30-40 sec
#
# all(mapply(all.equal, DF, DT))
#
#
# # Real data example (Airline data)
# # http://stat-computing.org/dataexpo/2009/the-data.html
#
# download.file("http://stat-computing.org/dataexpo/2009/2008.csv.bz2",
# destfile="2008.csv.bz2")
# # 109MB (compressed)
#
# system("bunzip2 2008.csv.bz2")
# # 658MB (7,009,728 rows x 29 columns)
#
# colClasses = sapply(read.csv("2008.csv",nrows=100),class)
# # 4 character, 24 integer, 1 logical. Incorrect.
#
# colClasses = sapply(read.csv("2008.csv",nrows=200),class)
# # 5 character, 24 integer. Correct. Might have missed data only using 100 rows
# # since read.table assumes colClasses is correct.
#
# system.time(DF <- read.table("2008.csv", header=TRUE, sep=",",
# quote="",stringsAsFactors=FALSE,comment.char="",nrows=7009730,
# colClasses=colClasses))
# # 360 secs
#
# system.time(DT <- fread("2008.csv"))
# # 40 secs
#
# table(sapply(DT,class))
# # 5 character and 24 integer columns. Correct without needing to worry about colClasses
# # issue above.
#
#
# # Reads URLs directly :
# fread("http://www.stats.ox.ac.uk/pub/datasets/csb/ch11b.dat")
#
# ## End(Not run)
# Text input: the string handed to fread is itself the data, not a file name :
fread("A,B\n1,2\n3,4")
# Pasted input: a multi-line string literal is read the same way :
fread("A,B
1,2
3,4
")
# Finds the first data line automatically; this input starts with a blank
# line and a banner line ahead of the A,B header :
fread("
This is perhaps a banner line or two or ten.
A,B
1,2
3,4
")
# Detects automatically whether column names are present; this input has
# no header row, only rows of numbers :
fread("
1,2
3,4
")
# Numerical precision ----
# More significant digits than a double can hold: the value is read without
# complaint, so the loss of precision is silent.
DT <- fread("A\n1.010203040506070809010203040506\n")
DT[, sprintf("%.15E", A)]  # as accurate as double precision permits
# This value produces a detailed warning about ERANGE; the column is still
# read as 'numeric'.
DT <- fread("A\n1.46761e-313\n")
DT[, sprintf("%.15E", A)]  # double precision cannot hold this to 15 digits
# When greater accuracy is needed, read the column as character through
# colClasses and then use package Rmpfr.
# colClasses ----
data <- "A,B,C,D\n1,3,5,7\n2,4,6,8\n"
# named vector of column name -> type, as in read.csv
fread(data, colClasses = c(B = "character", C = "character", D = "character"))
# list keyed by type; saves typing
fread(data, colClasses = list(character = c("B", "C", "D")))
# the same list form, using column numbers
fread(data, colClasses = list(character = 2:4))

# drop ----
# "NULL" as the column class, as in read.csv
fread(data, colClasses = c("B" = "NULL", "C" = "NULL"))
# the list form of the same request
fread(data, colClasses = list(NULL = c("B", "C")))
# same result with less typing; easier to read
fread(data, drop = c("B", "C"))
# same, using column numbers
fread(data, drop = 2:3)

# select ----
# (with read.csv you have to work out which columns to drop instead)
fread(data, select = c("A", "D"))  # less typing, easier to read
fread(data, select = c(1, 4))      # same, using column numbers
# skip blank lines ----
# the input has two empty lines between the second and third data rows
fread("a,b\n1,a\n2,b\n\n\n3,c\n", blank.lines.skip = TRUE)

# fill ----
# the row "2" has fewer fields than the header
fread("a,b\n1,a\n2\n3,c\n", fill = TRUE)
# same short row, with empty lines in the input as well
fread("a,b\n\n1,a\n2\n\n3,c\n\n", fill = TRUE)

# fill with skip blank lines ----
fread("a,b\n\n1,a\n2\n\n3,c\n\n", fill = TRUE, blank.lines.skip = TRUE)

# check.names usage ----
# the header holds the same name twice, and that name contains a space
fread("a b,a b\n1,2\n")
# check.names = TRUE: no duplicates + syntactically valid names
fread("a b,a b\n1,2\n", check.names = TRUE)
# Run the code above in your browser using DataLab