library(mgcv)
set.seed(0) ## simulate some data...
dat <- gamSim(1,n=400,dist="normal",scale=2)
b<-gam(y~s(x0)+s(x1)+s(x2)+s(x3),data=dat)
summary(b)
plot(b,pages=1,residuals=TRUE) ## show partial residuals
plot(b,pages=1,seWithMean=TRUE) ## `with intercept' CIs
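## pointwise intervals for the linear predictor can also be built
## directly from `predict' (an illustrative extra, not part of the
## original example)...
pb <- predict(b,se.fit=TRUE)
head(pb$fit+2*pb$se.fit) ## approximate upper interval limits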
## same fit in two parts .....
G<-gam(y~s(x0)+s(x1)+s(x2)+s(x3),fit=FALSE,data=dat)
b<-gam(G=G)
print(b)
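## `G' holds quantities such as the model matrix and penalties, ready
## for the fitting call (an illustrative peek, not in the original)...
head(names(G))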
## change the smoothness selection method to REML
b0<-gam(y~s(x0)+s(x1)+s(x2)+s(x3),data=dat,method="REML")
plot(b0,pages=1)
## ... and do automatic terms selection as well
b1<-gam(y~s(x0)+s(x1)+s(x2)+s(x3),data=dat,
        method="REML",select=TRUE)
plot(b1,pages=1)
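## with `select=TRUE' a term can be penalized away entirely: a near
## zero EDF in the summary indicates this (illustrative check)...
summary(b1)$s.table ## per smooth EDFs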
## set the smoothing parameter for the first term, estimate rest ...
bp<-gam(y~s(x0)+s(x1)+s(x2)+s(x3),sp=c(0.01,-1,-1,-1),data=dat) ## -1 => estimate
plot(bp,pages=1)
## alternatively...
bp <- gam(y~s(x0,sp=.01)+s(x1)+s(x2)+s(x3),data=dat)
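## the estimated smoothing parameters can be inspected (illustrative;
## note that a fixed `sp' may be excluded from this vector)...
bp$sp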
## set lower bounds on smoothing parameters ....
bp<-gam(y~s(x0)+s(x1)+s(x2)+s(x3),
        min.sp=c(0.001,0.01,0,10),data=dat)
print(b);print(bp)
## same with REML
bp<-gam(y~s(x0)+s(x1)+s(x2)+s(x3),
        min.sp=c(0.1,0.1,0,10),data=dat,method="REML")
print(b0);print(bp)
## now a GAM with 3df regression spline term & 2 penalized terms
b0<-gam(y~s(x0,k=4,fx=TRUE,bs="tp")+s(x1,k=12)+s(x2,k=15),data=dat)
plot(b0,pages=1)
## now fit a 2-d term to x0,x1
b1<-gam(y~s(x0,x1)+s(x2)+s(x3),data=dat)
par(mfrow=c(2,2))
plot(b1)
par(mfrow=c(1,1))
## now simulate Poisson data...
dat <- gamSim(1,n=4000,dist="poisson",scale=.1)
## use "cr" basis to save time, with 4000 data...
b2<-gam(y~s(x0,bs="cr")+s(x1,bs="cr")+s(x2,bs="cr")+
        s(x3,bs="cr"),family=poisson,data=dat,method="REML")
plot(b2,pages=1)
## drop x3, but initialize sp's from previous fit, to
## save more time...
b2a<-gam(y~s(x0,bs="cr")+s(x1,bs="cr")+s(x2,bs="cr"),
         family=poisson,data=dat,method="REML",
         in.out=list(sp=b2$sp[1:3],scale=1))
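## (illustrative timing sketch, not in the original) time the same
## fit from a cold start to see the saving from supplying `in.out'...
system.time(gam(y~s(x0,bs="cr")+s(x1,bs="cr")+s(x2,bs="cr"),
            family=poisson,data=dat,method="REML"))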
par(mfrow=c(2,2))
plot(b2a)
par(mfrow=c(1,1))
## similar example using performance iteration
dat <- gamSim(1,n=400,dist="poisson",scale=.25)
b3<-gam(y~s(x0)+s(x1)+s(x2)+s(x3),family=poisson,
        data=dat,optimizer="perf")
plot(b3,pages=1)
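## (illustrative, not in the original) smoothing parameters from
## performance iteration can be compared with those from the default
## outer iteration...
b3o <- gam(y~s(x0)+s(x1)+s(x2)+s(x3),family=poisson,data=dat)
rbind(perf=b3$sp,outer=b3o$sp)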
## repeat using GACV as in Wood 2008...
b4<-gam(y~s(x0)+s(x1)+s(x2)+s(x3),family=poisson,
        data=dat,method="GACV.Cp",scale=-1)
plot(b4,pages=1)
## repeat using REML as in Wood (2011)...
b5<-gam(y~s(x0)+s(x1)+s(x2)+s(x3),family=poisson,
        data=dat,method="REML")
plot(b5,pages=1)
## a binary example (see later for large dataset version)...
dat <- gamSim(1,n=400,dist="binary",scale=.33)
lr.fit <- gam(y~s(x0)+s(x1)+s(x2)+s(x3),family=binomial,
              data=dat,method="REML")
## plot model components with truth overlaid in red
op <- par(mfrow=c(2,2))
fn <- c("f0","f1","f2","f3");xn <- c("x0","x1","x2","x3")
for (k in 1:4) {
  plot(lr.fit,residuals=TRUE,select=k)
  ff <- dat[[fn[k]]];xx <- dat[[xn[k]]]
  ind <- sort.int(xx,index.return=TRUE)$ix
  lines(xx[ind],(ff-mean(ff))[ind]*.33,col=2)
}
par(op)
anova(lr.fit)
lr.fit1 <- gam(y~s(x0)+s(x1)+s(x2),family=binomial,
               data=dat,method="REML")
lr.fit2 <- gam(y~s(x1)+s(x2),family=binomial,
               data=dat,method="REML")
AIC(lr.fit,lr.fit1,lr.fit2)
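## an approximate hypothesis testing comparison of the nested fits is
## also possible (illustrative; see ?anova.gam for caveats)...
anova(lr.fit2,lr.fit1,lr.fit,test="Chisq")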
## A Gamma example, by modifying `gamSim' output...
dat <- gamSim(1,n=400,dist="normal",scale=1)
dat$f <- dat$f/4 ## true linear predictor
Ey <- exp(dat$f);scale <- .5 ## mean and GLM scale parameter
## Note that `shape' and `scale' in `rgamma' are almost
## opposite terminology to that used with GLM/GAM...
dat$y <- rgamma(nrow(dat),shape=1/scale,scale=Ey*scale)
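## sanity check (illustrative): this parameterization gives E(y)=Ey
## and var(y)=scale*Ey^2, matching the Gamma GLM parameterization...
mean(dat$y/Ey) ## should be close to 1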
bg <- gam(y~s(x0)+s(x1)+s(x2)+s(x3),family=Gamma(link=log),
          data=dat,method="REML")
plot(bg,pages=1)
## For inverse Gaussian, see ?rig
## now a 2D smoothing example...
eg <- gamSim(2,n=500,scale=.1)
attach(eg)
op <- par(mfrow=c(2,2),mar=c(4,4,1,1))
contour(truth$x,truth$z,truth$f) ## contour truth
b4 <- gam(y~s(x,z),data=data) ## fit model
fit1 <- matrix(predict.gam(b4,pr,se=FALSE),40,40) ## predict on the 40x40 grid `pr' from `eg'
contour(truth$x,truth$z,fit1) ## contour fit
persp(truth$x,truth$z,truth$f) ## persp truth
vis.gam(b4) ## persp fit
detach(eg)
par(op)
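## (illustrative extra, not in the original) standard errors for the
## fitted surface are available on the same 40 by 40 grid...
fit1.se <- matrix(predict(b4,eg$pr,se.fit=TRUE)$se.fit,40,40)
contour(eg$truth$x,eg$truth$z,fit1.se)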
##################################################
## largish dataset example with user defined knots
##################################################
par(mfrow=c(1,1))
eg <- gamSim(2,n=10000,scale=.5)
attach(eg)
ind<-sample(1:10000,1000,replace=FALSE)
b5<-gam(y~s(x,z,k=50),data=data,
        knots=list(x=data$x[ind],z=data$z[ind]))
vis.gam(b5)
## and a pure "knot based" spline of the same data
b6<-gam(y~s(x,z,k=100),data=data,
        knots=list(x=rep((1:10-0.5)/10,10),
                   z=rep((1:10-0.5)/10,rep(10,10))))
vis.gam(b6,color="heat")
## varying the default large dataset behaviour via `xt'
b7 <- gam(y~s(x,z,k=50,xt=list(max.knots=1000,seed=2)),data=data)
vis.gam(b7)
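## (illustrative check) with k=50 the fit should use 50 coefficients
## in total: the intercept plus 49 constrained basis functions...
length(coef(b7))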
detach(eg)
################################################################
## Approximate large dataset logistic regression for rare events
## based on subsampling the zeroes, and adding an offset to
## approximately allow for this.
## Doing the same thing, but upweighting the sampled zeroes,
## leads to problems with smoothness selection and CIs.
################################################################
n <- 100000 ## simulate n data
dat <- gamSim(1,n=n,dist="binary",scale=.33)
p <- binomial()$linkinv(dat$f-6) ## make 1's rare
dat$y <- rbinom(n,1,p) ## re-simulate rare response
## Now sample all the 1's but only proportion S of the 0's
S <- 0.02 ## sampling fraction of zeroes
dat <- dat[dat$y==1 | runif(n) < S,] ## sampling
## Create offset based on total sampling fraction
dat$s <- rep(log(nrow(dat)/n),nrow(dat))
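## check (illustrative): the offset is close to log(S), since nearly
## all retained cases are the sampled zeroes...
c(log(S),dat$s[1])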
lr.fit <- gam(y~s(x0,bs="cr")+s(x1,bs="cr")+s(x2,bs="cr")+s(x3,bs="cr")+
              offset(s),family=binomial,data=dat,method="REML")
## plot model components with truth overlaid in red
op <- par(mfrow=c(2,2))
fn <- c("f0","f1","f2","f3");xn <- c("x0","x1","x2","x3")
for (k in 1:4) {
  plot(lr.fit,select=k,scale=0)
  ff <- dat[[fn[k]]];xx <- dat[[xn[k]]]
  ind <- sort.int(xx,index.return=TRUE)$ix
  lines(xx[ind],(ff-mean(ff))[ind]*.33,col=2)
}
par(op)
rm(dat)