## load transcription factor binding site data
data(TFBS)
enhancerFB
## select a few samples for training - here for demonstration purpose
## normally you would use 70 or 80\% of the samples for training and
## the rest for test
## train <- sample(1:length(enhancerFB), length(enhancerFB) * 0.7)
## test <- c(1:length(enhancerFB))[-train]
train <- sample(1:length(enhancerFB), 50)
## create a kernel object for the gappy pair kernel with normalization
gappy <- gappyPairKernel(k=1, m=4)
## show details of kernel object
gappy
## run cross validation with the kernel on C-svc in LiblineaR for cost=10
model <- kbsvm(x=enhancerFB[train], y=yFB[train], kernel=gappy,
pkg="LiblineaR", svm="C-svc", cost=10, cross=3)
## show cross validation result
cvResult(model)
## perform tive cross validation runs
model <- kbsvm(x=enhancerFB[train], y=yFB[train], kernel=gappy,
pkg="LiblineaR", svm="C-svc", cost=10, cross=10, noCross=5)
## show cross validation result
cvResult(model)
## plot cross validation result
plot(cvResult(model))
## run Leave-One-Out cross validation
model <- kbsvm(x=enhancerFB[train], y=yFB[train], kernel=gappy,
pkg="LiblineaR", svm="C-svc", cost=10, cross=-1)
## show cross validation result
cvResult(model)
## run gouped cross validation with full data
## on coiled coil dataset
##
## In this example the groups were determined through single linkage
## clustering of sequence similarities derived from ungapped heptad-specific
## pairwise alignment of the sequences. The variable {\tt ccgroup} contains
## the pre-calculated group assignments for the individual sequences.
data(CCoil)
ccseq
head(yCC)
head(ccgroups)
gappyK1M6 <- gappyPairKernel(k=1, m=4)
## run k-fold CV without groups
model <- kbsvm(x=ccseq, y=as.numeric(yCC), kernel=gappyK1M6,
pkg="LiblineaR", svm="C-svc", cost=10, cross=3, noCross=2,
perfObjective="BACC",perfParameters=c("ACC", "BACC"))
## show result without groups
cvResult(model)
## run grouped CV
model <- kbsvm(x=ccseq, y=as.numeric(yCC), kernel=gappyK1M6,
pkg="LiblineaR", svm="C-svc", cost=10, cross=3,
noCross=2, groupBy=ccgroups, perfObjective="BACC",
perfParameters=c("ACC", "BACC"))
## show result with groups
cvResult(model)
## For grouped CV the samples in the held out fold are from a group which
## is not present in training on the other folds. The simimar CV error
## with and without groups shows that learning is not just assigning
## labels based on similarity within the groups but is focusing on features
## that are indicative for the class also in the CV without groups. For the
## GCV no information about group membership for the samples in the held
## out fold is present in the model. This example should show how GCV
## is performed. Because of package size limitations no specific dataset is
## available in this package where GCV is necessary.
Run the code above in your browser using DataLab