## Load the positive and negative example sequences shipped in the
## "example" directory of the BioSeqClass package.
tmpfile1 <- file.path(path.package("BioSeqClass"), "example",
  "acetylation_K.pos40.pep")
tmpfile2 <- file.path(path.package("BioSeqClass"), "example",
  "acetylation_K.neg40.pep")
## Both files are tab-separated without a header line; the first field
## becomes the row names and the first remaining column is extracted.
posSeq <- as.matrix(read.delim(tmpfile1, header = FALSE, row.names = 1))[, 1]
negSeq <- as.matrix(read.delim(tmpfile2, header = FALSE, row.names = 1))[, 1]
## Pool the two sets (positives first) and build the matching class labels.
seq <- c(posSeq, negSeq)
classLable <- rep(c("+1", "-1"), times = c(length(posSeq), length(negSeq)))
if (interactive()) {
  ## Test various feature coding methods with a libsvm classifier.
  ## It may be time consuming.
  fileName <- tempfile()
  testFeatureSet <- featureEvaluate(seq, classLable, fileName,
    ele.type = "aminoacid",
    featureMethod = c("Binary", "CTD", "FragmentComposition",
      "GapPairComposition", "Hydro"),
    cv = 5, classifyMethod = "libsvm",
    group = c("aaH", "aaV", "aaZ", "aaP", "aaF", "aaS", "aaE"), k = 3, g = 7,
    hydro.methods = c("kpm", "SARAH1"),
    hydro.indexs = c("hydroE", "hydroF", "hydroC"))
  ## fileName is read back as a tab-separated table (presumably written by
  ## featureEvaluate -- confirm) and opened in the data editor.
  summary <- read.csv(fileName, sep = "\t", header = TRUE)
  fix(summary)

  ## Evaluate features from different feature coding functions:
  ## combine the feature columns of the selected result sets into one table.
  ## Every column is relabelled "<Feature_Function> ; <Feature_Parameter> ;
  ## <original name>" so its origin stays visible after merging.
  feature.index <- 1:5
  ## Use feature.index[1] (not a literal 1) so the data and its labels always
  ## come from the same result set.
  tmp <- testFeatureSet[[feature.index[1]]]$data
  colnames(tmp) <- paste(
    testFeatureSet[[feature.index[1]]]$model["Feature_Function"],
    testFeatureSet[[feature.index[1]]]$model["Feature_Parameter"],
    colnames(tmp), sep = " ; ")
  ## The last column of each set is left out here and appended once below
  ## (presumably it holds the class label -- confirm against featureEvaluate).
  ## drop = FALSE keeps the column label even when only one feature remains.
  data <- tmp[, -ncol(tmp), drop = FALSE]
  ## seq_along(...)[-1] is empty when only one set is selected, whereas
  ## 2:length(...) would wrongly iterate over 2 and 1.
  for (i in seq_along(feature.index)[-1]) {
    tmp <- testFeatureSet[[feature.index[i]]]$data
    colnames(tmp) <- paste(
      testFeatureSet[[feature.index[i]]]$model["Feature_Function"],
      testFeatureSet[[feature.index[i]]]$model["Feature_Parameter"],
      colnames(tmp), sep = " ; ")
    data <- data.frame(data, tmp[, -ncol(tmp), drop = FALSE])
  }
  ## Column names of the combined features, captured before the last column
  ## of the final set is appended.
  name <- colnames(data)
  data <- data.frame(data, tmp[, ncol(tmp)])

  ## Forward feature selection with 'fsFFS' using a knn classifier.
  ## It is very time consuming.
  combineFeatureResult <- fsFFS(data, stop.n = 50, classifyMethod = "knn",
    cv = 5)
  ## Row 1: number of selected features; row 2: the "acc" performance entry.
  tmp <- sapply(combineFeatureResult, function(x) {
    c(length(x$features), x$performance["acc"])
  })
  plot(tmp[1, ], tmp[2, ], xlab = "featureNumber", ylab = "Accuracy",
    main = "result of FFS_KNN", pch = 19)
  lines(tmp[1, ], tmp[2, ])

  ## Compare the prediction accuracy based on different feature coding
  ## methods and different classification models.
  ## It is very time consuming.
  testResult <- lapply(c("libsvm", "randomForest", "knn", "tree"),
    function(x) {
      tmp <- featureEvaluate(seq, classLable, fileName = tempfile(),
        ele.type = "aminoacid",
        featureMethod = c("Binary", "CTD", "FragmentComposition",
          "GapPairComposition", "Hydro"),
        cv = 5, classifyMethod = x,
        group = c("aaH", "aaV", "aaZ", "aaP", "aaF", "aaS", "aaE"),
        k = 3, g = 7,
        hydro.methods = c("kpm", "SARAH1"),
        hydro.indexs = c("hydroE", "hydroF", "hydroC"))
      ## One column per evaluated feature set; rows are feature function,
      ## feature parameter, model and accuracy.
      sapply(tmp, function(y) {
        c(y$model[["Feature_Function"]], y$model[["Feature_Parameter"]],
          y$model[["Model"]], y$performance[["acc"]])
      })
    })
  ## Feature coding = function and parameter pasted together (rows 1-2).
  tmpFeature <- as.factor(c(sapply(testResult, function(x) {
    apply(x[1:2, ], 2, function(y) {
      paste(y, collapse = "; ")
    })
  })))
  tmpModel <- as.factor(c(sapply(testResult, function(x) {
    x[3, ]
  })))
  ## Factors are turned into integer codes so they can serve as plot axes.
  tmp1 <- data.frame(as.integer(tmpFeature), as.integer(tmpModel),
    as.numeric(c(sapply(testResult, function(x) {
      x[4, ]
    }))))
  ## library() stops with a clear error when scatterplot3d is not installed;
  ## require() would only warn and let the call below fail instead.
  library(scatterplot3d)
  s3d <- scatterplot3d(tmp1,
    color = c("red", "blue", "green", "yellow")[tmp1[, 2]], pch = 19,
    xlab = "Feature Coding", ylab = "Classification Model",
    zlab = "Accuracy under 5-fold cross validation", lab = c(10, 6, 7),
    y.ticklabs = c("", as.character(sort(unique(tmpModel))), ""))
}
## Run the code above in your browser using DataLab