predictionGW: Perform prediction of cis-regulatory elements genome-wide

Description

This function perform the prediction genome-wide of the cis-regulatory elements. The function return the output of performanceSVM and a bed file with the position of enhancers and not enhancers regions.

Usage

predictionGW(training_set,data_enhancer_svm,listHM,pcClass.string="enhancer",nClass.string="not_enhancers",pcClass,ncClass,cost=100,type=0,output.file)

Arguments

training_set

training set (data.frame)

data_enhancer_svm

the signals of all histone marks along the genome

listHM

a vector with the histone marks that you want to use perform the prediction

pcClass.string

label of the first class (e.g. "enhancer")

nClass.string

label of the second class (e.g. "not_enhancers")

pcClass

number of positive class in the test set

ncClass

number of negative class in the test set

cost

parameter of svm (default=100)

type

type of kernel (default=0)

output.file

name of the bed file of output

Value

The performance of prediction and a bed file with the coordinates of genomic regions that contain the enhancers. The bed file is saved in the directory selected by the user.

Details

The ratio between the positive and negative regions usually is 1:10. However this ratio depends on you experimental design and your data. See documentation cisREfindbed, tuningParamtersCombROC, featSelectionWithKmeans.

Examples

Run this code

    library("GenomicRanges")
    library("SVM2CRMdata")

    setwd(system.file("data",package="SVM2CRMdata"))
    load("CD4_matrixInputSVMbin100window1000.rda")
    completeTABLE<-CD4_matrixInputSVMbin100window1000

    new.strings<-gsub(x=colnames(completeTABLE[,c(6:ncol(completeTABLE))]),pattern="CD4.",replacement="")
    new.strings<-gsub(new.strings,pattern=".norm.w100.bed",replacement="")
    colnames(completeTABLE)[c(6:ncol(completeTABLE))]<-new.strings

    #list_file<-grep(dir(),pattern=".sort.txt",value=TRUE)

    #train_positive<-getSignal(list_file,chr="chr1",reference="p300.distal.fromTSS.txt",win.size=500,bin.size=100,label1="enhancers")
    #train_negative<-getSignal(list_file,chr="chr1",reference="random.region.hg18.nop300.txt",win.size=500,bin.size=100,label1="not_enhancers")
    setwd(system.file("data",package="SVM2CRMdata"))
    load("train_positive.rda")
    load("train_negative.rda")
    
    training_set<-rbind(train_positive,train_negative)
    #the colnames of the training set should be the same of data_enhancer_svm
    colnames(training_set)[c(5:ncol(training_set))]<-gsub(x=gsub(x=colnames(training_set[,c(5:ncol(training_set))]),pattern="sort.txt.",replacement=""),pattern="CD4.",replacement="")


    setwd(system.file("extdata", package = "SVM2CRMdata"))
    data_level2 <- read.table(file = "GSM393946.distal.p300fromTSS.txt",sep = "\t", stringsAsFactors = FALSE)
    data_level2<-data_level2[data_level2[,1]=="chr1",]

    DB <- data_level2[, c(1:3)]
    colnames(DB)<-c("chromosome","start","end")

    label <- "p300"

    table.final.overlap<-findFeatureOverlap(query=completeTABLE,subject=DB,select="all")

    data_enhancer_svm<-createSVMinput(inputpos=table.final.overlap,inputfull=completeTABLE,label1="enhancers",label2="not_enhancers")
    colnames(data_enhancer_svm)[c(5:ncol(data_enhancer_svm))]<-gsub(gsub(x=colnames(data_enhancer_svm[,c(5:ncol(data_enhancer_svm))]),pattern="CD4.",replacement=""),pattern=".norm.w100.bed",replacement="")

    listcolnames<-c("H2AK5ac","H2AK9ac","H3K23ac","H3K27ac","H3K27me3","H3K4me1","H3K4me3")

    dftotann<-smoothInputFS(train_positive[,c(6:ncol(train_positive))],listcolnames,k=20)


    results<-featSelectionWithKmeans(dftotann,5)

    resultsFS<-results[[7]]


    resultsFSfilter<-resultsFS[which(resultsFS[,2]>median(resultsFS[,2])),]

    resultsFSfilterICRR<-resultsFSfilter[which(resultsFSfilter[,3]<0.50),]

    listHM<-resultsFSfilterICRR[,1]
    listHM<-gsub(gsub(listHM,pattern="_.",replacement=""),pattern="CD4.",replacement="")

    selectFeature<-grep(x=colnames(training_set[,c(6:ncol(training_set))]),pattern=paste(listHM,collapse="|"),value=TRUE)

    colSelect<-c("chromosome","start","end","label",selectFeature)
    training_set<-training_set[,colSelect]

    vecS <- c(2:length(listHM))
    typeSVM <- c(0, 6, 7)[1]
    costV <- c(0.001, 0.01, 0.1, 1, 10, 100, 1000)[6]
    wlabel <- c("not_enhancer", "enhancer")
    infofile<-data.frame(a=c(paste(listHM,"signal",sep=".")))
    infofile[,1]<-gsub(gsub(x=infofile[,1],pattern="CD4.",replacement=""),pattern=".sort.bed",replacement="")
    
    tuningTAB <- tuningParametersCombROC(training_set = training_set, typeSVM = typeSVM, costV = costV,different.weight="TRUE", vecS = vecS[1],pcClass=100,ncClass=400,infofile)

    tuningTABfilter<-tuningTAB[tuningTAB$fscore<0.95,]
    #row_max_fscore<-which.max(tuningTABfilter[tuningTABfilter$nHM >2,"fscore"])
    row_max_fscore<-which.max(tuningTABfilter[,"fscore"])
    listHM_prediction<-gsub(tuningTABfilter[row_max_fscore,4],pattern="//",replacement="|")
    
    columnPR<-grep(colnames(training_set),pattern=paste(listHM_prediction,collapse="|"),value=TRUE)

    predictionGW(training_set=training_set,data_enhancer_svm=data_enhancer_svm, listHM=columnPR,pcClass.string="enhancers",nClass.string="not_enhancers",pcClass=100,ncClas=400,cost=100,type=0,"prediction_enhancers_CD4_results_cost=100_type=0")