# ========================================================================================= # R script for calculating the corration between predicted and measured expression values for a specific experiment # Author: xianjun dong # Date: Sat Jul 23 20:52:53 EDT 2011 # Usage: # Rscript forNature.correlation.comparison.R rpkc Tx-based all # ## to compare the precition across cell lines # Rscript forNature.correlation.comparison.R cp1w TSS-based all cpkw # ========================================================================================= # -------------- LOG ---------------- # 1. configure to run on cluster, and use commandArgs to accept parameters # 2. apply binning method to each sub-group of genes, e.g. LCP/HCP may have different best bins. # 3. merge TSS-based and Tx-based into one script # 4. divide histone modification into categories # 5. remove low-expressed genes (e.g. RNAseq > 0) # 6. merge replicates,e.g. "cp1w1" +"cp1w2" = cp1w # 7. change to use Anshul's normalized histone data # 8. one experiment each time # 9. select the representative transcript for each gene (e.g. the one with strongest expression signal) # 11. fix a bug: bestpseudo=pseudos[which.max(sapply(pseudos, function(x) cor(log2(x+X), log2(Y))))] --> bestpseudo=pseudos[which.max(abs(sapply(pseudos, function(x) cor(log2(x+X), log2(Y)))))] # 12. background correlation check: random shuffle the Y (not Xs) to redo the regression # 13. output studentlized residual (in the dataset file) # 14. randomization test by shuffling the Y lables # 15. check the different prediction for high/low expressed genes (for lm model only) # 16. check the different prediction for randomly sampled genes (for lm model only) # 17. fix a bug: change to calculate individualR by testset, rather than trainset # 18. change to use 81 bins globally # 19. fix Nhek/Nhek26 issue in Tx file (2012.03.25) # 20. add "all but no repressive" category (2012.03.25) # -------------- LOG ---------------- # =============================================================== # configure the script # =============================================================== source("mylib.R") # install these lib if it's not in the environment #install.packages(c('relaimpo','earth', 'randomForest','e1071', 'caTools')) args <- commandArgs(TRUE) experiment=args[1] # expressionDatatype=args[2] # 'TSS-based' or 'Tx-based' CpG=args[3] experiment2=args[4] # cpkc vs. cpgc if(is.na(experiment2)) experiment2=NULL # for debug #source("mylib.R"); experiment='rpmc'; CpG='all'; expressionDatatype='Tx-based'; experiment2='rp1c' #source("mylib.R"); experiment='cp1w'; CpG='all'; expressionDatatype='TSS-based'; experiment2=NULL # decode the brief code of experiment (see mylib.R for more detail) tech=strsplit(label.tr(experiment), split="\\.")[[1]][1] extract=strsplit(label.tr(experiment), split="\\.")[[1]][2] cellline=strsplit(label.tr(experiment), split="\\.")[[1]][3] if(!is.null(experiment2)) cellline2=strsplit(label.tr(experiment2), split="\\.")[[1]][3] compartment=strsplit(label.tr(experiment), split="\\.")[[1]][4] # comment/edit this if you want to run on other cell lines if(!grepl("K562|Gm12878|H1hesc|Helas3|Hepg2|Huvec|Nhek", cellline)){print(paste("cellline ignored:", cellline));quit('no');} PSEUDOCOUNT=0.03 nfold=10 bM='bestbin' LOW_SIGNAL_CUTOFF = log2(0.03) transformation='log' BINFILE_FOLDER = "./data/Jan11.Anshul.all.81bins.level12.L4100.HisoneDnase"; # folder path for binned density for all chromatin features genefile="./data/gencode_v7_hg19_transcripts.gtf.cpg.tab" # gene annotation pathname="./result/"; # path to store output files genes=read.table(genefile, header = T) genes=genes[(genes$end-genes$start)>4100 & genes$level<3, ] if (file.exists(pathname) == FALSE) dir.create(pathname, showWarnings=F, recursive=T) dir.create(paste(pathname, "data", sep="/"), showWarnings=F, recursive=T) dir.create(paste(pathname, "data/activeTSS", sep="/"), showWarnings=F, recursive=T) dir.create(paste(pathname, "figures/pdf", sep="/"), showWarnings=F, recursive=T) dir.create(paste(pathname, "figures/png", sep="/"), showWarnings=F, recursive=T) dir.create(paste(pathname, "figures/fig", sep="/"), showWarnings=F, recursive=T) # =============================================================== # processing the input data # =============================================================== DATA = paste("./data", paste(expressionDatatype,"RData", sep="."), sep="/") if(file.exists(DATA)){ load(DATA) } else { # -------------- read expression data (Y) ----------------- if(expressionDatatype == 'TSS-based') { data01 <- read.table("./data/Gencodev7_TSS_July2011.gff") RNAseq=data01[,c(1,2,4,5,7, seq(10, ncol(data01), 2))] names(RNAseq)=c("chr","source","TSS","TTS", "strand",as.vector(t(data01[1,seq(9, ncol(data01), 2)]))) # TTS is actually TSS for TSS-based files RNAseq=RNAseq[,c(colnames(RNAseq)[grep("^[cdr].{3}[0-9]",colnames(RNAseq), invert=T)], sort(colnames(RNAseq)[grep("^[cdr].{3}[0-9]",colnames(RNAseq))]))] RNAseq=RNAseq[RNAseq$confidence!='low',] #remove 'low' confidence ones (~5000 removed) RNAseq=unique(RNAseq[,grep("source|confidence", names(RNAseq), invert=T)]) # [Optional] remove columan 'source' and 'confidence', and take the uniq TSS ones. (another 4000 removed) # TSS ID (e.g. geneid.start) NB: This should be transID for Tx-based data; rownames(RNAseq)=do.call("paste",c(RNAseq[c('gene_id','TSS')], sep=".")) y=unique(as.data.frame(cbind(TSS_id=paste(genes$gene_id, ifelse(genes$strand=='-', genes$end, genes$start), sep="."), normalizedCpG=genes$normalizedCpG))) # TSS ID (e.g. geneid.start) NB: This should be transID for Tx-based data; RNAseq=cbind(RNAseq[match(intersect(rownames(RNAseq), y$TSS_id),rownames(RNAseq)),], normalizedCpG=y[match(intersect(rownames(RNAseq), y$TSS_id),y$TSS_id),2]) RNAseq$normalizedCpG=as.numeric(as.character(RNAseq$normalizedCpG)) # de-factor # TILL NOW, there are 51341 lines in data, unique TSS with level=1/2 (and medium/high in Sarah's file) and length>4000 } if(expressionDatatype == 'Tx-based') { data01 <- read.table("./data/gencode_v7_hg19_tr_with115_cshl_long_quantif.gff") RNAseq=data01[,c(1,2,4,5,7, seq(10, ncol(data01), 2))] names(RNAseq)=c("chr","source","TSS","TTS", "strand",as.vector(t(data01[1,seq(9, ncol(data01), 2)]))) # TTS is actually TSS for TSS-based files RNAseq=RNAseq[,c(colnames(RNAseq)[grep("^[cdr].{3}[0-9]",colnames(RNAseq), invert=T)], sort(colnames(RNAseq)[grep("^[cdr].{3}[0-9]",colnames(RNAseq))]))] rownames(RNAseq)=RNAseq[,'transcript_id'] RNAseq=cbind(RNAseq[match(intersect(rownames(RNAseq), genes$trans_id),rownames(RNAseq)),], normalizedCpG=genes[match(intersect(rownames(RNAseq), genes$trans_id),genes$trans_id),'normalizedCpG']) RNAseq$normalizedCpG=as.numeric(as.character(RNAseq$normalizedCpG)) # de-factor # TILL NOW, there are 61177 unique transcripts in data, with level=1/2 and length>4000 } # merge replicates uniqID=unique(sort(gsub("[0-9]$", "", colnames(RNAseq)[grep("^[cdr].{3}[0-9]", colnames(RNAseq))]))) RNAseq_merged=RNAseq[, grep("^[cdr].{3}[0-9]", colnames(RNAseq), invert=T)] RNAseq_replicates=c(); for(expr in uniqID){ if(length(grep(expr, colnames(RNAseq)))==1) RNAseq_merged = cbind(RNAseq_merged, RNAseq[,grep(expr, colnames(RNAseq))]) else RNAseq_merged = cbind(RNAseq_merged, apply(RNAseq[,grep(expr, colnames(RNAseq))], 1, mean)) if(length(grep(expr, colnames(RNAseq)))==2) { if(is.null(RNAseq_replicates)) RNAseq_replicates=RNAseq[,grep(expr, colnames(RNAseq))] else RNAseq_replicates=cbind(RNAseq_replicates,RNAseq[,grep(expr, colnames(RNAseq))]) } } colnames(RNAseq_merged) = c(colnames(RNAseq)[grep("^[cdr].{3}[0-9]", colnames(RNAseq), invert=T)], uniqID) RNAseq=RNAseq_merged save(RNAseq, RNAseq_replicates, file=DATA) } ## only genes >0 expression in at least one cell line (e.g. remove genes which are 0 in all cell lines), for the same Tech/RNAextract/Compartment #exprt=unlist(strsplit(experiment, "")) #exprt[3]='.*' #cpcindex=grep(paste(exprt, collapse=""), colnames(RNAseq)) #cpc=RNAseq[, cpcindex] ##all0=ifelse(is.null(nrow(cpc)), cpc==0, apply(cpc>0, 1, sum)==0) #if(is.null(nrow(cpc))) index=cpc>0 else index=apply(cpc>0, 1, sum)>0; #if(is.null(nrow(cpc))) all0=cpc==0 else all0=apply(cpc>0, 1, sum)==0; #if(is.null(nrow(cpc))) cpcnames=experiment else cpcnames=colnames(cpc) # ## active TSS (for Chao/Ewan) #exprt[3]="_" #write.table(RNAseq[index,c(1:9,cpcindex)], paste(pathname, "data/activeTSS",paste(ifelse(grepl("TSS", expressionDatatype),"activeTSS", "activeTx"), "in", paste(exprt, collapse=""),"tab", sep="."), sep="/"), quote=F, sep="\t", row.names=F, col.names=T) # remove those TSS with 0 in all RNAseq experiments index=apply(RNAseq[, grep("^r.{3}", colnames(RNAseq))]>0, 1, sum)>0 TSSsubset=paste(pathname, "data/activeTSS", paste(expressionDatatype, "with_at_least_one_active_RNAseq_experiments", "tab", sep="."), sep="/") if (file.exists(TSSsubset) == FALSE) write.table(RNAseq[index,], TSSsubset, quote=F, sep="\t", row.names = F) ## zero vs. non-zero #X=apply(RNAseq[, grep("^[crd].{3}", colnames(RNAseq))], 2, function(x) table(x==0)/nrow(RNAseq)) ##X=X[,order(X[1,])] #png("zero.vs.nonzero.png", width=1500, height=1500); #par(xpd=T, mar=c(5,4,4,4), mfrow=c(3,1)); #barplot(X, las=2, space=0.1, width=1, col=c('black','gray'), main="all TSS", cex.name=1.2); #legend(1.1*ncol(X)+0.1,1,c(">0","=0"), col=c('black','gray'), pch=15, cex=1.5); # #index=apply(RNAseq[, grep("^c.{3}", colnames(RNAseq))]>0, 1, sum)>0 #X1=apply(RNAseq[index, grep("^[crd].{3}", colnames(RNAseq))], 2, function(x) table(x==0)/sum(index)); #X=X1[,colnames(X)] #barplot(X, las=2, space=0.1, width=1, col=c('black','gray'), main="exclude TSS with zero in all CAGE", cex.name=1.2); # #index=apply(RNAseq[, grep("^r.{3}", colnames(RNAseq))]>0, 1, sum)>0 #X1=apply(RNAseq[index, grep("^[crd].{3}", colnames(RNAseq))], 2, function(x) table(x==0)/sum(index)); #X=X1[,colnames(X)] #barplot(X, las=2, space=0.1, width=1, col=c('black','gray'), main="exclude TSS with zero in all RNAseq", cex.name=1.2); # #dev.off() # [optional] histogram for expression and CpG score Y=RNAseq[,experiment] png(paste(pathname, "figures/fig",paste("expression.hist", experiment,"png", sep="."), sep="/"), width=1200, height=800) par(mfrow=c(2,3)) hist(log2(Y+0.001), main=paste(label.tr(experiment), "\n(all TSS)"), xlab = "log2(expression)", cex.main=2) legend("topright",paste(round(100*sum(Y==0)/length(Y),1), "% zero", sep=""), cex=1.5, bty='n') h=hist(log2(Y[Y>0]+0.001), plot=F) barplot(h$counts, names.arg=h$mids, space=0.1, cex.lab=1.5, width=1, col=8-6*localMin(h$counts), xlab="log2(expression>0)", ylab="counts",cex.main=2, main=paste(label.tr(experiment),"\n(active TSS)")) x=tail(which(localMin(h$counts))) if(length(x)>0) text(x=x,y=h$counts[x], label=format(h$mids[x]), po=3, cex=0.7, col='red') #hist(log2(Y[index]+0.001), xlab="log2(expression>0 in at least one cellline)", ylab="counts", cex.lab=1.5, cex.main=2, main=paste("active TSS in at least one cell line", paste(cellline.tr(sapply(strsplit(cpcnames,""), function(x) x[3])), collapse=","), sep="\n")) hist(log2(Y[index]+0.001), xlab="log2(expression>0 in at least one cellline)", ylab="counts", cex.lab=1.5, cex.main=2, main="active TSS in at least one RNAseq experiments") legend("topright",paste(round(100*sum(Y[index]==0)/length(Y[index]),1), "% zero", sep=""), cex=1.5, bty='n') normalizedCpG=RNAseq[,'normalizedCpG'] hist(normalizedCpG, freq = T, cex.lab=1.5) hist(normalizedCpG[Y>0], freq = T, cex.lab=1.5,xlab="normalizedCpG(expression>0)") hist(normalizedCpG[index], freq = T, cex.lab=1.5,xlab="normalizedCpG(expression>0 in at least one cellline)") dev.off() RNAseq=RNAseq[index, ] if(CpG=='HCP'){ RNAseq = RNAseq[RNAseq$normalizedCpG > 0.4, ] } if(CpG=='LCP'){ RNAseq = RNAseq[RNAseq$normalizedCpG <= 0.4, ] } # add gene_name, gene_type RNAseq=cbind(RNAseq, genes[match(RNAseq$gene_id, genes$gene_id), c('gene_type','gene_name')]) ## select the representative transcript for each gene (e.g. the one with strongest expression signal) RNAseqA=RNAseq[order(RNAseq$gene_id, -RNAseq[[experiment]]), c(experiment, 'gene_id', 'gene_type','gene_name', 'normalizedCpG')] RNAseqA=RNAseqA[match(unique(RNAseqA$gene_id), RNAseqA$gene_id), ] # for rpkc experiment, 21414/89995 retained ### divide the gene set into training+prediction sets (for pseudocount optimization) random=sample(nrow(RNAseqA), round(nrow(RNAseqA)/3)) # =============================================================== # get signal for each factor (X) # =============================================================== filenames = paste(BINFILE_FOLDER, dir(BINFILE_FOLDER), sep="/") # this part is for cross-cell line study. if(!is.null(experiment2)){ ## select the representative transcript for each gene (e.g. the one with strongest expression signal) RNAseqB=RNAseq[order(RNAseq$gene_id, -RNAseq[[experiment2]]), c(experiment2, 'gene_id', 'gene_type','gene_name', 'normalizedCpG')] RNAseqB=RNAseqB[match(unique(RNAseqB$gene_id), RNAseqB$gene_id), ] # for rpkc experiment, 21414/89995 retained RNAseq1=RNAseqB[random, ] RNAseq2=RNAseqB[-random, ] dataX=c() for (fname in filenames[grep(cellline2, filenames)]) { hismark = sub(".*\\.(.*)", "\\1", fname) # e.g. H3k4me1 if(!grepl("^H|Dnase|Control", hismark, ignore.case=T)) next # no Pol2 and CTCF (only Histone and Control) # read each histone modification signals tab5rows <- read.table(fname, nrows = 200) classes <- sapply(tab5rows, class) histdata=read.table(fname, comment.char ="", colClasses = classes, nrows=89418) if(expressionDatatype=='TSS-based') { ## ---for TSS-based data histdata = histdata[,c(1:44)] # take maximally till 41bins #histdata = unique(histdata[,c(-2, -3)]) # slow histdata=histdata[match(unique(histdata[,1]), histdata[,1]),c(-2, -3)] ## using geneID.start as ID histdata = data.frame(histdata[,-1], row.names=histdata[,1]) # only numeric data for histone singla. e.g. Nx40 } if(expressionDatatype=='Tx-based'){ ## ---for Tx-based data # take all 81 bins histdata=histdata[match(unique(histdata[,2]), histdata[,2]),c(-1, -3)] histdata = data.frame(histdata[,-1], row.names=histdata[,1]) # only numeric data for histone singla. e.g. Nx80 } ptm <- proc.time() # intersection: subset of RNAseq (unique tssid) -- for training set itsid=intersect(rownames(RNAseq1), rownames(histdata)) RNAseq0 = RNAseq1[match(itsid, rownames(RNAseq1)), experiment2] + PSEUDOCOUNT # RNAseq0 is a vector now histdata0 = histdata[match(itsid, rownames(histdata)), ] # get optimized pseudocounts based on RNAseq1 pseudocount = apply(histdata0, 2, function(x) getBestPseudocount(x, RNAseq0)) # training set for classification RNAseq_training=log2(RNAseq0) normalizedCpG_training=RNAseq1[match(itsid, rownames(RNAseq1)), 'normalizedCpG'] histone_training=log2(t(apply(histdata0, 1, function(x) x+pseudocount))) # apply pseudocount to the remain 2/3 dataset, and do log2 transformation on histone signal and RNAseq itsid=intersect(rownames(RNAseq2), rownames(histdata)) RNAseq0 = log2(RNAseq2[match(itsid, rownames(RNAseq2)), experiment2] + PSEUDOCOUNT) # RNAseq0 is a vector now, actual RNAseq for further analysis histdata0 = log2(t(apply(histdata[match(itsid, rownames(histdata)), ], 1, function(x) x+pseudocount))) normalizedCpG0 = RNAseq2[match(itsid, rownames(RNAseq2)), 'normalizedCpG'] print(paste(fname, CpG, bM, nrow(histdata0), length(RNAseq0),(proc.time() - ptm)[[3]])); # get the mean density of bins selected by method of 'binning method' mean.density = getBestBinsMean(binningMethod='bestbin', Xs=histdata0, y=RNAseq0, nfold=nfold) # mean.density is a vector. mean.density = data.frame(mean.density, row.names=itsid) # re-order rows by the original order. Necessary? colnames(mean.density) = hismark if(is.null(dataX)) {dataX=mean.density;} else {dataX=cbind(dataX, mean.density);} #fname = filenames[grep(cellline, filenames)][ } if(is.null(dataX)) {print(paste("no data found for cellline", cellline));quit('no');} dataX=cbind(normalizedCpG=normalizedCpG0, dataX) dataX2=dataX RNAseq02=RNAseq0 } RNAseq1=RNAseqA[random, ] RNAseq2=RNAseqA[-random, ] dataX=c() for (fname in filenames[grep(cellline, filenames)]) { hismark = sub(".*\\.(.*)", "\\1", fname) # e.g. H3k4me1 if(!grepl("^H|Dnase|Control", hismark, ignore.case=T)) next # no Pol2 and CTCF (only Histone and Control) # read each histone modification signals tab5rows <- read.table(fname, nrows = 200) classes <- sapply(tab5rows, class) histdata=read.table(fname, comment.char ="", colClasses = classes, nrows=89418) if(expressionDatatype=='TSS-based') { ## ---for TSS-based data histdata = histdata[,c(1:44)] # take maximally till 41bins #histdata = unique(histdata[,c(-2, -3)]) # slow histdata=histdata[match(unique(histdata[,1]), histdata[,1]),c(-2, -3)] ## using geneID.start as ID histdata = data.frame(histdata[,-1], row.names=histdata[,1]) # only numeric data for histone singla. e.g. Nx40 } if(expressionDatatype=='Tx-based'){ ## ---for Tx-based data # take all 81 bins histdata=histdata[match(unique(histdata[,2]), histdata[,2]),c(-1, -3)] histdata = data.frame(histdata[,-1], row.names=histdata[,1]) # only numeric data for histone singla. e.g. Nx80 } ptm <- proc.time() # intersection: subset of RNAseq (unique tssid) -- for training set itsid=intersect(rownames(RNAseq1), rownames(histdata)) RNAseq0 = RNAseq1[match(itsid, rownames(RNAseq1)), experiment] + PSEUDOCOUNT # RNAseq0 is a vector now histdata0 = histdata[match(itsid, rownames(histdata)), ] # get optimized pseudocounts based on RNAseq1 pseudocount = apply(histdata0, 2, function(x) getBestPseudocount(x, RNAseq0)) # training set for classification RNAseq_training=log2(RNAseq0) normalizedCpG_training=RNAseq1[match(itsid, rownames(RNAseq1)), 'normalizedCpG'] histone_training=log2(t(apply(histdata0, 1, function(x) x+pseudocount))) # [optional] compare the individual R for best pseudocount and fixed pseudocount pdf(paste(pathname,"figures/fig",paste("individualR", expressionDatatype, experiment, hismark, CpG, bM, "pdf", sep="."), sep="/"), width=12, height=6) par(mar=c(4, 5, 4, 5) + 0.1) individualR1=cor(histone_training, RNAseq_training) individualR2=cor(log2(t(apply(histdata0, 1, function(x) x+0.001))), RNAseq_training) plot(individualR2, type='b', col='gray', ylim=range(c(individualR1, individualR2)), xlab="bins", ylab="individual R", main=paste("Comparison of the individual R for different pseudocount", paste(expressionDatatype, experiment,hismark, CpG, bM, sep="."),sep="\n")) lines(individualR1, type='b', pch=19) individualSignal=apply(histdata0, 2, mean) abline(v=c(which.max(abs(individualR1)), which.max(abs(individualSignal))), lty=2, col=c('black', 'blue')) #individualSignal = (individualSignal-min(individualSignal))*(max(individualR1, individualR2) - min(individualR1, individualR2))/(max(individualSignal)-min(individualSignal)) + min(individualR1, individualR2) # draw the other Y axis (Tips: http://onertipaday.blogspot.com/2007/05/how-to-draw-plot-with-two-y-axises-and.html) par(usr=c(par("usr")[1:2], min(individualSignal)-0.1*(max(individualSignal)-min(individualSignal)), max(individualSignal)+0.1*(max(individualSignal)-min(individualSignal)))) lines(individualSignal, type='b', pch=20, col='blue') axis(4, col.axis='blue') # manually label the other Y axis mtext(side=4, line=2, 'mean signal', col='blue') legend('topleft', c("optimized pseudocount", "fixed pseudocount (0.001)", "mean signal"), pch=c(19, 1, 20), col=c('black', 'gray', 'blue'), lty=1) dev.off() # apply pseudocount to the remain 2/3 dataset, and do log2 transformation on histone signal and RNAseq itsid=intersect(rownames(RNAseq2), rownames(histdata)) RNAseq0 = log2(RNAseq2[match(itsid, rownames(RNAseq2)), experiment] + PSEUDOCOUNT) # RNAseq0 is a vector now, actual RNAseq for further analysis histdata0 = log2(t(apply(histdata[match(itsid, rownames(histdata)), ], 1, function(x) x+pseudocount))) normalizedCpG0 = RNAseq2[match(itsid, rownames(RNAseq2)), 'normalizedCpG'] print(paste(fname, CpG, bM, nrow(histdata0), length(RNAseq0),(proc.time() - ptm)[[3]])); # get the mean density of bins selected by method of 'binning method' mean.density = getBestBinsMean(binningMethod='bestbin', Xs=histdata0, y=RNAseq0, nfold=nfold) # mean.density is a vector. mean.density = data.frame(mean.density, row.names=itsid) # re-order rows by the original order. Necessary? colnames(mean.density) = hismark if(is.null(dataX)) {dataX=mean.density;} else {dataX=cbind(dataX, mean.density);} #fname = filenames[grep(cellline, filenames)][ } if(is.null(dataX)) {print(paste("no data found for cellline", cellline));quit('no');} # add normalizedCpG dataX=cbind(normalizedCpG=normalizedCpG0, dataX) print("Chromatin feature construction done!") ########## clustering #x=dataX #x=cbind(cl=kmeans(x, 4)$cluster, expression=RNAseq0, x) #x=x[order(x$cl),] #annotation=data.frame(expression=x[,ncol(x)], row.names=rownames(x)) #ann_colors = list(expression=c("lightgreen", "navy")) #png(paste(pathname,"figures/fig",paste("clustering", expressionDatatype, experiment,'png', sep="."), sep="/"), width=1200, height=300); #library(pheatmap) #pheatmap(t(x[,c(-1, -2)]),cluster_cols=F, cluster_rows=T,border_color=NA, show_colnames=F, annotation=annotation, annotation_colors=ann_colors); #dev.off(); # ================================================== # regression model # ================================================== TXTfile =paste(pathname, paste(expressionDatatype, experiment, CpG, bM, nfold, "results.txt", sep="."), sep="") HTMLfile=paste(pathname, paste(expressionDatatype, experiment, CpG, bM, nfold, "results.html", sep="."), sep="") if(file.exists(TXTfile)) file.remove(TXTfile) if(file.exists(HTMLfile)) file.remove(HTMLfile) for(filtermark in c(0:9)) { # 8: DnaseI + histones; if(filtermark==0) filter_regexp="" # 0: all hisone marks | Control if(filtermark==1) filter_regexp="^H|Control" # 1: Just promoter marks; if(filtermark==2) filter_regexp="H3k4me2|H3k4me3|H2az|H3k9ac|H3k27ac" # 2: Just structural marks; if(filtermark==3) filter_regexp="H3k36me3|H3k79me2" #3: Just "Other/Distal" marks; if(filtermark==4) filter_regexp="H3k4me1|H4k20me1|H3k9me1" # 8: only repressive mark if(filtermark==5) filter_regexp="H3k27me3|H3k9me3" # 4: Promoter+Structural marks; if(filtermark==6) filter_regexp="H3k4me2|H3k4me3|H2az|H3k9ac|H3k27ac|H3k36me3|H3k79me2" # 5: Promoter+Repressive marks; if(filtermark==7) filter_regexp="H3k4me2|H3k4me3|H2az|H3k9ac|H3k27ac|H3k27me3|H3k9me3" # 6: Promoter+Other marks; if(filtermark==8) filter_regexp="H3k4me2|H3k4me3|H2az|H3k9ac|H3k27ac|H3k4me1|H4k20me1|H3k9me1" # 9: all marks but not repressive marks; if(filtermark==9) filter_regexp="H3k4me2|H3k4me3|H2az|H3k9ac|H3k27ac|H3k4me1|H4k20me1|H3k9me1|H3k36me3|H3k79me2|Control" index=grep(filter_regexp, colnames(dataX)) if(length(index)<1) next; # to print? datax = dataX[, index, drop=F] # for cross-cellline comparison datax2=datax if(!is.null(experiment2)){ index2=grep(filter_regexp, colnames(dataX2)) if(length(index2)<1) next; datax2 = dataX2[, index2, drop=F] } for(model in c('lm', 'mars', 'randomForest')) { print(paste("filtermark",filtermark, "; regressionModel:", model, "; nrow(datax):", nrow(datax), sep="")) DATAfile=paste(pathname, "data", paste(expressionDatatype, experiment, CpG, bM, model, filtermark, nfold, "tab", sep="."), sep="/") if(file.exists(DATAfile)) file.remove(DATAfile) # ================================================== # classification & regression model # ================================================== if(is.null(experiment2)) out = do_classification_regression(Xs=datax, Y=RNAseq0, model_C='randomForest', model_R=model, nfold=nfold, shuffleY=F, shuffleX=F, swapX=F, zero=LOW_SIGNAL_CUTOFF) else out = do_classification_regression_crossCellline(Xs=datax2, Y=RNAseq02, Xb=datax, Yb=RNAseq0, model_C='randomForest', model_R=model, nfold=nfold, zero=LOW_SIGNAL_CUTOFF) # use model trained on Xb/Yb to predict for dataset Xs/Y # ================================================== # measurement for overall prediction, classification and regression # ================================================== measured = out[[1]]; predicted = out[[2]] topX_C = out[[3]]; topX_R = out[[4]] predicted_C = out[[5]]; predicted_R = out[[6]]; relImp_C_metrics=out[[7]]; relImp_R_metrics=out[[8]] measured_R_NZ = out[[9]]; predicted_R_NZ = out[[10]] R = cor(measured, predicted) pvalue = cor.test(measured, predicted, alternative = "greater")$p.value RMSE=rmse(measured, predicted) #NRMSE = RMSE / (max(measured) - min(measured)) #normalized RMSE NRMSE = 1 - (RMSE^2)/mean((measured-mean(measured))^2) individualR = cor(datax, rankit(RNAseq0))[,1] cols=rep("#0000ff22", length(predicted)) #cols[predicted==log2(0.03)]="#00000011" # mark all0 cases as red #cols[match(intersect(rownames(datax), names(all0)[all0]), rownames(datax))]="#ff000022" info=paste("Pearson's r =", format.value(R), "(p-value:", format.value(pvalue) , ")\nRMSE =", format.value(RMSE), "(R^2=", format.value(NRMSE), ")") # measurement for classification and regression library(caTools, verbose = FALSE) diagnostics.c.url=paste("diagnostics.classifier", expressionDatatype, experiment, CpG, bM, model, filtermark, nfold, "png", sep=".") png(paste(pathname,"figures/fig",diagnostics.c.url, sep="/"), width=600, height=600) AUC=colAUC(measured, ifelse(predicted_C>=0.5, 1, 0), plotROC=T)[1] # AUC for the classification dev.off() diagnostics.c.url=paste(", C", sep="") # standard regr ession diagnostics (4-up) -- ONLY FOR REGRESSION rR=cor(measured_R_NZ, predicted_R_NZ); #pR=format.value(cor.test(measured, out[[6]], alternative = "greater")$p.value) rmseR=rmse(measured_R_NZ, predicted_R_NZ) nrmseR= 1 - (rmseR^2)/mean((measured_R_NZ-mean(measured_R_NZ))^2) diagnostics.r.url=paste("diagnostics.regressor", expressionDatatype, experiment, CpG, bM, model, filtermark, nfold, "png", sep="."); png(paste(pathname,"figures/fig",diagnostics.r.url, sep="/"), width=800, height=800) oldpar <- par(mfrow = c(2, 2)) plot(lm(measured_R_NZ~predicted_R_NZ)) par(oldpar) dev.off() diagnostics.r.url=paste(", R", sep="") otherinfo =paste("Classification: AUC=", round(AUC, 2), "\nRegression: r=", round(rR, 2),"(RMSE =", round(rmseR, 2), "; R^2 =",round(nrmseR, 2),")") # ================================================== # randomization analysis # ================================================== randomization_info="" if(is.null(experiment2)) { # Randomization1: shuffled Y (not Xs) out = do_classification_regression(Xs=datax, Y=RNAseq0, model_C='randomForest', model_R=model, nfold=1, shuffleY=T, shuffleX=F, swapX=F, zero=LOW_SIGNAL_CUTOFF) R_randomization=round(cor(out[[1]], out[[2]]), 2); RMSE_randomization=round(rmse(out[[1]], out[[2]]), 2); types_randomization="Shuffle Y" # Randomization2: shuffled Xs independaly (not Y) out = do_classification_regression(Xs=datax, Y=RNAseq0, model_C='randomForest', model_R=model, nfold=1, shuffleY=F, shuffleX=T, swapX=F, zero=LOW_SIGNAL_CUTOFF) R_randomization=c(R_randomization, round(cor(out[[1]], out[[2]]), 2)); RMSE_randomization=c(RMSE_randomization, round(rmse(out[[1]], out[[2]]), 2)); types_randomization=c(types_randomization, "Shuffle Xs") # Randomization3: swap X lables out = do_classification_regression(Xs=datax, Y=RNAseq0, model_C='randomForest', model_R=model, nfold=1, shuffleY=F, shuffleX=F, swapX=T, zero=LOW_SIGNAL_CUTOFF) R_randomization=c(R_randomization, round(cor(out[[1]], out[[2]]), 2)); RMSE_randomization=c(RMSE_randomization, round(rmse(out[[1]], out[[2]]), 2)); types_randomization=c(types_randomization, "Swap Xs lables") randomization_info=paste("Pearson's r (RMSE) of randomization tests","----------------------------------",paste(types_randomization,": ",R_randomization," (",RMSE_randomization,")", sep="",collapse="\n"), sep="\n") } # ================================================== # draw figures: scatterplot, variable importance # ================================================== # PNG version of transparent version main = paste(paste(label.tr(experiment), " (",CpG,")", sep=""), paste(expressionDatatype, bM, model, paste(nfold, "fold-CV", sep=""), sep="."), sep="\n") if(!is.null(experiment2)) main=paste(paste(cellline,"-->",cellline2,"(", experiment,")", sep=""), paste(expressionDatatype, CpG, bM, model, paste(nfold, "fold-CV", sep=""), sep="."), sep="\n") if(!is.null(experiment2)) filename = paste("crossCellline", expressionDatatype, experiment, "-", experiment2, CpG, bM, model, filtermark, nfold, sep=".") else filename = paste(expressionDatatype, label.tr(experiment), CpG, bM, model, filtermark, nfold, sep=".") draw.figure(mear=measured, pred=predicted, topX_C=topX_C, topX_R=topX_R, relImp_C_metrics, relImp_R_metrics, dotColor=cols, info=info, otherinfo=otherinfo, randomization_info=randomization_info, main=main, path=paste(pathname,"figures/png", sep="/"),filename=filename,filetype="png", smoothScatter=F) # PDF version #source("mylib.R") draw.figure(mear=measured, pred=predicted, topX_C=topX_C, topX_R=topX_R, relImp_C_metrics, relImp_R_metrics, dotColor=cols, info=info, otherinfo=otherinfo, randomization_info=randomization_info, main=main, path=paste(pathname,"figures/pdf", sep="/"),filename=filename,filetype="pdf", smoothScatter=F) # ================================================== # binned by expression values (only for lm model) # ================================================== binimgurl=""; if(model=='lm' & filtermark==0) { relimp.bin.by.expression(Xs=datax, Y=RNAseq0, model='lm', nfold=1, N=10, CpGset=RNAseq2, main=paste(paste(label.tr(experiment), " (",CpG,")", sep=""), paste(expressionDatatype, bM, model, paste(nfold, "fold-CV", sep=""), sep="."), sep="\n"), filename=paste(pathname,"figures/fig",paste("relimp_by_expressionBins", expressionDatatype, experiment, CpG, bM, model, filtermark, nfold, 'low2high', "png", sep="."), sep="/"), decreasing=F) binimgurl=paste("
S1", sep="") relimp.bin.by.expression(Xs=datax, Y=RNAseq0, model='lm', nfold=1, N=10, CpGset=RNAseq2, main=paste(paste(label.tr(experiment), " (",CpG,")", sep=""), paste(expressionDatatype, bM, model, paste(nfold, "fold-CV", sep=""), sep="."), sep="\n"), filename=paste(pathname,"figures/fig",paste("relimp_by_expressionBins", expressionDatatype, experiment, CpG, bM, model, filtermark, nfold, 'high2low', "png", sep="."), sep="/"), decreasing=T) binimgurl=paste(binimgurl, ", S2", sep="") relimp.bin.by.expression(Xs=datax, Y=RNAseq0, model='lm', nfold=1, N=10, CpGset=RNAseq2, main=paste(paste(label.tr(experiment), " (",CpG,")", sep=""), paste(expressionDatatype, bM, model, paste(nfold, "fold-CV", sep=""), sep="."), sep="\n"), filename=paste(pathname,"figures/fig",paste("relimp_by_expressionBins", expressionDatatype, experiment, CpG, bM, model, filtermark, nfold, 'bined', "pdf", sep="."), sep="/"), accumulative=F) binimgurl=paste(binimgurl, ", S3", sep="") relimp.randomSampling(Xs=datax, Y=RNAseq0, model='lm', nfold=1, N=10, CpGset=RNAseq2, main=paste(paste(label.tr(experiment), " (",CpG,")", sep=""), paste(expressionDatatype, bM, model, paste(nfold, "fold-CV", sep=""), sep="."), sep="\n"), filename=paste(pathname,"figures/fig",paste("relimp_randomSampling", expressionDatatype, experiment, CpG, bM, model, filtermark, nfold, "png", sep="."), sep="/")) binimgurl=paste(binimgurl, ", R", sep="") } # ================================================== ## save result into HTML/TXT # ================================================== topX_R=topX_R[order(-topX_R)] # sort in increasing order topX_C=topX_C[names(topX_R)] # sort in order of topX_R individualR=individualR[names(topX_R)] # sort in order of topX_R res=c(expressionDatatype, experiment, tech, extract, cellline, compartment, CpG, bM, model, filtermark, format.value(R, 'table'), format.value(pvalue, 'table'), format.value(RMSE, 'table'), format.value(NRMSE, 'table'), format.value(AUC, 'table'), format.value(rR, 'table'), format.value(rmseR, 'table'), paste(names(topX_R),sep="",collapse=";"), paste(round(topX_C, 3),sep="",collapse=";"), paste(round(topX_R, 3),sep="",collapse=";"), paste(round(individualR, 2),sep="",collapse=";")) write.table(t(res), file=TXTfile, append =T, quote=F, sep="\t", row.names=F, col.names=F) # write data.frame to text txturl="" if(nfold>1){ dataset=data.frame(RNAseq2[match(rownames(datax), rownames(RNAseq2)),c('gene_id','gene_type','gene_name')], datax, RNAseq0, measured=measured, predicted=predicted, predicted_C=predicted_C, predicted_R=predicted_R) # be aware of the same gene order between columns colnames(dataset) = c('gene_id','gene_type','gene_name', colnames(datax), experiment, "measured_log2", "predicted_log2", "predicted_C", "predicted_R"); write.table(format(dataset, digits = 4, nsmall=3, scientific=F, trim=T, drop0trailing=T), file=DATAfile, quote=F, sep="\t", row.names=F, col.names=T) txturl = paste(", dataset", sep="") } # write to HTML imgurl = paste("figures/png", paste(expressionDatatype, label.tr(experiment), CpG, bM, model, filtermark, nfold, "png", sep="."), sep="/") pdfurl = paste("figures/pdf", paste(expressionDatatype, label.tr(experiment), CpG, bM, model, filtermark, nfold, "pdf", sep="."), sep="/") imghtml=""; individualRhtml=paste(names(individualR),sep="",collapse="
") histurl="" if(filtermark==0) { imghtml = paste("
Click for larger image

", sep="") individualRhtml=paste(paste("", names(individualR),"", sep=""), sep="",collapse="
") histurl = paste(", H", sep="") } cat("\n", file=HTMLfile, append=T) cat(expressionDatatype, experiment, tech, extract, cellline, compartment, CpG, bM, model, filtermark, round(R, 2), round(RMSE,2 ), round(NRMSE,2), round(AUC, 2), round(rR,2), round(rmseR,2), round(nrmseR,2), individualRhtml, paste(round(topX_C, 3),sep="",collapse="
"), paste(round(topX_R, 3),sep="",collapse="
"), paste(round(individualR, 2),sep="",collapse="
"), paste(imghtml, "Download: PDF, PNG", txturl, histurl, binimgurl, diagnostics.c.url, diagnostics.r.url, sep=""), sep="\n", file=HTMLfile, append=T) cat("\n\n", file=HTMLfile, append=T) } }