# =========================================================================================
# R script for calculating the corration between predicted and measured expression values for a specific experiment
# Author: xianjun dong
# Date: Sat Jul 23 20:52:53 EDT 2011
# Usage:
# Rscript forNature.correlation.comparison.R rpkc Tx-based all
# ## to compare the precition across cell lines
# Rscript forNature.correlation.comparison.R cp1w TSS-based all cpkw
# =========================================================================================
# -------------- LOG ----------------
# 1. configure to run on cluster, and use commandArgs to accept parameters
# 2. apply binning method to each sub-group of genes, e.g. LCP/HCP may have different best bins.
# 3. merge TSS-based and Tx-based into one script
# 4. divide histone modification into categories
# 5. remove low-expressed genes (e.g. RNAseq > 0)
# 6. merge replicates,e.g. "cp1w1" +"cp1w2" = cp1w
# 7. change to use Anshul's normalized histone data
# 8. one experiment each time
# 9. select the representative transcript for each gene (e.g. the one with strongest expression signal)
# 11. fix a bug: bestpseudo=pseudos[which.max(sapply(pseudos, function(x) cor(log2(x+X), log2(Y))))] --> bestpseudo=pseudos[which.max(abs(sapply(pseudos, function(x) cor(log2(x+X), log2(Y)))))]
# 12. background correlation check: random shuffle the Y (not Xs) to redo the regression
# 13. output studentlized residual (in the dataset file)
# 14. randomization test by shuffling the Y lables
# 15. check the different prediction for high/low expressed genes (for lm model only)
# 16. check the different prediction for randomly sampled genes (for lm model only)
# 17. fix a bug: change to calculate individualR by testset, rather than trainset
# 18. change to use 81 bins globally
# 19. fix Nhek/Nhek26 issue in Tx file (2012.03.25)
# 20. add "all but no repressive" category (2012.03.25)
# -------------- LOG ----------------
# ===============================================================
# configure the script
# ===============================================================
source("mylib.R")
# install these lib if it's not in the environment
#install.packages(c('relaimpo','earth', 'randomForest','e1071', 'caTools'))
args <- commandArgs(TRUE)
experiment=args[1] #
expressionDatatype=args[2] # 'TSS-based' or 'Tx-based'
CpG=args[3]
experiment2=args[4] # cpkc vs. cpgc
if(is.na(experiment2)) experiment2=NULL
# for debug
#source("mylib.R"); experiment='rpmc'; CpG='all'; expressionDatatype='Tx-based'; experiment2='rp1c'
#source("mylib.R"); experiment='cp1w'; CpG='all'; expressionDatatype='TSS-based'; experiment2=NULL
# decode the brief code of experiment (see mylib.R for more detail)
tech=strsplit(label.tr(experiment), split="\\.")[[1]][1]
extract=strsplit(label.tr(experiment), split="\\.")[[1]][2]
cellline=strsplit(label.tr(experiment), split="\\.")[[1]][3]
if(!is.null(experiment2)) cellline2=strsplit(label.tr(experiment2), split="\\.")[[1]][3]
compartment=strsplit(label.tr(experiment), split="\\.")[[1]][4]
# comment/edit this if you want to run on other cell lines
if(!grepl("K562|Gm12878|H1hesc|Helas3|Hepg2|Huvec|Nhek", cellline)){print(paste("cellline ignored:", cellline));quit('no');}
PSEUDOCOUNT=0.03
nfold=10
bM='bestbin'
LOW_SIGNAL_CUTOFF = log2(0.03)
transformation='log'
BINFILE_FOLDER = "./data/Jan11.Anshul.all.81bins.level12.L4100.HisoneDnase"; # folder path for binned density for all chromatin features
genefile="./data/gencode_v7_hg19_transcripts.gtf.cpg.tab" # gene annotation
pathname="./result/"; # path to store output files
genes=read.table(genefile, header = T)
genes=genes[(genes$end-genes$start)>4100 & genes$level<3, ]
if (file.exists(pathname) == FALSE) dir.create(pathname, showWarnings=F, recursive=T)
dir.create(paste(pathname, "data", sep="/"), showWarnings=F, recursive=T)
dir.create(paste(pathname, "data/activeTSS", sep="/"), showWarnings=F, recursive=T)
dir.create(paste(pathname, "figures/pdf", sep="/"), showWarnings=F, recursive=T)
dir.create(paste(pathname, "figures/png", sep="/"), showWarnings=F, recursive=T)
dir.create(paste(pathname, "figures/fig", sep="/"), showWarnings=F, recursive=T)
# ===============================================================
# processing the input data
# ===============================================================
DATA = paste("./data", paste(expressionDatatype,"RData", sep="."), sep="/")
if(file.exists(DATA)){
load(DATA)
} else
{
# -------------- read expression data (Y) -----------------
if(expressionDatatype == 'TSS-based')
{
data01 <- read.table("./data/Gencodev7_TSS_July2011.gff")
RNAseq=data01[,c(1,2,4,5,7, seq(10, ncol(data01), 2))]
names(RNAseq)=c("chr","source","TSS","TTS", "strand",as.vector(t(data01[1,seq(9, ncol(data01), 2)]))) # TTS is actually TSS for TSS-based files
RNAseq=RNAseq[,c(colnames(RNAseq)[grep("^[cdr].{3}[0-9]",colnames(RNAseq), invert=T)], sort(colnames(RNAseq)[grep("^[cdr].{3}[0-9]",colnames(RNAseq))]))]
RNAseq=RNAseq[RNAseq$confidence!='low',] #remove 'low' confidence ones (~5000 removed)
RNAseq=unique(RNAseq[,grep("source|confidence", names(RNAseq), invert=T)]) # [Optional] remove columan 'source' and 'confidence', and take the uniq TSS ones. (another 4000 removed)
# TSS ID (e.g. geneid.start) NB: This should be transID for Tx-based data;
rownames(RNAseq)=do.call("paste",c(RNAseq[c('gene_id','TSS')], sep="."))
y=unique(as.data.frame(cbind(TSS_id=paste(genes$gene_id, ifelse(genes$strand=='-', genes$end, genes$start), sep="."), normalizedCpG=genes$normalizedCpG))) # TSS ID (e.g. geneid.start) NB: This should be transID for Tx-based data;
RNAseq=cbind(RNAseq[match(intersect(rownames(RNAseq), y$TSS_id),rownames(RNAseq)),], normalizedCpG=y[match(intersect(rownames(RNAseq), y$TSS_id),y$TSS_id),2])
RNAseq$normalizedCpG=as.numeric(as.character(RNAseq$normalizedCpG)) # de-factor
# TILL NOW, there are 51341 lines in data, unique TSS with level=1/2 (and medium/high in Sarah's file) and length>4000
}
if(expressionDatatype == 'Tx-based')
{
data01 <- read.table("./data/gencode_v7_hg19_tr_with115_cshl_long_quantif.gff")
RNAseq=data01[,c(1,2,4,5,7, seq(10, ncol(data01), 2))]
names(RNAseq)=c("chr","source","TSS","TTS", "strand",as.vector(t(data01[1,seq(9, ncol(data01), 2)]))) # TTS is actually TSS for TSS-based files
RNAseq=RNAseq[,c(colnames(RNAseq)[grep("^[cdr].{3}[0-9]",colnames(RNAseq), invert=T)], sort(colnames(RNAseq)[grep("^[cdr].{3}[0-9]",colnames(RNAseq))]))]
rownames(RNAseq)=RNAseq[,'transcript_id']
RNAseq=cbind(RNAseq[match(intersect(rownames(RNAseq), genes$trans_id),rownames(RNAseq)),], normalizedCpG=genes[match(intersect(rownames(RNAseq), genes$trans_id),genes$trans_id),'normalizedCpG'])
RNAseq$normalizedCpG=as.numeric(as.character(RNAseq$normalizedCpG)) # de-factor
# TILL NOW, there are 61177 unique transcripts in data, with level=1/2 and length>4000
}
# merge replicates
uniqID=unique(sort(gsub("[0-9]$", "", colnames(RNAseq)[grep("^[cdr].{3}[0-9]", colnames(RNAseq))])))
RNAseq_merged=RNAseq[, grep("^[cdr].{3}[0-9]", colnames(RNAseq), invert=T)]
RNAseq_replicates=c();
for(expr in uniqID){
if(length(grep(expr, colnames(RNAseq)))==1) RNAseq_merged = cbind(RNAseq_merged, RNAseq[,grep(expr, colnames(RNAseq))])
else RNAseq_merged = cbind(RNAseq_merged, apply(RNAseq[,grep(expr, colnames(RNAseq))], 1, mean))
if(length(grep(expr, colnames(RNAseq)))==2) {
if(is.null(RNAseq_replicates)) RNAseq_replicates=RNAseq[,grep(expr, colnames(RNAseq))]
else RNAseq_replicates=cbind(RNAseq_replicates,RNAseq[,grep(expr, colnames(RNAseq))])
}
}
colnames(RNAseq_merged) = c(colnames(RNAseq)[grep("^[cdr].{3}[0-9]", colnames(RNAseq), invert=T)], uniqID)
RNAseq=RNAseq_merged
save(RNAseq, RNAseq_replicates, file=DATA)
}
## only genes >0 expression in at least one cell line (e.g. remove genes which are 0 in all cell lines), for the same Tech/RNAextract/Compartment
#exprt=unlist(strsplit(experiment, ""))
#exprt[3]='.*'
#cpcindex=grep(paste(exprt, collapse=""), colnames(RNAseq))
#cpc=RNAseq[, cpcindex]
##all0=ifelse(is.null(nrow(cpc)), cpc==0, apply(cpc>0, 1, sum)==0)
#if(is.null(nrow(cpc))) index=cpc>0 else index=apply(cpc>0, 1, sum)>0;
#if(is.null(nrow(cpc))) all0=cpc==0 else all0=apply(cpc>0, 1, sum)==0;
#if(is.null(nrow(cpc))) cpcnames=experiment else cpcnames=colnames(cpc)
#
## active TSS (for Chao/Ewan)
#exprt[3]="_"
#write.table(RNAseq[index,c(1:9,cpcindex)], paste(pathname, "data/activeTSS",paste(ifelse(grepl("TSS", expressionDatatype),"activeTSS", "activeTx"), "in", paste(exprt, collapse=""),"tab", sep="."), sep="/"), quote=F, sep="\t", row.names=F, col.names=T)
# remove those TSS with 0 in all RNAseq experiments
index=apply(RNAseq[, grep("^r.{3}", colnames(RNAseq))]>0, 1, sum)>0
TSSsubset=paste(pathname, "data/activeTSS", paste(expressionDatatype, "with_at_least_one_active_RNAseq_experiments", "tab", sep="."), sep="/")
if (file.exists(TSSsubset) == FALSE) write.table(RNAseq[index,], TSSsubset, quote=F, sep="\t", row.names = F)
## zero vs. non-zero
#X=apply(RNAseq[, grep("^[crd].{3}", colnames(RNAseq))], 2, function(x) table(x==0)/nrow(RNAseq))
##X=X[,order(X[1,])]
#png("zero.vs.nonzero.png", width=1500, height=1500);
#par(xpd=T, mar=c(5,4,4,4), mfrow=c(3,1));
#barplot(X, las=2, space=0.1, width=1, col=c('black','gray'), main="all TSS", cex.name=1.2);
#legend(1.1*ncol(X)+0.1,1,c(">0","=0"), col=c('black','gray'), pch=15, cex=1.5);
#
#index=apply(RNAseq[, grep("^c.{3}", colnames(RNAseq))]>0, 1, sum)>0
#X1=apply(RNAseq[index, grep("^[crd].{3}", colnames(RNAseq))], 2, function(x) table(x==0)/sum(index));
#X=X1[,colnames(X)]
#barplot(X, las=2, space=0.1, width=1, col=c('black','gray'), main="exclude TSS with zero in all CAGE", cex.name=1.2);
#
#index=apply(RNAseq[, grep("^r.{3}", colnames(RNAseq))]>0, 1, sum)>0
#X1=apply(RNAseq[index, grep("^[crd].{3}", colnames(RNAseq))], 2, function(x) table(x==0)/sum(index));
#X=X1[,colnames(X)]
#barplot(X, las=2, space=0.1, width=1, col=c('black','gray'), main="exclude TSS with zero in all RNAseq", cex.name=1.2);
#
#dev.off()
# [optional] histogram for expression and CpG score
Y=RNAseq[,experiment]
png(paste(pathname, "figures/fig",paste("expression.hist", experiment,"png", sep="."), sep="/"), width=1200, height=800)
par(mfrow=c(2,3))
hist(log2(Y+0.001), main=paste(label.tr(experiment), "\n(all TSS)"), xlab = "log2(expression)", cex.main=2)
legend("topright",paste(round(100*sum(Y==0)/length(Y),1), "% zero", sep=""), cex=1.5, bty='n')
h=hist(log2(Y[Y>0]+0.001), plot=F)
barplot(h$counts, names.arg=h$mids, space=0.1, cex.lab=1.5, width=1, col=8-6*localMin(h$counts), xlab="log2(expression>0)", ylab="counts",cex.main=2, main=paste(label.tr(experiment),"\n(active TSS)"))
x=tail(which(localMin(h$counts)))
if(length(x)>0) text(x=x,y=h$counts[x], label=format(h$mids[x]), po=3, cex=0.7, col='red')
#hist(log2(Y[index]+0.001), xlab="log2(expression>0 in at least one cellline)", ylab="counts", cex.lab=1.5, cex.main=2, main=paste("active TSS in at least one cell line", paste(cellline.tr(sapply(strsplit(cpcnames,""), function(x) x[3])), collapse=","), sep="\n"))
hist(log2(Y[index]+0.001), xlab="log2(expression>0 in at least one cellline)", ylab="counts", cex.lab=1.5, cex.main=2, main="active TSS in at least one RNAseq experiments")
legend("topright",paste(round(100*sum(Y[index]==0)/length(Y[index]),1), "% zero", sep=""), cex=1.5, bty='n')
normalizedCpG=RNAseq[,'normalizedCpG']
hist(normalizedCpG, freq = T, cex.lab=1.5)
hist(normalizedCpG[Y>0], freq = T, cex.lab=1.5,xlab="normalizedCpG(expression>0)")
hist(normalizedCpG[index], freq = T, cex.lab=1.5,xlab="normalizedCpG(expression>0 in at least one cellline)")
dev.off()
RNAseq=RNAseq[index, ]
if(CpG=='HCP'){
RNAseq = RNAseq[RNAseq$normalizedCpG > 0.4, ]
}
if(CpG=='LCP'){
RNAseq = RNAseq[RNAseq$normalizedCpG <= 0.4, ]
}
# add gene_name, gene_type
RNAseq=cbind(RNAseq, genes[match(RNAseq$gene_id, genes$gene_id), c('gene_type','gene_name')])
## select the representative transcript for each gene (e.g. the one with strongest expression signal)
RNAseqA=RNAseq[order(RNAseq$gene_id, -RNAseq[[experiment]]), c(experiment, 'gene_id', 'gene_type','gene_name', 'normalizedCpG')]
RNAseqA=RNAseqA[match(unique(RNAseqA$gene_id), RNAseqA$gene_id), ] # for rpkc experiment, 21414/89995 retained
### divide the gene set into training+prediction sets (for pseudocount optimization)
random=sample(nrow(RNAseqA), round(nrow(RNAseqA)/3))
# ===============================================================
# get signal for each factor (X)
# ===============================================================
filenames = paste(BINFILE_FOLDER, dir(BINFILE_FOLDER), sep="/")
# this part is for cross-cell line study.
if(!is.null(experiment2)){
## select the representative transcript for each gene (e.g. the one with strongest expression signal)
RNAseqB=RNAseq[order(RNAseq$gene_id, -RNAseq[[experiment2]]), c(experiment2, 'gene_id', 'gene_type','gene_name', 'normalizedCpG')]
RNAseqB=RNAseqB[match(unique(RNAseqB$gene_id), RNAseqB$gene_id), ] # for rpkc experiment, 21414/89995 retained
RNAseq1=RNAseqB[random, ]
RNAseq2=RNAseqB[-random, ]
dataX=c()
for (fname in filenames[grep(cellline2, filenames)])
{
hismark = sub(".*\\.(.*)", "\\1", fname) # e.g. H3k4me1
if(!grepl("^H|Dnase|Control", hismark, ignore.case=T)) next # no Pol2 and CTCF (only Histone and Control)
# read each histone modification signals
tab5rows <- read.table(fname, nrows = 200)
classes <- sapply(tab5rows, class)
histdata=read.table(fname, comment.char ="", colClasses = classes, nrows=89418)
if(expressionDatatype=='TSS-based') {
## ---for TSS-based data
histdata = histdata[,c(1:44)] # take maximally till 41bins
#histdata = unique(histdata[,c(-2, -3)]) # slow
histdata=histdata[match(unique(histdata[,1]), histdata[,1]),c(-2, -3)] ## using geneID.start as ID
histdata = data.frame(histdata[,-1], row.names=histdata[,1]) # only numeric data for histone singla. e.g. Nx40
}
if(expressionDatatype=='Tx-based'){
## ---for Tx-based data # take all 81 bins
histdata=histdata[match(unique(histdata[,2]), histdata[,2]),c(-1, -3)]
histdata = data.frame(histdata[,-1], row.names=histdata[,1]) # only numeric data for histone singla. e.g. Nx80
}
ptm <- proc.time()
# intersection: subset of RNAseq (unique tssid) -- for training set
itsid=intersect(rownames(RNAseq1), rownames(histdata))
RNAseq0 = RNAseq1[match(itsid, rownames(RNAseq1)), experiment2] + PSEUDOCOUNT # RNAseq0 is a vector now
histdata0 = histdata[match(itsid, rownames(histdata)), ]
# get optimized pseudocounts based on RNAseq1
pseudocount = apply(histdata0, 2, function(x) getBestPseudocount(x, RNAseq0))
# training set for classification
RNAseq_training=log2(RNAseq0)
normalizedCpG_training=RNAseq1[match(itsid, rownames(RNAseq1)), 'normalizedCpG']
histone_training=log2(t(apply(histdata0, 1, function(x) x+pseudocount)))
# apply pseudocount to the remain 2/3 dataset, and do log2 transformation on histone signal and RNAseq
itsid=intersect(rownames(RNAseq2), rownames(histdata))
RNAseq0 = log2(RNAseq2[match(itsid, rownames(RNAseq2)), experiment2] + PSEUDOCOUNT) # RNAseq0 is a vector now, actual RNAseq for further analysis
histdata0 = log2(t(apply(histdata[match(itsid, rownames(histdata)), ], 1, function(x) x+pseudocount)))
normalizedCpG0 = RNAseq2[match(itsid, rownames(RNAseq2)), 'normalizedCpG']
print(paste(fname, CpG, bM, nrow(histdata0), length(RNAseq0),(proc.time() - ptm)[[3]]));
# get the mean density of bins selected by method of 'binning method'
mean.density = getBestBinsMean(binningMethod='bestbin', Xs=histdata0, y=RNAseq0, nfold=nfold) # mean.density is a vector.
mean.density = data.frame(mean.density, row.names=itsid) # re-order rows by the original order. Necessary?
colnames(mean.density) = hismark
if(is.null(dataX)) {dataX=mean.density;} else {dataX=cbind(dataX, mean.density);}
#fname = filenames[grep(cellline, filenames)][
}
if(is.null(dataX)) {print(paste("no data found for cellline", cellline));quit('no');}
dataX=cbind(normalizedCpG=normalizedCpG0, dataX)
dataX2=dataX
RNAseq02=RNAseq0
}
RNAseq1=RNAseqA[random, ]
RNAseq2=RNAseqA[-random, ]
dataX=c()
for (fname in filenames[grep(cellline, filenames)])
{
hismark = sub(".*\\.(.*)", "\\1", fname) # e.g. H3k4me1
if(!grepl("^H|Dnase|Control", hismark, ignore.case=T)) next # no Pol2 and CTCF (only Histone and Control)
# read each histone modification signals
tab5rows <- read.table(fname, nrows = 200)
classes <- sapply(tab5rows, class)
histdata=read.table(fname, comment.char ="", colClasses = classes, nrows=89418)
if(expressionDatatype=='TSS-based') {
## ---for TSS-based data
histdata = histdata[,c(1:44)] # take maximally till 41bins
#histdata = unique(histdata[,c(-2, -3)]) # slow
histdata=histdata[match(unique(histdata[,1]), histdata[,1]),c(-2, -3)] ## using geneID.start as ID
histdata = data.frame(histdata[,-1], row.names=histdata[,1]) # only numeric data for histone singla. e.g. Nx40
}
if(expressionDatatype=='Tx-based'){
## ---for Tx-based data # take all 81 bins
histdata=histdata[match(unique(histdata[,2]), histdata[,2]),c(-1, -3)]
histdata = data.frame(histdata[,-1], row.names=histdata[,1]) # only numeric data for histone singla. e.g. Nx80
}
ptm <- proc.time()
# intersection: subset of RNAseq (unique tssid) -- for training set
itsid=intersect(rownames(RNAseq1), rownames(histdata))
RNAseq0 = RNAseq1[match(itsid, rownames(RNAseq1)), experiment] + PSEUDOCOUNT # RNAseq0 is a vector now
histdata0 = histdata[match(itsid, rownames(histdata)), ]
# get optimized pseudocounts based on RNAseq1
pseudocount = apply(histdata0, 2, function(x) getBestPseudocount(x, RNAseq0))
# training set for classification
RNAseq_training=log2(RNAseq0)
normalizedCpG_training=RNAseq1[match(itsid, rownames(RNAseq1)), 'normalizedCpG']
histone_training=log2(t(apply(histdata0, 1, function(x) x+pseudocount)))
# [optional] compare the individual R for best pseudocount and fixed pseudocount
pdf(paste(pathname,"figures/fig",paste("individualR", expressionDatatype, experiment, hismark, CpG, bM, "pdf", sep="."), sep="/"), width=12, height=6)
par(mar=c(4, 5, 4, 5) + 0.1)
individualR1=cor(histone_training, RNAseq_training)
individualR2=cor(log2(t(apply(histdata0, 1, function(x) x+0.001))), RNAseq_training)
plot(individualR2, type='b', col='gray', ylim=range(c(individualR1, individualR2)), xlab="bins", ylab="individual R", main=paste("Comparison of the individual R for different pseudocount", paste(expressionDatatype, experiment,hismark, CpG, bM, sep="."),sep="\n"))
lines(individualR1, type='b', pch=19)
individualSignal=apply(histdata0, 2, mean)
abline(v=c(which.max(abs(individualR1)), which.max(abs(individualSignal))), lty=2, col=c('black', 'blue'))
#individualSignal = (individualSignal-min(individualSignal))*(max(individualR1, individualR2) - min(individualR1, individualR2))/(max(individualSignal)-min(individualSignal)) + min(individualR1, individualR2)
# draw the other Y axis (Tips: http://onertipaday.blogspot.com/2007/05/how-to-draw-plot-with-two-y-axises-and.html)
par(usr=c(par("usr")[1:2], min(individualSignal)-0.1*(max(individualSignal)-min(individualSignal)), max(individualSignal)+0.1*(max(individualSignal)-min(individualSignal))))
lines(individualSignal, type='b', pch=20, col='blue')
axis(4, col.axis='blue')
# manually label the other Y axis
mtext(side=4, line=2, 'mean signal', col='blue')
legend('topleft', c("optimized pseudocount", "fixed pseudocount (0.001)", "mean signal"), pch=c(19, 1, 20), col=c('black', 'gray', 'blue'), lty=1)
dev.off()
# apply pseudocount to the remain 2/3 dataset, and do log2 transformation on histone signal and RNAseq
itsid=intersect(rownames(RNAseq2), rownames(histdata))
RNAseq0 = log2(RNAseq2[match(itsid, rownames(RNAseq2)), experiment] + PSEUDOCOUNT) # RNAseq0 is a vector now, actual RNAseq for further analysis
histdata0 = log2(t(apply(histdata[match(itsid, rownames(histdata)), ], 1, function(x) x+pseudocount)))
normalizedCpG0 = RNAseq2[match(itsid, rownames(RNAseq2)), 'normalizedCpG']
print(paste(fname, CpG, bM, nrow(histdata0), length(RNAseq0),(proc.time() - ptm)[[3]]));
# get the mean density of bins selected by method of 'binning method'
mean.density = getBestBinsMean(binningMethod='bestbin', Xs=histdata0, y=RNAseq0, nfold=nfold) # mean.density is a vector.
mean.density = data.frame(mean.density, row.names=itsid) # re-order rows by the original order. Necessary?
colnames(mean.density) = hismark
if(is.null(dataX)) {dataX=mean.density;} else {dataX=cbind(dataX, mean.density);}
#fname = filenames[grep(cellline, filenames)][
}
if(is.null(dataX)) {print(paste("no data found for cellline", cellline));quit('no');}
# add normalizedCpG
dataX=cbind(normalizedCpG=normalizedCpG0, dataX)
print("Chromatin feature construction done!")
########## clustering
#x=dataX
#x=cbind(cl=kmeans(x, 4)$cluster, expression=RNAseq0, x)
#x=x[order(x$cl),]
#annotation=data.frame(expression=x[,ncol(x)], row.names=rownames(x))
#ann_colors = list(expression=c("lightgreen", "navy"))
#png(paste(pathname,"figures/fig",paste("clustering", expressionDatatype, experiment,'png', sep="."), sep="/"), width=1200, height=300);
#library(pheatmap)
#pheatmap(t(x[,c(-1, -2)]),cluster_cols=F, cluster_rows=T,border_color=NA, show_colnames=F, annotation=annotation, annotation_colors=ann_colors);
#dev.off();
# ==================================================
# regression model
# ==================================================
TXTfile =paste(pathname, paste(expressionDatatype, experiment, CpG, bM, nfold, "results.txt", sep="."), sep="")
HTMLfile=paste(pathname, paste(expressionDatatype, experiment, CpG, bM, nfold, "results.html", sep="."), sep="")
if(file.exists(TXTfile)) file.remove(TXTfile)
if(file.exists(HTMLfile)) file.remove(HTMLfile)
for(filtermark in c(0:9))
{
# 8: DnaseI + histones;
if(filtermark==0) filter_regexp=""
# 0: all hisone marks | Control
if(filtermark==1) filter_regexp="^H|Control"
# 1: Just promoter marks;
if(filtermark==2) filter_regexp="H3k4me2|H3k4me3|H2az|H3k9ac|H3k27ac"
# 2: Just structural marks;
if(filtermark==3) filter_regexp="H3k36me3|H3k79me2"
#3: Just "Other/Distal" marks;
if(filtermark==4) filter_regexp="H3k4me1|H4k20me1|H3k9me1"
# 8: only repressive mark
if(filtermark==5) filter_regexp="H3k27me3|H3k9me3"
# 4: Promoter+Structural marks;
if(filtermark==6) filter_regexp="H3k4me2|H3k4me3|H2az|H3k9ac|H3k27ac|H3k36me3|H3k79me2"
# 5: Promoter+Repressive marks;
if(filtermark==7) filter_regexp="H3k4me2|H3k4me3|H2az|H3k9ac|H3k27ac|H3k27me3|H3k9me3"
# 6: Promoter+Other marks;
if(filtermark==8) filter_regexp="H3k4me2|H3k4me3|H2az|H3k9ac|H3k27ac|H3k4me1|H4k20me1|H3k9me1"
# 9: all marks but not repressive marks;
if(filtermark==9) filter_regexp="H3k4me2|H3k4me3|H2az|H3k9ac|H3k27ac|H3k4me1|H4k20me1|H3k9me1|H3k36me3|H3k79me2|Control"
index=grep(filter_regexp, colnames(dataX))
if(length(index)<1) next; # to print?
datax = dataX[, index, drop=F]
# for cross-cellline comparison
datax2=datax
if(!is.null(experiment2)){
index2=grep(filter_regexp, colnames(dataX2))
if(length(index2)<1) next;
datax2 = dataX2[, index2, drop=F]
}
for(model in c('lm', 'mars', 'randomForest'))
{
print(paste("filtermark",filtermark, "; regressionModel:", model, "; nrow(datax):", nrow(datax), sep=""))
DATAfile=paste(pathname, "data", paste(expressionDatatype, experiment, CpG, bM, model, filtermark, nfold, "tab", sep="."), sep="/")
if(file.exists(DATAfile)) file.remove(DATAfile)
# ==================================================
# classification & regression model
# ==================================================
if(is.null(experiment2)) out = do_classification_regression(Xs=datax, Y=RNAseq0, model_C='randomForest', model_R=model, nfold=nfold, shuffleY=F, shuffleX=F, swapX=F, zero=LOW_SIGNAL_CUTOFF) else
out = do_classification_regression_crossCellline(Xs=datax2, Y=RNAseq02, Xb=datax, Yb=RNAseq0, model_C='randomForest', model_R=model, nfold=nfold, zero=LOW_SIGNAL_CUTOFF) # use model trained on Xb/Yb to predict for dataset Xs/Y
# ==================================================
# measurement for overall prediction, classification and regression
# ==================================================
measured = out[[1]]; predicted = out[[2]]
topX_C = out[[3]]; topX_R = out[[4]]
predicted_C = out[[5]]; predicted_R = out[[6]];
relImp_C_metrics=out[[7]]; relImp_R_metrics=out[[8]]
measured_R_NZ = out[[9]]; predicted_R_NZ = out[[10]]
R = cor(measured, predicted)
pvalue = cor.test(measured, predicted, alternative = "greater")$p.value
RMSE=rmse(measured, predicted)
#NRMSE = RMSE / (max(measured) - min(measured)) #normalized RMSE
NRMSE = 1 - (RMSE^2)/mean((measured-mean(measured))^2)
individualR = cor(datax, rankit(RNAseq0))[,1]
cols=rep("#0000ff22", length(predicted))
#cols[predicted==log2(0.03)]="#00000011"
# mark all0 cases as red
#cols[match(intersect(rownames(datax), names(all0)[all0]), rownames(datax))]="#ff000022"
info=paste("Pearson's r =", format.value(R), "(p-value:", format.value(pvalue) , ")\nRMSE =", format.value(RMSE), "(R^2=", format.value(NRMSE), ")")
# measurement for classification and regression
library(caTools, verbose = FALSE)
diagnostics.c.url=paste("diagnostics.classifier", expressionDatatype, experiment, CpG, bM, model, filtermark, nfold, "png", sep=".")
png(paste(pathname,"figures/fig",diagnostics.c.url, sep="/"), width=600, height=600)
AUC=colAUC(measured, ifelse(predicted_C>=0.5, 1, 0), plotROC=T)[1] # AUC for the classification
dev.off()
diagnostics.c.url=paste(", C", sep="")
# standard regr ession diagnostics (4-up) -- ONLY FOR REGRESSION
rR=cor(measured_R_NZ, predicted_R_NZ); #pR=format.value(cor.test(measured, out[[6]], alternative = "greater")$p.value)
rmseR=rmse(measured_R_NZ, predicted_R_NZ)
nrmseR= 1 - (rmseR^2)/mean((measured_R_NZ-mean(measured_R_NZ))^2)
diagnostics.r.url=paste("diagnostics.regressor", expressionDatatype, experiment, CpG, bM, model, filtermark, nfold, "png", sep=".");
png(paste(pathname,"figures/fig",diagnostics.r.url, sep="/"), width=800, height=800)
oldpar <- par(mfrow = c(2, 2))
plot(lm(measured_R_NZ~predicted_R_NZ))
par(oldpar)
dev.off()
diagnostics.r.url=paste(", R", sep="")
otherinfo =paste("Classification: AUC=", round(AUC, 2), "\nRegression: r=", round(rR, 2),"(RMSE =", round(rmseR, 2), "; R^2 =",round(nrmseR, 2),")")
# ==================================================
# randomization analysis
# ==================================================
randomization_info=""
if(is.null(experiment2))
{
# Randomization1: shuffled Y (not Xs)
out = do_classification_regression(Xs=datax, Y=RNAseq0, model_C='randomForest', model_R=model, nfold=1, shuffleY=T, shuffleX=F, swapX=F, zero=LOW_SIGNAL_CUTOFF)
R_randomization=round(cor(out[[1]], out[[2]]), 2); RMSE_randomization=round(rmse(out[[1]], out[[2]]), 2); types_randomization="Shuffle Y"
# Randomization2: shuffled Xs independaly (not Y)
out = do_classification_regression(Xs=datax, Y=RNAseq0, model_C='randomForest', model_R=model, nfold=1, shuffleY=F, shuffleX=T, swapX=F, zero=LOW_SIGNAL_CUTOFF)
R_randomization=c(R_randomization, round(cor(out[[1]], out[[2]]), 2)); RMSE_randomization=c(RMSE_randomization, round(rmse(out[[1]], out[[2]]), 2)); types_randomization=c(types_randomization, "Shuffle Xs")
# Randomization3: swap X lables
out = do_classification_regression(Xs=datax, Y=RNAseq0, model_C='randomForest', model_R=model, nfold=1, shuffleY=F, shuffleX=F, swapX=T, zero=LOW_SIGNAL_CUTOFF)
R_randomization=c(R_randomization, round(cor(out[[1]], out[[2]]), 2)); RMSE_randomization=c(RMSE_randomization, round(rmse(out[[1]], out[[2]]), 2)); types_randomization=c(types_randomization, "Swap Xs lables")
randomization_info=paste("Pearson's r (RMSE) of randomization tests","----------------------------------",paste(types_randomization,": ",R_randomization," (",RMSE_randomization,")", sep="",collapse="\n"), sep="\n")
}
# ==================================================
# draw figures: scatterplot, variable importance
# ==================================================
# PNG version of transparent version
main = paste(paste(label.tr(experiment), " (",CpG,")", sep=""), paste(expressionDatatype, bM, model, paste(nfold, "fold-CV", sep=""), sep="."), sep="\n")
if(!is.null(experiment2)) main=paste(paste(cellline,"-->",cellline2,"(", experiment,")", sep=""), paste(expressionDatatype, CpG, bM, model, paste(nfold, "fold-CV", sep=""), sep="."), sep="\n")
if(!is.null(experiment2)) filename = paste("crossCellline", expressionDatatype, experiment, "-", experiment2, CpG, bM, model, filtermark, nfold, sep=".") else
filename = paste(expressionDatatype, label.tr(experiment), CpG, bM, model, filtermark, nfold, sep=".")
draw.figure(mear=measured, pred=predicted, topX_C=topX_C, topX_R=topX_R, relImp_C_metrics, relImp_R_metrics,
dotColor=cols, info=info, otherinfo=otherinfo, randomization_info=randomization_info,
main=main, path=paste(pathname,"figures/png", sep="/"),filename=filename,filetype="png",
smoothScatter=F)
# PDF version
#source("mylib.R")
draw.figure(mear=measured, pred=predicted, topX_C=topX_C, topX_R=topX_R, relImp_C_metrics, relImp_R_metrics,
dotColor=cols, info=info, otherinfo=otherinfo, randomization_info=randomization_info,
main=main, path=paste(pathname,"figures/pdf", sep="/"),filename=filename,filetype="pdf",
smoothScatter=F)
# ==================================================
# binned by expression values (only for lm model)
# ==================================================
binimgurl="";
if(model=='lm' & filtermark==0) {
relimp.bin.by.expression(Xs=datax, Y=RNAseq0, model='lm', nfold=1, N=10, CpGset=RNAseq2,
main=paste(paste(label.tr(experiment), " (",CpG,")", sep=""), paste(expressionDatatype, bM, model, paste(nfold, "fold-CV", sep=""), sep="."), sep="\n"),
filename=paste(pathname,"figures/fig",paste("relimp_by_expressionBins", expressionDatatype, experiment, CpG, bM, model, filtermark, nfold, 'low2high', "png", sep="."), sep="/"), decreasing=F)
binimgurl=paste("
S1", sep="")
relimp.bin.by.expression(Xs=datax, Y=RNAseq0, model='lm', nfold=1, N=10, CpGset=RNAseq2,
main=paste(paste(label.tr(experiment), " (",CpG,")", sep=""), paste(expressionDatatype, bM, model, paste(nfold, "fold-CV", sep=""), sep="."), sep="\n"),
filename=paste(pathname,"figures/fig",paste("relimp_by_expressionBins", expressionDatatype, experiment, CpG, bM, model, filtermark, nfold, 'high2low', "png", sep="."), sep="/"), decreasing=T)
binimgurl=paste(binimgurl, ", S2", sep="")
relimp.bin.by.expression(Xs=datax, Y=RNAseq0, model='lm', nfold=1, N=10, CpGset=RNAseq2,
main=paste(paste(label.tr(experiment), " (",CpG,")", sep=""), paste(expressionDatatype, bM, model, paste(nfold, "fold-CV", sep=""), sep="."), sep="\n"),
filename=paste(pathname,"figures/fig",paste("relimp_by_expressionBins", expressionDatatype, experiment, CpG, bM, model, filtermark, nfold, 'bined', "pdf", sep="."), sep="/"), accumulative=F)
binimgurl=paste(binimgurl, ", S3", sep="")
relimp.randomSampling(Xs=datax, Y=RNAseq0, model='lm', nfold=1, N=10, CpGset=RNAseq2,
main=paste(paste(label.tr(experiment), " (",CpG,")", sep=""), paste(expressionDatatype, bM, model, paste(nfold, "fold-CV", sep=""), sep="."), sep="\n"),
filename=paste(pathname,"figures/fig",paste("relimp_randomSampling", expressionDatatype, experiment, CpG, bM, model, filtermark, nfold, "png", sep="."), sep="/"))
binimgurl=paste(binimgurl, ", R", sep="")
}
# ==================================================
## save result into HTML/TXT
# ==================================================
topX_R=topX_R[order(-topX_R)] # sort in increasing order
topX_C=topX_C[names(topX_R)] # sort in order of topX_R
individualR=individualR[names(topX_R)] # sort in order of topX_R
res=c(expressionDatatype, experiment, tech, extract, cellline, compartment, CpG, bM, model, filtermark,
format.value(R, 'table'), format.value(pvalue, 'table'), format.value(RMSE, 'table'), format.value(NRMSE, 'table'),
format.value(AUC, 'table'), format.value(rR, 'table'), format.value(rmseR, 'table'),
paste(names(topX_R),sep="",collapse=";"),
paste(round(topX_C, 3),sep="",collapse=";"),
paste(round(topX_R, 3),sep="",collapse=";"),
paste(round(individualR, 2),sep="",collapse=";"))
write.table(t(res), file=TXTfile, append =T, quote=F, sep="\t", row.names=F, col.names=F)
# write data.frame to text
txturl=""
if(nfold>1){
dataset=data.frame(RNAseq2[match(rownames(datax), rownames(RNAseq2)),c('gene_id','gene_type','gene_name')], datax, RNAseq0, measured=measured, predicted=predicted, predicted_C=predicted_C, predicted_R=predicted_R) # be aware of the same gene order between columns
colnames(dataset) = c('gene_id','gene_type','gene_name', colnames(datax), experiment, "measured_log2", "predicted_log2", "predicted_C", "predicted_R");
write.table(format(dataset, digits = 4, nsmall=3, scientific=F, trim=T, drop0trailing=T), file=DATAfile, quote=F, sep="\t", row.names=F, col.names=T)
txturl = paste(", dataset", sep="")
}
# write to HTML
imgurl = paste("figures/png", paste(expressionDatatype, label.tr(experiment), CpG, bM, model, filtermark, nfold, "png", sep="."), sep="/")
pdfurl = paste("figures/pdf", paste(expressionDatatype, label.tr(experiment), CpG, bM, model, filtermark, nfold, "pdf", sep="."), sep="/")
imghtml="";
individualRhtml=paste(names(individualR),sep="",collapse="
")
histurl=""
if(filtermark==0) {
imghtml = paste("
Click for larger image
", sep="")
individualRhtml=paste(paste("", names(individualR),"", sep=""), sep="",collapse="
")
histurl = paste(", H", sep="")
}
cat("