heatmap3 <- function (x, Rowv = NULL, Colv = if (symm) "Rowv" else NULL, distfun = dist, hclustfun = hclust, method="complete", k.cut=1, add.expr, symm = FALSE, revC = identical(Colv, "Rowv"), scale = c("row", "column", "none"), na.rm = TRUE, margins = c(5, 5), ColSideColors, RowSideColors, cexRow = 0.2 + 1/log10(nr), cexCol = 0.2 + 1/log10(nc), labRow = NULL, labCol = NULL, main = NULL, xlab = NULL, ylab = NULL, verbose = getOption("verbose"), ...) { ################### #Modified by H. Cho x.naive <- x x <- t(apply(x.naive, 1, standardize)) scale <- if (symm && missing(scale)) "none" else match.arg(scale) if (length(di <- dim(x)) != 2 || !is.numeric(x)) stop("`x' must be a numeric matrix") nr <- di[1] nc <- di[2] if (nr <= 1 || nc <= 1) stop("`x' must have at least 2 rows and 2 columns") if (!is.numeric(margins) || length(margins) != 2) stop("`margins' must be a numeric vector of length 2") doRdend <- !identical(Rowv, NA) doCdend <- !identical(Colv, NA) if (is.null(Rowv)) Rowv <- rowMeans(x, na.rm = na.rm) if (is.null(Colv)) Colv <- colMeans(x, na.rm = na.rm) if (doRdend) { if (inherits(Rowv, "dendrogram")) ddr <- Rowv else { #modified by H.Cho Oct 27, 2004 #hcr <- hclustfun(distfun(x)) hcr <- hclustfun(distfun(x), method=method) #Cut a tree by H. Cho hcr.cut <- cutree(hcr, k=k.cut) ddr <- as.dendrogram(hcr) if (!is.logical(Rowv) || Rowv) ddr <- reorder(ddr, Rowv) } if (nr != length(rowInd <- order.dendrogram(ddr))) stop("row dendrogram ordering gave index of wrong length") } else rowInd <- 1:nr if (doCdend) { if (inherits(Colv, "dendrogram")) ddc <- Colv else if (identical(Colv, "Rowv")) { if (nr != nc) stop("Colv = \"Rowv\" but nrow(x) != ncol(x)") ddc <- ddr } else { #modified by H.Cho Oct 27, 2004 #hcc <- hclustfun(distfun(if (symm) x else t(x))) hcc <- hclustfun(distfun(if (symm) x else t(x)), method=method) ddc <- as.dendrogram(hcc) if (!is.logical(Colv) || Colv) ddc <- reorder(ddc, Colv) } if (nc != length(colInd <- order.dendrogram(ddc))) stop("column dendrogram ordering gave index of wrong length") } else colInd <- 1:nc x <- x[rowInd, colInd] if (is.null(labRow)) labRow <- if (is.null(rownames(x))) (1:nr)[rowInd] else rownames(x) if (is.null(labCol)) labCol <- if (is.null(colnames(x))) (1:nc)[colInd] else colnames(x) if (scale == "row") { x <- sweep(x, 1, rowMeans(x, na.rm = na.rm)) sx <- apply(x, 1, sd, na.rm = na.rm) x <- sweep(x, 1, sx, "/") } else if (scale == "column") { x <- sweep(x, 2, colMeans(x, na.rm = na.rm)) sx <- apply(x, 2, sd, na.rm = na.rm) x <- sweep(x, 2, sx, "/") } lmat <- rbind(c(NA, 3), 2:1) lwid <- c(if (doRdend) 1 else 0.05, 4) lhei <- c((if (doCdend) 1 else 0.05) + if (!is.null(main)) 0.2 else 0, 4) if (!missing(ColSideColors)) { if (!is.character(ColSideColors) || length(ColSideColors) != nc) stop("'ColSideColors' must be a character vector of length ncol(x)") lmat <- rbind(lmat[1, ] + 1, c(NA, 1), lmat[2, ] + 1) lhei <- c(lhei[1], 0.2, lhei[2]) } if (!missing(RowSideColors)) { if (!is.character(RowSideColors) || length(RowSideColors) != nr) stop("'RowSideColors' must be a character vector of length nrow(x)") lmat <- cbind(lmat[, 1] + 1, c(rep(NA, nrow(lmat) - 1), 1), lmat[, 2] + 1) lwid <- c(lwid[1], 0.2, lwid[2]) } lmat[is.na(lmat)] <- 0 if (verbose) { cat("layout: widths = ", lwid, ", heights = ", lhei, "; lmat=\n") print(lmat) } op <- par(no.readonly = TRUE) on.exit(par(op)) layout(lmat, widths = lwid, heights = lhei, respect = TRUE) if (!missing(RowSideColors)) { par(mar = c(margins[1], 0, 0, 0.5)) image(rbind(1:nr), col = RowSideColors[rowInd], axes = FALSE) } if (!missing(ColSideColors)) { par(mar = c(0.5, 0, 0, margins[2])) image(cbind(1:nc), col = ColSideColors[colInd], axes = FALSE) } par(mar = c(margins[1], 0, 0, margins[2])) if (!symm || scale != "none") x <- t(x) if (revC) { iy <- nr:1 ddr <- rev(ddr) x <- x[, iy] } else iy <- 1:nr ################################# #Modified by H. Cho x.naive <- x.naive[rowInd, colInd] image(1:nc, 1:nr, t(x.naive), xlim = 0.5 + c(0, nc), ylim = 0.5 + c(0, nr), axes = FALSE, xlab = "", ylab = "", ...) axis(1, 1:nc, labels = labCol, las = 2, line = -0.5, tick = 0, cex.axis = cexCol) if (!is.null(xlab)) mtext(xlab, side = 1, line = margins[1] - 1.25) axis(4, iy, labels = labRow, las = 2, line = -0.5, tick = 0, cex.axis = cexRow) if (!is.null(ylab)) mtext(ylab, side = 4, line = margins[2] - 1.25) if (!missing(add.expr)) eval(substitute(add.expr)) par(mar = c(margins[1], 0, 0, 0)) if (doRdend) plot(ddr, horiz = TRUE, axes = FALSE, yaxs = "i", leaflab = "none") else frame() par(mar = c(0, 0, if (!is.null(main)) 1 else 0, margins[2])) if (doCdend) plot(ddc, axes = FALSE, xaxs = "i", leaflab = "none") else if (!is.null(main)) frame() if (!is.null(main)) title(main, cex.main = 1.5 * op[["cex.main"]]) invisible(list(rowInd = rowInd, colInd = colInd, hcr=hcr, ddr=ddr, hcr.cut = hcr.cut)) } ### for gree-black-red color heatmap odd <- function(x) x!=as.integer(x/2)*2 even <- function(x) x==as.integer(x/2)*2 # Generate a set of n colors which smoothly transition # from 'low' to 'mid' to 'high'. colorpanel <- function(n,low='green',mid='black',high='red') { if(even(n)) warning("n is even: colors panel will not be symmetric") # convert to rgb low <- col2rgb(low) mid <- col2rgb(mid) high <- col2rgb(high) # determine length of each component lower <- floor(n/2) upper <- n - lower red <- c( seq(low[1,1], mid [1,1], length=lower), seq(mid[1,1], high[1,1], length=upper) )/255 green <- c( seq(low[3,1], mid [3,1], length=lower), seq(mid[3,1], high[3,1], length=upper) )/255 blue <- c( seq(low[2,1], mid [2,1], length=lower), seq(mid[2,1], high[2,1], length=upper) )/255 rgb(red,blue,green) } # Generate green-black-red colorscale (green= low, red=high) greenred <- function(n) colorpanel(n, 'green', 'black', 'red' ) # (9/8/05 Chang et al. (2003) breast cancer docetaxel study) preprocess.na = function (x, data.type = "MAS5", threshold = 1, LOWESS = FALSE) { x <- as.matrix(na.pass(x)) if (data.type == "MAS4" || data.type == "MAS5") { x <- quartile.normalize(x, percent = 50) } if (data.type == "MAS4" || data.type == "MAS5" || data.type == "dChip") { if (length(x[x < threshold]) != 0) { x[x < threshold] <- threshold } } x <- logb(x, 2) if (LOWESS) { y <- matrix(NA, nrow = nrow(x), ncol = ncol(x)) y[, 1] <- x[, 1] for (i in 2:ncol(x)) { y[, i] <- lowess.normalize(x[, 1], x[, i]) } x <- y } return(x) } # suggestive code to read many files with similar names all = matrix(0, 504, 10000) for (i in 1:254) { file.name = paste("C:/Documents and Settings/HES795/","ALL_",i,".txt",sep="") tmp = read.table(file=file.name, header=F, skip=57, sep="\t") all[,c(2*i-1, 2*i)] = tmp[,c(1,8)] } # check whether all genes are matched for (i in 2:254) {print(c(i, sum(all[,1] == all[,2*i-1]) env.vec <- c(hgu95av2SYMBOL, hgu95av2GENENAME, hgu95av2UNIGENE, hgu95av2LOCUSID, hgu95av2CHRLOC, hgu95av2GO) #subset_description = docetaxel sensitive tumor #subset_sample_id = GSM4903,GSM4907,GSM4908,GSM4914,GSM4915,GSM4917,GSM4919,GSM4920,GSM4921,GSM4923 #subset_description = docetaxel resistant tumor #subset_sample_id = GSM4901,GSM4902,GSM4904,GSM4905,GSM4906,GSM4909,GSM4910,GSM4911,GSM4912,GSM4913,GSM4916,GSM4918,GSM4922,GSM4924 # read soft data from GEO # chang.dat = read.table(file="C:/Documents and Settings/Jae K. Lee/My Documents/tillinghast/Chang/GDS360.soft", header=F, skip=57, sep="\t") chang.dat[1:2,] mode(chang.dat) class(chang.dat) attributes(chang.dat) chang = as.matrix(chang.dat[-1, -c(1:2)]) chang[1:2,] mode(chang) class(chang) attributes(chang)[[2]][2] chang = matrix(as.numeric(chang), ncol=24) #colnames(chang) = c(as.matrix(chang.dat[1,3:26])) colnames(chang) = c(paste("N",1:14, sep=""), paste("R",1:10, sep="")) rownames(chang) = chang.dat[-1, 1] chang[1:2,] mode(chang) class(chang) attributes(chang)[[2]][2] # final matrix-form data: chang # MS-LC proteomics data for before & after anticancer compound treatment #ms.dat = read.table(file="C:/Documents and Settings/Jae K. Lee/My Documents/tillinghast/Chang/LCMS.txt", header=F, sep="\t") ms = matrix(as.numeric(as.matrix(ms.dat[-1,4:9])), ncol=6) colnames(ms) = c("r1u", "r2u", "r3u", "r1t", "r2t", "r3t") rownames(ms) = paste("p",1:nrow(ms), sep="") ms = ms[,c(1,4,2,5,3,6)] # final matrix-form data: ms ### Dispersion, correlation, and graphical examination before log transformation # chang microarray data and LC-MS proteomics data apply(chang, 2, summary) boxplot(data.frame(chang)); boxplot(data.frame(log(chang))) pairs(chang[1:1000,], lower.panel=NULL, pch=".") #pairs(chang[,1:7], lower.panel=NULL, pch=".") options(width=210) round(cor(chang), 2) round(apply(ms, 2, summary), 1) boxplot(data.frame(ms)); boxplot(data.frame(log(ms))) pairs(ms, lower.panel=NULL, pch="*") round(cor(ms,use="pairwise.complete.obs"), 2) # example of correlation with outliers or untrasformed data tmp = matrix(runif(20), 10, 2) cor(tmp); plot(tmp, pch="*") tmp[10,] = c(10,10) cor(tmp); plot(tmp, pch="*") ## -> robust statistics and bootstrap ### Inter-quartile-range (IQR) or lowess (local non-parametric) normalization library(LPE) chang = preprocess(chang, LOWESS=F) apply(chang, 2, summary) boxplot(data.frame(chang)) pairs(chang[1:1000,], lower.panel=NULL, pch=".") round(cor(chang), 2) ms = preprocess.na(ms, LOWESS=F) apply(ms, 2, summary) boxplot(data.frame(ms)) pairs(ms, lower.panel=NULL, pch="*") round(cor(ms,use="pairwise.complete.obs"), 2) boxplot(data.frame(quartile.normalize(ms))) ## non-linear relationship #nonlin.dat = read.table(file="C:/Documents and Settings/Jae K. Lee/My Documents/tillinghast/Chang/HYPvSHAM.txt", header=F, sep="\t") nonlin = matrix(as.numeric(as.matrix(nonlin.dat[-1,c(2,5)])), ncol=2) colnames(nonlin) = c("HYP", "SHAM") plot(log2(nonlin), pch="*") abline(b=1, a=0) nonlin = preprocess(nonlin) plot(nonlin, pch="*") abline(b=1, a=0) ### sensitivity (or reproducibility) and specificity # chang data within.cor = c(cor(chang[,1:14])[lower.tri(diag(1,14))], cor(chang[,15:24])[lower.tri(diag(1,10))]) spec = mean(within.cor) between.cor = cor(chang[,1:14], chang[,15:24]) sen = spec- mean(between.cor) # ms data tmp=ms[,c(1,3,5,2,4,6)] within.cor = c(cor(tmp[,1:3], use="pairwise.complete.obs")[lower.tri(diag(1,3))], cor(tmp[,4:6], use="pairwise.complete.obs")[lower.tri(diag(1,3))]) spec = mean(within.cor) between.cor = cor(tmp[,1:3], chang[,4:6], use="pairwise.complete.obs") sen = spec - mean(between.cor) # --> Need of paired statistical inference ## specificity and sensitivity in t.test and wilcox.test simul.dat = matrix(rnorm(20000),ncol=10); simul.dat[1001:2000,6:10] = matrix(rnorm(5000, mean=1), ncol=5) t.lst = wilcox.lst = rep(0, 2000) for (i in 1:2000) { t.lst[i] = t.test(simul.dat[i,6:10], simul.dat[i,1:5])$statistic } for (i in 1:2000) { wilcox.lst[i] = wilcox.test(simul.dat[i,6:10], simul.dat[i,1:5])$statistic } summary(t.lst) Min. 1st Qu. Median Mean 3rd Qu. Max. -3.8460 -0.1013 0.8034 0.8584 1.7270 7.9260 summary(wilcox.lst) Min. 1st Qu. Median Mean 3rd Qu. Max. 0.00 12.00 16.00 15.75 20.00 25.00 # ROC curve calculation by varying the cutoff values of t.lst and wilcox.lst (FPR and FNR) quan.t = quantile(t.lst, probs=1:100*.01) quan.wilcox = quantile(wilcox.lst, probs=1:100*.01) t.count = wilcox.count = matrix(0, 100,2) for (i in 1:100) { t.count[i,1] = sum(t.lst[1:1000] > quan.t[i]); t.count[i,2] = sum(t.lst[1001:2000] > quan.t[i]) wilcox.count[i,1] = sum(wilcox.lst[1:1000] > quan.wilcox[i]); wilcox.count[i,2] = sum(wilcox.lst[1001:2000] > quan.wilcox[i]) } plot(t.count[100:1,]/1000, type="l", lty=1, col=1, xlab="specificity (FPR)", ylab="sensitivity (1-FNR)") lines(wilcox.count[100:1,]/1000, lty=2, col=2) title(main="ROC Curves of t-test and Wilcoxon test") legend(0.6, 0.7, c("t.test", "Wilcoxon"), lty=c(1,2), lwd=3, cex=1.2, col=c(1:2)) # two-sample t-test and q-value options(object.size=1e20) # two-sample t-test and q-value x=chang[,1:14]; y=chang[,15:24] # Student's two-sample t-test tl = c() for (i in 1:nrow(x)) { tl = c(tl, t.test(x[i,], y[i,])$p.value) } x.mean = apply(x, 1, mean, na.rm=T); y.mean = apply(y, 1, mean, na.rm=T) x.var = apply(x, 1, var, na.rm=T); y.var = apply(y, 1, var, na.rm=T) t.stat = (x.mean-y.mean)/sqrt(x.var/ncol(x)+y.var/ncol(y)) t.pval = pnorm(t.stat) t.pval = apply( cbind(t.pval*2, 2*(1-t.pval)), 1, min) library(qvalue) library(help=qvalue) ?qvalue vignette("qvalue") q.val = qvalue(t.pval)$qvalues ttt = cbind(x.mean, x.var, y.mean, y.var, t.stat, t.pval, q.val) for (i in 1:30) {print(c(i,sum(q.val < 0.001*i)))} gene.ttest = (1:nrow(chang))[q.val < 0.001] # length(gene.ttest) [1] 144 # LPE test library(LPE) library(help=LPE) ?lpe ?fdr.adjust ?mt.rawp2adjp vignette() vignette("LPE") var.res = baseOlig.error(x) var.nonres = baseOlig.error(y) lpe.out = lpe(x,y, var.res, var.nonres, probe.set.name=rownames(blad40)) #lpe.fdr = fdr.adjust(lpe.out, adjp="resamp", iteration=2) round(lpe.out[rev(order(abs(lpe.out$z.stats)))[1:20],c(1:17,20:36)],2) gene.subset1 = (1:22215)[abs(lpe.out$z.stat) > 2] ### Unsupervised clustering analysis normalize = function(x) { (x-mean(x,na.rm=T))/sqrt(var(x,na.rm=T)) } chang.n = apply(chang, 1, normalize); chang.n = t(chang.n) chang.n = chang.n[gene.ttest,] options(object.size=1e20) # hierarchical clustering heatmap(chang.n) source("C://Documents and Settings//Jae K. Lee//My Documents//acs grant//hynes//heatmap3.R") source("C://Documents and Settings//Jae K. Lee//My Documents//acs grant//hynes//color.R") heatmap3(chang.n, method="average", col = greenred(255), scale="none", margin=c(5,10), cexCol=.35, cexRow=.45, main="") # kmeans clustering cl <- kmeans(chang.n, 2) plot(chang.n, col = cl$cluster) points(cl$centers, col = 1:2, pch = 8, cex=2) ## random starts do help here with too many clusters cl <- kmeans(chang.n, 3, nstart = 25) plot(chang.n, col = cl$cluster) points(cl$centers, col = 1:5, pch = 8) library(cclust) library(help=cclust) ## a 3-dimensional example x<-rbind(matrix(rnorm(150,sd=0.3),ncol=3), matrix(rnorm(150,mean=1,sd=0.3),ncol=3), matrix(rnorm(150,mean=2,sd=0.3),ncol=3)) cl<-cclust(x,6,20,verbose=TRUE,method="kmeans") plot(x, col=cl$cluster) ## assign classes to some new data y<-rbind(matrix(rnorm(33,sd=0.3),ncol=3), matrix(rnorm(33,mean=1,sd=0.3),ncol=3), matrix(rnorm(3,mean=2,sd=0.3),ncol=3)) ycl<-predict(cl, y) par(mfrow=c(1,2)) plot(x, col=cl$cluster) plot(y, col=ycl$cluster) # self-organizing map library(som) library(help=som) ?som data(yeast) yeast <- yeast[, -c(1, 11)] yeast.f <- filtering(yeast) yeast.f.n <- normalize(yeast.f) som.yeast <- som(yeast.f.n, xdim=5, ydim=6) #som.yeast <- som(yeast.f.n, xdim=5, ydim=6, topol="hexa", neigh="gaussian") plot(som.yeast) cl <- som(chang.n, xdim=3, ydim=4) plot(cl) ### Supervised learning: classification # support vector machine library(e1071) library(help=e1071) data(iris) attach(iris) # visualize (classes by color, SV by crosses): plot(cmdscale(dist(iris[,-5])), col = as.integer(iris[,5]), pch = c("o","+")[1:150 %in% model$index + 1]) ## classification mode # default with factor response: model <- svm(Species ~ ., data = iris) # alternatively the traditional interface: x <- subset(iris, select = -Species) y <- Species model <- svm(x, y) print(model) summary(model) # test with train data pred <- predict(model, x) # (same as:) pred <- fitted(model) # Check accuracy: table(pred, y) ## test with independent set iris.s = iris[c(1:30,51:80,101:130),] iris.i = iris[-c(1:30,51:80,101:130),] Species.s = Species[c(1:30,51:80,101:130)] model <- svm(Species.s ~ ., data = iris.s) # Check accuracy in training pred <- fitted(model) y <- Species.s table(pred, y) # test with independent data x.i = subset(iris.i, select = -Species) pred <- predict(model, x.i) #### NOT working!! # MiPP: misclassification penalized posterior library(MiPP) library(help=MiPP) gene.ttest = (1:nrow(chang))[q.val < 0.0001] # length(gene.ttest) [1] 22 chang.s = chang[gene.ttest,] chang.s = mipp.preproc(chang.s) chang.factor = factor(c(rep("non",14), rep("res",10)), levels = c("non","res")) out.LDAtFDR0001 = mipp(x=chang.s, y=chang.factor, n.split=20, percent.cut=1, n.split.eval=100, rule="lda") out.SVMrbf.tFDR0001 = mipp(x=chang.s, y=chang.factor, n.split=20, percent.cut=1, n.split.eval=100, rule="svmrbf") out.seqLDAtFDR0001 = mipp.seq(x=chang.s, y=chang.factor, n.split=20, percent.cut=1, n.split.eval=100, rule="lda") # validation best1 = c(6,3,2) lda.best1 = lda(t(chang.s[best1,c(1:7,15:19)]), grouping=chang.factor[c(1:7,15:19)]) lda.best1.out.t = predict(lda.best1, t(chang.s[best1,c(1:7,15:19)]) ) round(lda.best1.out.t$posterior, 2) lda.best1.out.i = predict(lda.best1, t(chang.s[best1,-c(1:7,15:19)]) ) round(lda.best1.out.i$posterior, 2)