###------------------------------------------### ### Script file to be used to carry out SCVD ### ###------------------------------------------### # Attach the MASS library to use the QDA function library(MASS) # Create a function that sorts a matrix (sort.col in Splus) # Code was posted in the R archives by Renaud Lancelot SortMat <- function(Mat, Sort) { m <- do.call("order", as.data.frame(Mat[, Sort])) Mat[m, ] } ###-------------------### ### The SCVD function ### ###-------------------### x <- NULL init.qda <- function(x){ x <- cbind(x, gene) if(mode(x) == "numeric") x <- as.matrix(x) lev.y <- levels(as.factor(grp)) qd <- try( qda(cbind(x), grp, cv=T) ) if(class(qd) != "Error"){ pred.qda <- predict(qd) c.rate <- as.integer(sum(grp==pred.qda$class)) # Calculation of the cumulative classification posterior probabilities post <- data.frame(pred.qda$post) names(post) <- lev.y prob <- NULL for(i in 1:length(lev.y)){ prob[[i]] <- list(post[grp==names(post[i]), i]) } out <- list(c(c.rate, prod(unlist(prob)))) } else{ out <- list(c("NA", "NA")) } } SCVD <- function(XX){ tab <- apply(XX, 2, FUN=function(x)init.qda(x)) result <- data.frame(t(matrix(unlist(tab), nrow=2))) names(result) <- c("Correct", "Posterior") result } ######################################################################## # At this stage, it is assumed that the data set has been normalized, # rows correspond to observations and the columns correspond to genes # (i.e. the original matrix is trasposed). # Refer to Preparing the Data to put the data into the same form # as used in the manuscript. # The training data set is referred to as train and # the independent data set is defined as indep. ### Start the search for gene models ### ## One-gene Model ## grp <- c(rep(0,27), rep(1,11)) gene <- NULL run1 <- SCVD(train) df.sort1 <- SortMat(Mat=run1, Sort=c(1,2)) # V4847, had a CR = 37. Next best was 36. geneI <- as.numeric(row.names(df.sort1))[7129] ## Construct a Two-gene Model ## grp <- c(rep(0,27), rep(1,11)) gene <- train[,geneI] run2 <- SCVD(train[,-geneI]) df.sort2 <- SortMat(Mat=run2, Sort=c(1,2)) geneII <- as.numeric(row.names(df.sort2))[7128] geneII <- ifelse(geneII < geneI, geneII, geneII+1) ###-----------------------------------------------------------### ### Now, check the performance on the independent data set ### ###-----------------------------------------------------------### grpf <- factor(grp, levels=0:1, labels = c("ALL", "AML")) igrp <- c(rep(0,20), rep(1,14)) igrpf <- factor(igrp, levels=c(0,1), labels=c("ALL", "AML")) G21 <- as.matrix(train[,c(geneI, geneII)]) f21 <- qda(G21, grpf, cv=T) pred21 <- predict(f21) CR21 <- sum(pred21$class[1:27]=="ALL") + sum(pred21$class[28:38]=="AML") CCPP21 <- (prod(pred21$posterior[1:27,1]) * prod(pred21$posterior[28:38,2])) print(round(c(CR21, CCPP21),4)) # Check its performance on the independent set iG21 <- as.matrix(indep[,c(geneI, geneII)]) ip21 <- predict(f21, iG21) CR21i <- sum(ip21$class[1:20] == "ALL") + sum(ip21$class[21:34] == "AML") CCPP21i <- (prod(ip21$posterior[1:20])*prod(ip21$posterior[21:34])) print(round(c(CR21i, CCPP21i),4)) # Output of the results out.df <- data.frame(igrpf, ip21$class, ip21$post) names(out.df) <- c("Actual", "Predicted", "P(ALL)", "P(AML)") write.table(out.df, file="res2x.csv", sep=",", row.names=F)