###-------------------------------------------------------###
### The code below will help you import the data set into ###
### R so further analysis can be done.                    ###
###-------------------------------------------------------###

# First download the two data sets from
#   http://www-genome.wi.mit.edu/cgi-bin/cancer/datasets.cgi
# Delete all columns (and rows) without any expression values.
# This should result in matrices consisting of only the expression
# values. Save the data sets as comma separated files (CSV) and name
# the train and independent data sets as train and indep, respectively,
# on the C drive in a folder named temp.

tXs <- read.table(file = "C:/temp/train.csv", sep = ",", header = FALSE)
iXs <- read.table(file = "C:/temp/indep.csv", sep = ",", header = FALSE)

# Arrange the data so that genes are the columns, not the rows.
# The sizes quoted below are the ones expected for the data sets above.
tXs <- as.matrix(t(tXs))  # Matrix is now 38 x 7129
iXs <- as.matrix(t(iXs))  # Matrix is now 34 x 7129
full <- rbind(tXs, iXs)   # Matrix is 72 x 7129 (train rows first)

# Threshold values at 30: every value below 30 is raised to 30.
full[full < 30] <- 30

# Normalize the data.
# Rank the genes within each sample (each row of full). apply() stores the
# result for each sample as a column, so Xs.rank is genes x samples.
# rank() is used rather than order(): order() returns the sorting
# permutation, whose rows are indexed by sorted position and not by gene,
# so its row variances would not describe how stable a gene's rank is.
Xs.rank <- apply(full, 1, rank)
Xs.rank.var <- apply(Xs.rank, 1, var)  # per-gene variance of the ranks
Invariant.gene <- seq_len(ncol(full))[Xs.rank.var < median(Xs.rank.var)]
# Above keeps only the genes whose rank variance is below the median
# (about 50% of the genes) as the invariant set for baseline normalization.

logX <- log(full)

# Baseline for each sample: mean log expression over the invariant genes.
# drop = FALSE keeps a matrix even if only one gene is selected.
xbar.vec <- rowMeans(logX[, Invariant.gene, drop = FALSE])

# Standardize a numeric vector to mean 0 and standard deviation 1.
normalize <- function(x) {
  (x - mean(x)) / sd(x)
}

# logX is samples x genes and xbar.vec has one entry per sample, so
# column-major recycling subtracts each sample's baseline from its own row.
logXmat <- logX - xbar.vec
# Standardize every gene (column) across all samples.
logXmat <- apply(logXmat, 2, normalize)

# Separate into the training and independent data sets. Row counts are
# taken from the imported matrices instead of being hard-coded.
train <- logXmat[seq_len(nrow(tXs)), , drop = FALSE]
indep <- logXmat[nrow(tXs) + seq_len(nrow(iXs)), , drop = FALSE]