cvscores               package:hddplot               R Documentation

_F_o_r _h_i_g_h-_d_i_m_e_n_s_i_o_n_a_l _d_a_t_a _w_i_t_h _k_n_o_w_n _g_r_o_u_p_s, _d_e_r_i_v_e _s_c_o_r_e_s _f_o_r _p_l_o_t_t_i_n_g

_D_e_s_c_r_i_p_t_i_o_n:

     This is designed to used with the output from 'cvdisc'.  Test and
     training scores from successive cross-validation steps determine,
     via a principal components calculation, a low-dimensional global
     space onto which test scores are projected, in order to plot them.

_U_s_a_g_e:

     cvscores(cvlist = BMonly.cv, nfeatures = 3, ndisc = NULL, cl.other = factor("PB:f"), x.other = Golub.PBf, keepcols = NULL, print.progress = TRUE)

_A_r_g_u_m_e_n_t_s:

  cvlist: Output object from 'cvdisc'

nfeatures: Number of features to use

   ndisc: Dimension of space in which scores will be formed, at most
          one less than the number of groups

cl.other: Classifies additional observations that are to be projected
          onto the same low-dimensional space

 x.other: Matrix from which additional observations will be taken

keepcols: Number of sets of principal component scores to use in
          discriminant calculations and consequent evaluation of scores
          that will determine the low-dimensional global space

print.progress: Set to 'TRUE' (default) for printing out, as
          calculations proceed, the number of the current fold

_D_e_t_a_i_l_s:

_V_a_l_u_e:

  scores: Scores that can be plotted

      cl: Factor that was used to classify observations into groups

other.scores: Other scores, if any, for plotting

cl.other: Factor that was used to classify the 'other' data into groups

nfeatures: Number of features used

_N_o_t_e:

     The methodology used here has developed beyond that described in
     Maindonald and Burden (2005)

_A_u_t_h_o_r(_s):

     John Maindonald

_R_e_f_e_r_e_n_c_e_s:

     Maindonald, J.H. and Burden, C.J., 2005. Selection bias in plots
     of microarray or other data that have been sampled from a
     high-dimensional space. In R. May and A.J. Roberts, eds., {\em
     Proceedings of 12th Computational Techniques and Applications
     Conference CTAC-2004}, volume 46, pp. C59-C74. 

     <URL: http://anziamj.austms.org.au/V46/CTAC2004/Main> [March 15,
     2005]

_S_e_e _A_l_s_o:

     See also 'cvdisc', 'scoreplot'

_E_x_a_m_p_l_e_s:

     ## Use first 500 rows (expression values) of Golub, for demonstration.
     data(Golub)
     data(golubInfo)
     attach(golubInfo)
     miniG.BM <- Golub[1:500, BM.PB=="BM"]  # 1st 500 rows only
     cancer.BM <- cancer[BM.PB=="BM"] 
     miniG.cv <- cvdisc(miniG.BM, cl=cancer.BM, nfeatures=1:10,
                         nfold=c(3,1))
     miniG.scores <- cvscores(cvlist=miniG.cv, nfeatures=4, 
                              cl.other=NULL) 
     detach(golubInfo) 

     ## The function is currently defined as
     function(cvlist=BMonly.cv, nfeatures=3, ndisc=NULL,
                cl.other=factor("PB:f"), x.other=Golub.PBf,
                keepcols=NULL, print.progress=TRUE
                ){
         library(MASS)
         foldids <- cvlist$foldids
         nfold <- c(length(unique(foldids)), dim(foldids)[2])

         ugenes <- unique(as.vector(cvlist$genelist[1:nfeatures, ,]))
         df <- cvlist$xUsed[, ugenes]
         cl <- cvlist$cl
         if(!length(cl)==dim(df)[1])
           stop(paste("length(cl) =", length(cl),"does not equal",
                      "dim(cvlist$df)[1] =", dim(df)[1]))
         levnames <- levels(cl)
         if(is.null(ndisc))ndisc <- length(levnames)-1
         ngp <- length(levnames)
         nobs <- dim(df)[1]
         allscores <- array(0, dim=c(nrow=nobs, ncol=ndisc*nfold[1], nleaf=nfold[2]))
         if(!is.null(cl.other)){
           cl.other <- factor(cl.other)
           if(is.null(dim(x.other)))stop("x.other must have dimension 2")
           if(!length(cl.other)==dim(x.other)[2])
             stop(paste("length(cl.other) =", length(cl.other),"does not equal",
                        "dim(x.other)[2] =", dim(x.other)[2]))
           df.other <- data.frame(t(x.other[ugenes, ,drop=FALSE]))
           colnames(df.other) <- ugenes
         }
         else other.scores <- NULL
         for(k in 1:nfold[2]){
           foldk <- foldids[,k]
           ufold <- sort(unique(foldk))      
           j <- 0
           for(i in ufold){
             j <- j+1
             if(print.progress)cat(paste(if(j>1) ":" else "", i,sep=""))
             testi <- (1:nobs)[foldk==i]
             traini <- (1:nobs)[foldk!=i]
             ntest <- length(testi)
             ntrain <- nobs-ntest
             genes.i <- cvlist$genelist[1:nfeatures, i, k]
             dfi <- as.data.frame(df[-testi, genes.i, drop=FALSE])
             newdfi <- as.data.frame(df[testi, genes.i, drop=FALSE])
             cli <- cl[-testi]
             xy.xda <- lda(cli~., data=dfi)
             allscores[, ((i-1)*ndisc)+(1:ndisc), k] <-
               predict(xy.xda, newdata=df, dimen=ndisc)$x
           }
         }
         cat("\n")
         dim(allscores) <- c(nobs, ndisc*prod(nfold))
         if(is.null(keepcols))keepcols <- min(nfeatures, dim(allscores)[2])
         allscores.pcp <- data.frame(pcp(allscores, varscores=FALSE)$g[, 1:keepcols])
         globals <- predict(lda(cl ~ ., data=allscores.pcp))$x[,1:ndisc]
         fitscores <- array(0, dim=c(nrow=nobs, ncol=ndisc, nleaf=nfold[2]))
         for(k in 1:nfold[2]){
           foldk <- foldids[,k]
           ufold <- sort(unique(foldk))    
     ##      ntimes.genes <- table(cvlist$genelist[1:nfeatures,,k])
           av <- colMeans(df)
           j <- 0
           for(i in ufold){
             j <- j+1
             cat(paste(if (j>1) ":" else "", i,sep=""))
             testi <- (1:nobs)[foldk==i]
             traini <- (1:nobs)[foldk!=i]
             genes.i <- cvlist$genelist[1:nfeatures, i, k]
             dfi <- data.frame(df[-testi, genes.i, drop=FALSE])
             newdfi <- data.frame(df[testi, genes.i, drop=FALSE])
             cli <- cl[-testi]
             traini.xda <- lda(cli~., data=dfi)
             scorei <- predict(traini.xda)$x[,1:ndisc]
             newpred.xda <- predict(traini.xda, newdata=newdfi)
             scorei.out <- newpred.xda$x[, 1:ndisc, drop=FALSE]
             scorei.all <- globals[-testi, 1:ndisc]
             avcol <- colMeans(scorei.all)
             scorei.all <- sweep(scorei.all, 2, avcol,"-")
             avi <- colMeans(scorei)
             scorei <- sweep(scorei, 2, avi,"-")
             trans <- qr.solve(scorei, scorei.all)
             scorei.out <- sweep(scorei.out, 2, avi, "-")
             fitscores[testi, , k] <- sweep(scorei.out%*%trans, 2, avcol, "+")
           }
         }
         fitscores <- apply(fitscores, 1:2, mean)
         
         if(!is.null(cl.other)){
           Fmatrix <- cvlist$Fmatrix
           ord <- order(Fmatrix)[1:nfeatures]
           rowcol <- cbind(as.vector(row(Fmatrix))[ord],as.vector(col(Fmatrix))[ord])
           ugenes <- unique(as.vector(cvlist$genelist[rowcol]))
           df <- cvlist$xUsed[, ugenes]
           xy.xda <- lda(cl~., data=df)
           train.scores <- predict(xy.xda, dimen=ndisc)$x
           other.scores <- predict(xy.xda, newdata=df.other,
                                   dimen=ndisc)$x
           avcol <- colMeans(globals)
           all.scores <- sweep(globals, 2, avcol,"-")
           av.train <- colMeans(train.scores)
           train.scores <- sweep(train.scores, 2, av.train, "-")
           trans <- qr.solve(train.scores, all.scores)
           other.scores <- sweep(other.scores%*%trans, 2, avcol, "+")
         }
         if(print.progress)cat("\n")
         invisible(list(scores=fitscores, cl=cl, other=other.scores,
                        cl.other=cl.other, nfeatures=nfeatures))
       }

