accTrainTest             package:hddplot             R Documentation

_T_w_o _s_u_b_s_e_t_s _o_f _d_a_t_a _e_a_c_h _t_a_k_e _i_n _t_u_r_n _t_h_e _r_o_l_e _o_f _t_e_s_t _s_e_t

_D_e_s_c_r_i_p_t_i_o_n:

     A division of data is specified, for use of linear discriminant
     analysis, into a training and test set. Feature selection and
     model fitting is  formed, first with I/II as training/test, then
     with II/I as training/test

_U_s_a_g_e:

     accTrainTest(x = Golub.BM, cl = cancer.BM, traintest = gp.id, nfeatures = NULL, print.acc = FALSE)

_A_r_g_u_m_e_n_t_s:

       x: Matrix; rows are features, and columns are observations 
          ('samples')

      cl: Factor that classifies columns into groups that will classify
          the data for purposes of discriminant calculations

traintest: Values that specify a division of observations into two
          groups. In the first pass (fold), one to be training and the
          other test, with the roles then reversed in a second pass or
          fold.

nfeatures: integer: numbers of features for which calculations are
          required

print.acc: logical: should accuracies be printed?

_D_e_t_a_i_l_s:

_V_a_l_u_e:

  sub1.2: row numbers of features, by order of values of the group 
          separation measure, for the first subset (I) of the 'x'

  acc1.2: accuracies, with I as training set and II as test

  sub2.1: row numbers of features, by order of values of the group 
          separation measure, for the second subset (II) of the 'x'

  acc2.1: accuracies, with II as training set and I as test

_N_o_t_e:

_A_u_t_h_o_r(_s):

     John Maindonald

_R_e_f_e_r_e_n_c_e_s:

_S_e_e _A_l_s_o:

_E_x_a_m_p_l_e_s:

     mat <- matrix(rnorm(1000), ncol=20)
     cl <- factor(rep(1:3, c(7,9,4)))
     gp.id <- divideUp(cl, nset=2)
     accTrainTest(x=mat, cl=cl, traintest=gp.id,
                  nfeatures=1:16, print.acc=TRUE)

     ## The function is currently defined as
     function(x=Golub.BM, cl=cancer.BM, traintest=gp.id,
                nfeatures=NULL, print.acc=FALSE){
         traintest <- factor(traintest)
         train <- traintest==levels(traintest)[1]
         testset <- traintest==levels(traintest)[2]
         cl1 <- cl[train]
         cl2 <- cl[testset]
         ng1 <- length(cl1)
         ng2 <- length(cl2)
         maxg <- max(c(ng1-length(unique(cl1))-2,
                       ng2-length(unique(cl2))-2))
         if(is.null(nfeatures)){
           max.features <- maxg
           nfeatures <- 1:max.features
         } else
         {
           if(max(nfeatures)>maxg)nfeatures <- nfeatures[nfeatures<=maxg]
           max.features <- max(nfeatures)
         }
         ord1 <- orderFeatures(x, cl, subset=train)[1:max.features]
         ord2 <- orderFeatures(x, cl, subset=testset)[1:max.features]
         ord <- unique(c(ord1, ord2))
         sub1 <- match(ord1, ord)
         sub2 <- match(ord2, ord)
         df1 <- data.frame(t(x[ord, train]))
         df2 <- data.frame(t(x[ord, testset]))
         acc1 <- acc2 <- numeric(max(nfeatures))
         for(i in nfeatures){
           cat(paste(i, ":", sep=""))
           df1.lda <- lda(df1[, sub1[1:i], drop=FALSE], cl1)
           hat2 <- predict(df1.lda, newdata=df2[, sub1[1:i], drop=FALSE])$class
           tab <- table(hat2, cl2)
           acc1[i] <- sum(tab[row(tab)==col(tab)])/sum(tab)
           df2.lda <- lda(df2[, sub2[1:i], drop=FALSE], cl2)
           hat1 <- predict(df2.lda, newdata=df1[, sub2[1:i], drop=FALSE])$class
           tab <- table(hat1, cl1)
           acc2[i] <- sum(tab[row(tab)==col(tab)])/sum(tab)
         }
         cat("\n")
         if(print.acc){
           print(round(acc1,2))
           print(round(acc2,2))
         }
         maxacc1 <- max(acc1)
         maxacc2 <- max(acc2)
         sub1 <- match(maxacc1, acc1)
         sub2 <- match(maxacc2, acc2)
         nextacc1 <- max(acc1[acc1<1])
         nextacc2 <- max(acc1[acc1<2])
         lower1 <- maxacc1-sqrt(nextacc1*(1-nextacc1)/ng1)
         lower2 <- maxacc2-sqrt(nextacc2*(1-nextacc2)/ng2)
         lsub1 <- min((1:ng1)[acc1>lower1])
         lsub2 <- min((1:ng2)[acc2>lower2])
         lower <- c("Best accuracy, less 1SD  ",
                    paste(paste(round(c(lower1, lower2),2), c(lsub1, lsub2),
                                sep=" ("), " features)   ", sep=""))
         best <- c("Best accuracy",
                   paste(paste(round(c(maxacc1, maxacc2),2), c(sub1, sub2),
                               sep=" ("), " features)", sep=""))
         acc.df <- cbind(lower, best)
         dimnames(acc.df) <- list(c("Training/test split",
                                    "I (training) / II (test)    ",
                                    "II (training) / I (test)    "),c("",""))
         print(acc.df, quote=FALSE)
         invisible(list(sub1.2=ord1, acc1.2=acc1, sub2.1=ord2, acc2.1=acc2))
       }

