.packageName <- "hapassoc"
# Filename: EM.R
# Version : $Id: EM.R,v 1.9 2004/04/10 05:49:26 mcneney Exp $

# HapAssoc- Inference of trait-haplotype associations in the presence of uncertain phase
# Copyright (C) 2003  K.Burkett, B.McNeney, J.Graham

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

########################################################################

EM<-function(form, haplos.list, baseline="missing", family=binomial(), 
             gamma=FALSE, maxit=50, tol=0.001, ...){

 environment(form)<-environment()#set envir of formula to envir w/i EM function
 column.subset <- colnames(haplos.list$haploDM)!=baseline
 hdat <- cbind(haplos.list$nonHaploDM, haplos.list$haploDM[,column.subset])
 response<-model.response(model.frame(form,data=hdat)) 
 colnames(hdat)<- c(colnames(haplos.list$nonHaploDM),
                    colnames(haplos.list$haploDM[,column.subset]))
 ID <- haplos.list$ID
 # N<-ID[length(ID)]
 N<-sum(haplos.list$wt)
 wts<-haplos.list$wt
 
 # Get the haplotype columns
 haplos<-haplos.list$haploDM
 haploMat<-haplos.list$haploMat
 allHaps<-c(haploMat[,1],haploMat[,2]) #Needed later in EM loop for wt calcs
 haplos.names<-names(haplos.list$initGamma)

 # Initial gamma values, if no gamma specified use initGamma

 if (gamma!=FALSE) {
   names(gamma)<-haplos.names
 } else {
   gamma<-haplos.list$initGamma
 }

 # Initial beta values calculated from augmented data set
 # The ... in the following call to glm allows user to pass other args to glm

 regr<-glm(form, family=family, data=hdat,...) 
 beta<-regr$coef
 fits<-regr$fitted.values
 betadiff<-1
 it<-1
 num.prob<-vector(length=nrow(hdat))
 
 # The EM loop

 while ( (it<maxit) && (betadiff>tol) ){
   
        # Vector of P(Y|X)*P(X) probability 
	# If the person is unaffected, P(Y=0) is 1-fitted value
	# otherwise it is the fitted value 
   
        # Multiplicative const for haplo probs: 1 for homo, 2 for het
        haplo.probs<-rep(1,nrow(haplos))+isMultiHetero(haplos.list)
        haplo.probs <- haplo.probs*gamma[haploMat[,1]]*gamma[haploMat[,2]]

	phi<-mlPhi(regr) #Compute ML estimate of dispersion param
        if(is.null(phi)) { #no converergence in ml estimate of phi
           break() #EM will throw a warning of non-convergence
        }
        num.prob<-pYgivenX(response,fits,phi,family)*haplo.probs

	# E step: Calculate the weights for everyone
	# Use the ID to determine the number of pseudo-individuals in the 
	# denominator probability

	for (i in 1:nrow(hdat)){               
		pseudo.index<-(ID==ID[i])
                wts[i] <- num.prob[i]/sum(num.prob[pseudo.index])
        }

	# M step: Find new estimates using GLM and weighted haplotype counts

	# Find the new betas using old betas as starting value
        regr <- glm(form, family=family, data=hdat, weights=wts,
                    control=glm.control(epsilon=1e-08),start=beta)
        betaNew<-regr$coef
   	fits<-regr$fitted.values
   	betadiff<-max(abs(beta-betaNew), na.rm=TRUE)#maximum diif
   	beta<-betaNew

	# Find the new gammas, weighted sum of haplotypes
        gamma <- tapply(c(wts,wts),allHaps,sum)/(2*N)

        it<-it+1
 }
 
 if(betadiff>tol) { #did not converge
    warning(paste("No convergence in EM in",it,"iterations\n")) 
    ans<-list(converged=FALSE)
    class(ans)<-"EM"
    return(ans)
 }


 
 #gamma is currently an array which causes problems in the calculations below
 gamma <- as.matrix(gamma) 

 EMresults <- list(beta=beta, gamma=gamma, fits=fits, wts=wts, 
                   glm.final.fit=regr,dispersion=phi, response=response)
 var.est <- EMvar(haplos.list, EMresults, family)

 ans<-list(it=it, beta=beta, gamma=gamma, fits=fits, wts=wts, ID=ID,
           var=var.est, dispersionML=phi, family=family, response=response,
           converged=TRUE)

 class(ans)<-"EM"
 return(ans)

}

## Other functions called in EM:

########################################################################

mlPhi<-function(myfit,maxit=30,tol=1e-5) {
  if(myfit$family$family=="binomial" || myfit$family$family=="poisson") {
    return(1) #dispersion set to one
  }
  if(myfit$family$family=="gaussian") {
    return(summary(myfit)$deviance/sum(myfit$prior.weights))
  }
  if(myfit$family$family=="Gamma") { #Need to do Newton-Raphson
    return(mlPhiGamma(myfit,maxit,tol))
  }
  #Else, we don't support the family passed in
  stop(paste("ML estimation of dispersion not supported for family",
                myfit$family$family))
}

########################################################################

mlPhiGamma<-function(myfit,maxit,tol) {
  
  # Need to do Newton-Raphson, use moment estimate of phi to get
  # starting value for N-R 

  dev<-myfit$deviance
  n<-sum(myfit$prior.weights)
  phiMoment<-summary(myfit)$dispersion #moment estimate of phi
  ## Looking for root of score equation as a function of x=1/phi
  ## Function f is the score equation and fp is its derivative
  f<-function(x,n,dev) { return(dev+2*n*(digamma(x)-log(x))) }
  fp<-function(x,n) { return(2*n*(trigamma(x)-1/x)) }
  diff<-1; i<-1  # initialization
  xold<-1/phiMoment #starting value for N-R 
  while(i<=maxit && diff>tol) {
    xnew<-xold - f(xold,n,dev)/fp(xold,n)
    if(is.na(xnew)) {
       warning("No convergence for ML estimate of Gamma scale parameter")
       return(NULL)
    }
    diff<-abs(xnew-xold); xold<-xnew; i<-i+1
  }
  if(i>maxit) 
    warning("No convergence for ML estimate of Gamma scale parameter")
  return(1/xnew)
}

########################################################################

pYgivenX<-function(y,mu,phi,family){

  #Calculate P(Y|X) to be used in the weights. We use P(Y|X) in 
  #expressions like P(Y_i|X_i^j)P(X_I^j)/sum_k{ P(Y_i|X_i^k)P(X_i^k) }
  #so factors that are the same for all X_i^j can be ignored.
  #The deviance resids can be used to get P(Y|X) up to constants:
  #Assuming weights of 1 
  #dev.resid = 2*phi*(l(y,phi) - l(mu,phi)) where l(mu,phi) is the log-lik
  #using mean mu and l(y,phi) is the log-likelihood in the saturated
  #model using y as the mean.
  #So   exp(-dev.resid/(2*phi)) = P(y|x)*exp(-l(y,phi))
  #Call the dev.resid function with a weight of 1 

  return(exp(-1*family$dev.resid(y,mu,wt=1)/(2*phi)))
} 


## EMvar Functions
 
########################################################################

EMvar<-function(haplos.list, results, family)  {

 # Get the results of the last iteration
 beta<-results$beta[!is.na(results$beta)]
 betanames<-names(beta)
 num.beta<-length(beta)

 gammanames<-rownames(results$gamma)
 gamma<-as.vector(results$gamma)
 num.gamma<-length(gamma)

 weights<-results$wt
 ID <- haplos.list$ID

 final.regr <- results$glm.final.fit

 # Set up the Y vectors and P vector of fitted values
 # one for the augmented data set, one for just the missing
 
 missing<-(weights<1)

 yi.full<-results$response
 fits.full<-results$fits
 
 yi.mis<-yi.full[missing]
 fits.mis<-fits.full[missing]

 # Set up the matrices needed to find the variance:

   ## Design matrices

   Xreg <- model.matrix(final.regr)
   colnames(Xreg)<-betanames
   Xreg.mis<-Xreg[missing,]

   ## Haplotype matrix
   #Instead of assuming the glm model is additive in the haplotypes, 
   #make the Xgen matrix from scratch here. Used to be
   #   Xgen<-as.matrix(haplos.list$haploDM)
   haploMat<-haplos.list$haploMat
   #each application of outer in next line returns a matrix of T/F with
   # (i,j)th element true if haploMat[i,k]==gammanames[j]; k=1,2
   #Adding these T/F matrices coerces to 1/0 s first before adding
   Xgen<-outer(haploMat[,1],gammanames,"==")+outer(haploMat[,2],gammanames,"==")

   Xgen.mis <- as.matrix(Xgen[missing,])
 
   ## Weight matrices

   W.full<-diag(weights)
   W.mis<-diag(weights[missing])

   ## likelihood derivative matrices

   H.mis<-diag(yi.mis-fits.mis)/results$dispersion

   ## Haplotype frequency and derivative matrices

   ones.mat<-matrix(1,nrow=(num.gamma-1), ncol=(num.gamma-1))
   num.haplo<-vector(length=num.gamma)
   for (i in 1:num.gamma){
	num.haplo[i]<-sum(weights*Xgen[,i])}
   der.gamma<-1/gamma
   der2.gamma<-1/gamma^2

   G1<-diag(der.gamma)
   G<-G1[,1:(num.gamma-1)]
   G[num.gamma,]<-G1[num.gamma, num.gamma]


   ## Block diagonal matrix

   ID.mis <- ID[missing]
   i <- 1
   B<-matrix(0,nrow=sum(missing), ncol=sum(missing) )

   while (i<length(ID.mis)){
	pseudo.index<-ID.mis==ID.mis[i]
	ones.vec<-rep(1, sum(pseudo.index)*sum(pseudo.index))
	block<-matrix(ones.vec, nrow=sum(pseudo.index), ncol=sum(pseudo.index))
	block.size<-sum(pseudo.index)
	B[i:(i+block.size-1), i:(i+block.size-1)]<-block
	i<-i+sum(pseudo.index)
   }

 # Calculate the complete data expected information (Ic)

   ## Top block

   Ic.reg<-solve(summary(final.regr)$cov.unscaled)/results$dispersion

   ## Middle block if dispersion was estimated -- Ic.phi requires Sphi

   Sphi<-SPhi(final.regr,results$dispersion) #will be NULL if phi not est'd
   if(!is.null(Sphi)) { 
     Ic.phi<-IPhi(final.regr,Sphi,results$dispersion)
   } else { 
     Ic.phi<-NULL 
   }

   ## Bottom block

   Ic.cov<- diag(num.haplo[1:(num.gamma-1)]*der2.gamma[1:(num.gamma-1)])+der2.gamma[num.gamma]*num.haplo[num.gamma]*ones.mat

   ## Full block diagonal matrix (combine top, middle and bottom blocks)

   if(!is.null(Ic.phi)) {
     Ic<-matrix(0,nrow=(nrow(Ic.reg)+1+nrow(Ic.cov)),
                ncol=(nrow(Ic.reg)+1+nrow(Ic.cov))) #+1 is for phi
     Ic[1:num.beta,1:num.beta]<-Ic.reg
     Ic[(num.beta+1),(num.beta+1)]<-Ic.phi
     Ic[(num.beta+2):(num.beta+num.gamma),
        (num.beta+2):(num.beta+num.gamma)]<-Ic.cov
   } else {
     Ic<-matrix(0,nrow=(nrow(Ic.reg)+nrow(Ic.cov)),
                ncol=(nrow(Ic.reg)+nrow(Ic.cov)))
     Ic[1:num.beta,1:num.beta]<-Ic.reg
     Ic[(num.beta+1):(num.beta+num.gamma-1),
        (num.beta+1):(num.beta+num.gamma-1)]<-Ic.cov
   }
     

 # Calculate the missing data information (Imis), only over missing

   ## S_beta
 
   Sbeta<-H.mis%*%Xreg.mis

   ## S_phi

   if(!is.null(Sphi)) {
     Sphi<-Sphi[missing]
   }
 
   ## S_gamma
 
   Sgamma<-Xgen.mis%*%G

   ## Score matrix

   S<-cbind(Sbeta,Sphi,Sgamma)

   ## Calculate Imis from the other matrices

   Imis<-t(S)%*%(W.mis-W.mis%*%B%*%W.mis)%*%S  

 # Calculate the information and var/cov matrix

 Info<-Ic-Imis
 varEM<-solve(Info)

 gammanames <- paste("f", gammanames, sep="")
 if(!is.null(Sphi)) {
   colnames(varEM)<-c(betanames, "phi", gammanames[1:(num.gamma-1)])
 }else {
   colnames(varEM)<-c(betanames, gammanames[1:(num.gamma-1)])
 }
 rownames(varEM)<-colnames(varEM)
   

 return(varEM)

}


## Other functions called in EMvar:

########################################################################

SPhi<-function(myfit,phi) {
  if(myfit$family$family=="binomial" || myfit$family$family=="poisson") {
    return(NULL) #dispersion set to one so score not defined
  }
  if(myfit$family$family=="gaussian") {
    return(SPhiGaussian(myfit,phi))
  }
  if(myfit$family$family=="Gamma") { 
    return(SPhiGamma(myfit,phi))
  }
}

########################################################################

SPhiGaussian<-function(myfit,phi) {
  return( (myfit$y-myfit$fitted)^2/(2*phi^2) - 1/(2*phi) )
}

########################################################################

SPhiGamma<-function(myfit,phi) {
  x<-1/phi
  y<-myfit$y
  mu<-myfit$fitted.values
  return( (digamma(x)-log(x)+(y-mu)/mu-log(y/mu))*x^2 )
}

########################################################################

IPhi<-function(myfit,score,phi) {
  if(myfit$family$family=="binomial" || myfit$family$family=="poisson") {
    return(NULL) #dispersion set to one so phi not estimated
  }
  if(myfit$family$family=="gaussian") {
    return(IPhiGaussian(myfit,score,phi))
  }
  if(myfit$family$family=="Gamma") {
    return(IPhiGamma(myfit,score,phi))
  }
}

########################################################################

IPhiGaussian<-function(myfit,score,phi) {
  wts<-myfit$prior.weights
  n<-sum(wts)
  return( 2*(t(score)%*%wts)/phi + n/(2*phi^2) )
}

########################################################################

IPhiGamma<-function(myfit,score,phi) {
  x<-1/phi
  wts<-myfit$prior.weights
  n<-sum(wts)
  return( 2*x*(t(score)%*%wts) + n*x^4*(trigamma(x) - 1/x) )
}

  
# Filename: PreEM.R
# Version : $Id: PreEM.R,v 1.2 2004/03/24 23:55:15 mcneney Exp $

# HapAssoc- Inference of trait-haplotype associations in the presence of uncertain phase
# Copyright (C) 2003  K.Burkett, B.McNeney, J.Graham

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

########################################################################

PreEM <- function(dat,numSNPs,maxMissingGenos=1, pooling.tol=0.05, 
                        zero.tol=1/(2*nrow(dat)*10)){

  haplos.list<-RecodeHaplos(dat,numSNPs,maxMissingGenos)

  haplotest<-FALSE; ID.check<-rep(FALSE,length(haplos.list$ID))
  
  # Starting matrices, some rows/columns will be deleted if there
  # are missing haplotypes
  
  newhaploDM <- haplos.list$haploDM
  newnonHaploDM <- haplos.list$nonHaploDM
  nonHaploDMnames<-names(haplos.list$nonHaploDM) 
  newhaploMat <- haplos.list$haploMat
  newID <- haplos.list$ID

  #Run the usual EM algorithm using just the haplo information to
  #get estimates of haplotype frequencies and initial weights
  emres<-EMnull(haplos.list)
  newwt<-emres$wts
  
  zero.ind<-emres$gamma<zero.tol #flag haplos with zero frequency
  initGamma<-emres$gamma[!zero.ind]
  haplos.names<-names(initGamma)<-names(emres$gamma[!zero.ind])
  zeroFreqHaplos<-names(emres$gamma[zero.ind])

  if(sum(zero.ind)>0) { #then non-existent haplos need to be removed
    haplotest<-TRUE
    newhaploDM<-newhaploDM[,!zero.ind]

    #We only want rows that sum to two - others must have involved
    #haplotypes with estimated frequency of zero
    finalMatInd<- (rowSums(newhaploDM) == 2)

    newhaploDM<-newhaploDM[finalMatInd,]
    newhaploMat<-newhaploMat[finalMatInd,]
    newnonHaploDM<-data.frame(newnonHaploDM[finalMatInd,])
    names(newnonHaploDM)<-nonHaploDMnames
    newwt<-newwt[finalMatInd]
    newID<-newID[finalMatInd]
      
    # Re-calculate weights -- 2 loops!! there must be a better way
    uniqueIDs<-unique(newID)
    IDsum<-rep(0,length(uniqueIDs))
    for(i in 1:length(uniqueIDs)) {
      IDsum[i]<-sum(newwt[newID==uniqueIDs[i]])
    }
    for(i in 1:length(newwt)) {  
      newwt[i] <- newwt[i]/IDsum[uniqueIDs==newID[i]]
    }
  }

  pooling.ind<-initGamma<pooling.tol #flag rare haplos 
  pooled.haplos<-"no pooled haplos"
  if(sum(pooling.ind)>1) { #then pooling to be done *in design matrix only*
    pooled.haplos<-haplos.names[pooling.ind]
    pooledDMcol<-rowSums(newhaploDM[,pooling.ind])
    newhaploDM<-data.frame(newhaploDM[,!pooling.ind],pooled=pooledDMcol)
  }
  
  return(list(nonHaploDM=newnonHaploDM, haploDM=newhaploDM,
              haploMat=newhaploMat, wt=newwt, ID=newID, 
              haplotest=haplotest, initGamma=initGamma,
              zeroFreqHaplos=zeroFreqHaplos,pooledHaplos=pooled.haplos))
}


## Other functions called in CheckHaplos

########################################################################


isMissing <- function(test.vec){
  
  flag <- vector(length=max(test.vec))
  names(flag) <- c(1:length(flag))

  for (i in 1:length(flag)){if (sum(test.vec==i)==0){flag[i] <-TRUE} }
  return(flag)
}
  
isIn <- function(vec,vecElements) {
  #Return a vector of same length as vec. Element i of returned vector is
  #FALSE if vec[i] is not in the vector vecElements and TRUE if it is.
  flag<-rep(FALSE,length(vec))
  for(i in 1:length(vecElements)) {
    flag<-(flag | vec==vecElements[i])
  }
  return(flag)
}
  

EMnull<-function(haplos.list, gamma=FALSE, maxit=100, tol=1/(2*sum(haplos.list$wt)*100)){

 haploMat <- haplos.list$haploMat

 ID <- haplos.list$ID
 # N<-ID[length(ID)]
 N<-sum(haplos.list$wt)
 wts<-haplos.list$wt
 
 # Initial gamma values, if no gamma specified, calculate gamma values based
 # on augmented dataset.

 # We should avoid using the design matrix from an additive risk model to
 # help with haplotype frequency calculations. The following used to use
 # gamma <- (t(haplos)%*%wts)/(2*N)   where haplos is the additive model
 # design matrix as a computational trick to get haplotype frequencies.
 # Now use the tapply function and the haplotypes in haploMat to sum wts .

 if (gamma==FALSE){
    allHaps<-c(haploMat[,1],haploMat[,2])
    allWts<-c(wts,wts)
    gamma<-tapply(allWts,allHaps,sum)/(2*N)
 }

 gammadiff<-1
 it<-1
 num.prob<-vector(length=nrow(haploMat))
 
 # The EM loop

 while ( (it<maxit) && (gammadiff>tol) ){
   
        # multiplicative constant = 2 for heterozyg 1 for homozyg
        haplo.probs<-rep(1,nrow(haploMat))+isMultiHetero(haplos.list)
        haplo.probs <- haplo.probs*gamma[haploMat[,1]]*gamma[haploMat[,2]]
        num.prob<-haplo.probs

	# E step: Calculate the weights for everyone
	# Use the ID to determine the number of pseudo-individuals in the 
	# denominator probability

	for (i in 1:nrow(haploMat)){               
		pseudo.index<-ID==ID[i]
                wts[i] <- num.prob[i]/sum(num.prob[pseudo.index])
        }

	# M step: Find new estimates using weighted haplotype counts
        allWts<-c(wts,wts)
	gammaNew <- tapply(allWts,allHaps,sum)/(2*N)
   	gammadiff<-max(abs(gamma-gammaNew), na.rm=TRUE)#maximum diff
   	gamma<-gammaNew

        it<-it+1
 }
 if(gammadiff>tol) 
   warning(paste("no convergence in EMnull after ",maxit,"iterations\n"))
 
 results <- list(gamma=gamma, wts=wts)

 return(results)

}

isMultiHetero<-function(haplos.list) {
  return(as.numeric(haplos.list$haploMat[,1] != haplos.list$haploMat[,2]))
}
# HapAssoc- Inference of trait-haplotype associations in the presence of uncertain phase
# Copyright (C) 2003  K.Burkett, B.McNeney, J.Graham

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

########################################################################

RecodeHaplos<-function(dat,numSNPs,maxMissingGenos=1,logriskmodel="additive") {

  # Split dat into nonSNP and SNP data
  ncols<-ncol(dat)
  nonsnpcols<-ncols-2*numSNPs
  snpcols<-ncols-nonsnpcols
  nonSNPdat<-dat[,1:(ncols-2*numSNPs)]
  nhdmnames=names(dat)[1:(ncols-2*numSNPs)] #save the col names for the output - they will disappear when we typecast to matrix below

  nonSNPdat <- as.matrix(nonSNPdat)#as.data.frame(nonSNPdat)
  SNPdat<-as.matrix(dat[,(ncols-2*numSNPs+1):ncols])

  # Pre-process input to deal with missing data
  preProcDat<-handleMissings(SNPdat,nonSNPdat,numSNPs,maxMissingGenos)
  # Note: we could get rid of these 2 as.matrix(..) typecasts if we fix up handleMissings sometime in the future - MP.nov.2003
  SNPdat<-as.matrix(preProcDat$SNPdat)
  nonSNPdat<-as.matrix(preProcDat$nonSNPdat)
  ID<-preProcDat$ID

  # Initialization and setup:
  row.names(nonSNPdat)<-as.character(1:nrow(nonSNPdat))
  haploLabs<-makeHaploLabN(0:(2^numSNPs-1),numSNPs=numSNPs)
  numHaplos<-nrow(haploLabs)

  # Build data frames to hold nonSNP design matrix, haplotype data design
  # matrix and initial weights. The haplo design matrix will have a column
  # corresponding to each possible haplotype.
  wt<-NULL
  nonHaploDM<-matrix(nrow=(nrow(SNPdat)*2^(numSNPs-1)),ncol=nonsnpcols) #part of design matrix corresponding to non-haplo data
  nhdmidx<-1 #row index initializer for nonHaploDM matrix
  haploDM<-matrix(nrow=(nrow(SNPdat)*2^(numSNPs-1)),ncol=numHaplos) #part of design matrix corresponding to haplotypes
  hdmidx<-1 #row index initializer for haploDM matrix
  haploMat<-matrix(nrow=(nrow(SNPdat)*2^(numSNPs-1)),ncol=snpcols) #the actual haplotypes for each (pseudo-) individual
  hmatidx<-1 #row index initalizer for haploMat matrix
  ID.vec<-matrix(nrow=(nrow(SNPdat)*2^(numSNPs-1)),ncol=1) #to keep track of pseudo-individuals original ID
  ididx<-1 #row index for ID.vec

  heteroVec<-rep(NA,numSNPs) #Initializer

  # Main loop to construct design matrix-- do SNP and non-SNP data separately
  # Loop over subjects and if a subject's geno data does not determine
  # haplos, enumerate all consistent haplos and add pseudo-individuals
  # to the design matrices for each possible haplo specification

  for(i in 1:nrow(SNPdat)){

    # Function isHetero returns a vector heterovec of T's and F's
    # describing whether the person is heterozygous at each locus
    # This has now been inlined below (function isHetero to be removed??)
    for(j in 1:numSNPs){heteroVec[j]<-(.subset(SNPdat,i,2*j-1)!=.subset(SNPdat,i,2*j))}
    numHetero<-sum(heteroVec)

    # The rows of matrix myhaplos are possible haplotype combinations
    # for the ith subject. The first column of the matrix has a character
    # string of 0's and 1's for a binary number describing the first
    # haplotype and the second column has a string for a binary number
    # describing the second haplotype.

    myhaplos<-getHaplos(SNPdat[i,],heteroVec)
    numHaploComb<-nrow(myhaplos)

    for(j in 1:numHaploComb) { #loop over haplo combos consistent w/ obs data
      haploDM[hdmidx,]<-codeHaploDM(myhaplos[j,],haploLabs,model=logriskmodel)
      hdmidx<-hdmidx+1
    }
    for(j in 1:numHaploComb){
      nonHaploDM[nhdmidx,]<-nonSNPdat[i,]
      nhdmidx<-nhdmidx+1
      }
    for(j in 1:numHaploComb){
      ID.vec[ididx]<-ID[i]
      ididx<-ididx+1
    }
    for(j in 1:nrow(myhaplos)) {
    	haploMat[hmatidx,]<-myhaplos[j,]
	hmatidx<-hmatidx+1
    }

  }  #end for loop over subjects

  haploDM<-haploDM[1:(hdmidx-1),]
  haploMat<-haploMat[1:(hmatidx-1),]
  ID.vec<-ID.vec[1:(ididx-1),]
  nonHaploDM<-nonHaploDM[1:(nhdmidx-1),]

  #Now renormalize the weights to sum to 1 before returning. Only necessary if 
  #there was missing SNP data. With missing SNPs handleMissings creates 
  #copies of a person, one for each possible complete set of single-locus
  #genotypes. E.g. if there is one allele of one SNP missing for a person,
  #that person will be copied into two "people" each with weight 1.
  for(i in 1:(ididx-1)) {
    wt[i]<-1/sum(ID.vec==ID.vec[i])
  }

  #Only need to return the columns of haploDM that have non-zero column sums:
  myColSums<-colSums(haploDM)
  haploDM<-haploDM[,myColSums>0]

  haploDM<-data.frame(haploDM)

  #Here we re-format our  haploMat to be compatible with the old output format and store it
  #in HaploMat2.  It *may* be that in the future, haploMat2 could be dropped if the
  #computationally-friendly format of haploMat is preferred over the older format in whatever
  #code is calling RecodeHaplos and making use of the output. -Matt
  n<-ncol(haploMat)
  haploMat2<-matrix(nrow=nrow(haploMat),ncol=2)
  haploMat2[,1]<-paste("h",haploMat[,1],sep="")
  haploMat2[,2]<-paste("h",haploMat[,n/2+1],sep="")
  for(i in 2:(n/2)) {
	haploMat2[,1]<-paste(haploMat2[,1],haploMat[,i],sep="")
	haploMat2[,2]<-paste(haploMat2[,2],haploMat[,i+n/2],sep="")
  }
  #Need to protect columns of haploMat2 from being coerced into factors
  #with the I() function. BM Jan/04
  haploMat2<-data.frame(haplo1=I(haploMat2[,1]),haplo2=I(haploMat2[,2]))

  #Put column names just like the old format. -Matt
  hdmnames<-makeHaploLab(0:(2^numSNPs-1),numSNPs)
  #Only the labels with >0 column sums though
  names(haploDM)<-paste("h",hdmnames[myColSums>0],sep="")

  nonHaploDM<-data.frame(nonHaploDM)
  #Put column names just like the old format. -Matt
  names(nonHaploDM)<-nhdmnames

  return(list(nonHaploDM=nonHaploDM,haploDM=haploDM,haploMat=haploMat2,
              wt=wt, ID=ID.vec))
}


## Other functions called in RecodeHaplos:

########################################################################
handleMissings<-function(SNPdat,nonSNPdat,numSNPs,maxMissingGenos)
{
  temnonSNPdat<-na.omit(nonSNPdat)
  omittedRows<-attr(temnonSNPdat,"na.action") # Which rows were removed
  nonSNPdat<-data.frame(temnonSNPdat)
  if(!is.null(omittedRows)) {
    warning(paste(length(omittedRows),
                  "subjects removed because of missing nongenetic data\n"))
    SNPdat<-SNPdat[-omittedRows,] # Remove these from SNPdat too
  }

  numMissingGenos<-rep(0,nrow(SNPdat)) # Count SNP genos with missing data
  for(i in 1:numSNPs) {
    numMissingGenos <- numMissingGenos+
                         (is.na(SNPdat[,(2*i-1)])|is.na(SNPdat[,2*i]))
  }

  # Remove people with too many missing genotypes
  ind<-numMissingGenos<=maxMissingGenos
  if(sum(!ind)>0) {
    warning(paste(sum(!ind),
            "subjects with missing data in more than",maxMissingGenos,
            "genotype(s) removed\n"))
    nonSNPdat<-data.frame(nonSNPdat[ind,])
    SNPdat<-data.frame(SNPdat[ind,])
    numMissingGenos<-numMissingGenos[ind]
  }

  ID <- c(1:nrow(SNPdat)) # initial ID's
  missingGenos<-(numMissingGenos>0)
  if(any(missingGenos)) {
    # First save copies of those with missing data
    temSNPdat<-data.frame(SNPdat[missingGenos,])
    temnonSNPdat<-data.frame(nonSNPdat[missingGenos,])
    temID<-ID[missingGenos]

    # Reduce SNPdat and nonSNPdat to people who have no missing data
    SNPdat<-data.frame(SNPdat[!missingGenos,]) 
    nonSNPdat<-data.frame(nonSNPdat[!missingGenos,])
    ID<-ID[!missingGenos]

    # Now augment the SNP and nonSNP data by enumerating sets of complete 
    # SNP genos (call these "complete phenos") consistent with the
    # observed phenotypes.
    for(i in 1:sum(missingGenos)) {
      missingVec<-is.na(temSNPdat[i,])
      completePhenos<-getPhenos(temSNPdat[i,],numSNPs,missingVec)
      numPhenos<-nrow(completePhenos)
      for(j in 1:numPhenos) { #loop over complete phenos consistent w/ obs data
        SNPdat<-rbind(SNPdat,completePhenos[j,])
        nonSNPdat<-rbind(nonSNPdat,temnonSNPdat[i,])
        ID <- c(ID,temID[i])
      }
    } # end loop over subjects with missing data
  } #end if(any(missingGenos))

  return(list(SNPdat=SNPdat,nonSNPdat=nonSNPdat,ID=ID))
}

getPhenos<-function(snps,numSNPs,missingVec) {

  # Inefficient but simple approach: consider both a 0 or 1 for each NA,
  # e.g for one locus with NA/NA --> 0/0, 0/1, 1/0, 1/1.
  # Then order the alleles       --> 0/0, 0/1, 0/1, 1/1
  # Then remove duplicates       --> 0/0, 0/1, 1/1

  knownAlleles<-snps[!missingVec]

  # Enumerate all possible values for missings
  k<-sum(missingVec)
  # Can use makeHaploLab to enumerate all possible alleles for missing vals
  misAlleles<-makeHaploLab(0:(2^(k)-1),numSNPs=k)
  misAlleles<-matrix(as.numeric(unlist(strsplit(misAlleles,split=""))),
                         ncol=k,byrow=TRUE) #turn labels into a numeric matrix
  numPhenos<-nrow(misAlleles)

  myPhenos<-matrix(NA,nrow=numPhenos,ncol=length(snps))
  myPhenos[,!missingVec]<-matrix(rep(knownAlleles,numPhenos),
                                 ncol=length(knownAlleles),
                                 byrow=TRUE)
  myPhenos[,missingVec]<-misAlleles

  # Now order alleles
  for(i in 1:numSNPs) {
    ind<-myPhenos[,(2*i-1)]>myPhenos[,2*i] #these are the 1/0 genos
    myPhenos[ind,(2*i-1)]<-0; myPhenos[ind,2*i]<-1
  }

  # now reduce to unique rows with built-in unique.array function
  myPhenos<-unique.array(myPhenos)

  return(myPhenos)
}

makeHaploLab<-function(x,numSNPs=2) {

  #Function used to construct labels for SNP haplos; e.g. with 3 SNPs we want
  #labels 000, 001, ..., 111 which in each case is n plus one of
  #the numbers 0,1,...,(2^3-1) represented in base2.
  #Takes x as a base 10 number and returns haplo label
  #For example if numSNPs is 3 then x=0 would lead to the string "000",
  #x=1 would lead to "001" and so on. Function can take x as a vector so
  #an example of usage is: mylabs<-makeHaploLab(0:(2^3-1),numSNPs=3)
  #
  #Basic idea is to fill in digits of the base2 numbers left to right.
  #Example: numSNPs=3, x=7. Then x= 1*2^2 + 1*2^1 + 1*2^0 = 111 in base2.
  #Start by filling in the first digit, then second , then third.

  ans<-"" #start label as blank

  for(i in (numSNPs-1):0) {
    digit<-floor(x/2^i)
    ans<-paste(ans,as.character(digit),sep="") #update answer
    x<-x-digit*2^i
  }
  return(ans)
}

# A version of makeHaploLab that returns a numeric vector instead of a string of haplotype pairs
makeHaploLabN<-function(x,numSNPs=2) {
	len<-length(x)
	ans<-matrix(0,nrow=len,ncol=numSNPs)

	for(i in (numSNPs-1):0){
		digit<-floor(x/2^i)
		ans[,numSNPs-i]<-digit
		x<-x-digit*2^i
	}
	return(ans)
}


########################################################################

isHetero<-function(SNPvec,numSNPs) {

  #Function to take a vector of SNP data for a person and figure out
  #which loci person is heterozygous for. Returns logical vector.
  #Assumes genotypes are in pairs in the vector, e.g.
  #SNPvec = (M1.allele1,M1.allele2,M2.allele1,M2.allele2,...)

  if(length(SNPvec)/2 != numSNPs)
    stop("SNPvec not compatible with numSNPs\n")

  ans<-rep(NA,numSNPs)

  for(i in 1:numSNPs) {
    #marker i's data are in elements 2*i-1 and 2*i of the SNPvec
    ans[i]<-(SNPvec[2*i-1]!=SNPvec[2*i])
  }
  return(ans)
}
########################################################################


getHaplos<-function(SNPvec,heteroVec){
	nloci<-length(heteroVec)
	k<-sum(heteroVec)

	if(k<=1) {
		haplo<-matrix(nrow=1,ncol=length(SNPvec))
		mid<-length(SNPvec)/2
		haplo[1,(1:mid)]<-SNPvec[2*(1:nloci)-1]
		haplo[1,(mid+1):length(SNPvec)]<-SNPvec[2*(1:nloci)]
	}
	else {
		heteroStates1<-makeHaploLabN(0:(2^(k-1)-1),numSNPs=k)
		heteroStates2<-1-heteroStates1
		haplo<-matrix(NA,ncol=2*nloci,nrow=2^(k-1))
		hvec1<-rep(FALSE,2*nloci)
		hvec2<-rep(FALSE,2*nloci)
		hvec1[1:nloci]<-heteroVec
		hvec2[(nloci+1):(2*nloci)]<-heteroVec
		haplo[,hvec1]<-heteroStates1
		haplo[,hvec2]<-heteroStates2
		if((nloci-k)>0) {
			homoStates<-SNPvec[2*(1:nloci)][!heteroVec]
			homoStates<-matrix(rep(homoStates,2^(k-1)),ncol=(nloci-k),byrow=TRUE)
			hvec1<-rep(FALSE,2*nloci) ##was len
			hvec2<-rep(FALSE,2*nloci) ##was len
			hvec1[1:nloci]<-!heteroVec
			hvec2[(nloci+1):(2*nloci)]<-!heteroVec
			haplo[,hvec1]<-homoStates
			haplo[,hvec2]<-homoStates
		}
	}
	return(haplo)
}

##########################################################################


codeHaploDM<-function(haplos,haploLabs,model="additive"){

  #Nice try, but this is broken for >2 snps, see below for general implementation --Matt
  #ans<-((haplos[1]==haploLabs[,1]) & (haplos[2]==haploLabs[,2]))
  #ans<-ans+((haplos[3]==haploLabs[,1]) & (haplos[4]==haploLabs[,2]))

  #Not as "tricky" as above, but much cleaner to understand:
  n=length(haplos)
  nsnp=ncol(haploLabs)

  ans1<-t(haplos[1:(n/2)]==t(haploLabs))
  ans2<-t(haplos[(n/2+1):n]==t(haploLabs))
  ans11<-ans1[,1]
  ans22<-ans2[,1]
  for(i in 2:nsnp) {
	ans11<-ans11&ans1[,i]
	ans22<-ans22&ans2[,i]
  }
  ans=ans11+ans22

  return(ans)
}

# Filename: summaryEM.R
# Version : $Id: summaryEM.R,v 1.5 2004/01/25 20:21:13 mcneney Exp $

# HapAssoc- Estimation of trait-haplotype associations in the presence of uncertain phase
# Copyright (C) 2003  K.Burkett, B.McNeney, J.Graham

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

########################################################################

summary.EM<-function(object, ...) {

family<-object$family$family
if(family=="Gamma"){
 #For a Gamma model, use a moment rather than ML estimate of phi.
 #The ML estimate is overly sensitive to round-off errors in small 
 #responses and to departures from the Gamma model (Mc&N pp295-296).
  dispersion=momentPhiGamma(object) 
  object$var<-object$var*dispersion/(object$dispersionML)
} else {
  dispersion=object$dispersionML
}

numbeta<-length(object$beta)
if(family=="Gamma"|family=="gaussian") #remove row and col for dispersion
   object$var<-object$var[-(numbeta+1),-(numbeta+1)]
numfreqs<-length(object$gamma)
coef.table<-cbind(object$beta,sqrt(diag(object$var[1:numbeta,1:numbeta])))
#Now calculate the se for the last frequency estimator
varfreqs<-object$var[(numbeta+1):(numbeta+numfreqs-1),(numbeta+1):(numbeta+numfreqs-1)]
cvec<-rep(-1,numfreqs-1)
varlast<-t(cvec)%*%varfreqs%*%cvec
freq.table<-cbind(object$gamma, c(sqrt(diag(varfreqs)),sqrt(varlast)))

#Compute z-scores for regression coefficients
coef.table<-cbind(coef.table,coef.table[,1]/coef.table[,2])

#Now two-sided p-values. 
coef.table<-cbind(coef.table,1-pchisq(coef.table[,3]^2,df=1))
dimnames(coef.table)<-list(names(object$beta),
                           c("Estimate","Std. Error","zscore","Pr(>|z|)"))
dimnames(freq.table)<-list(
      paste("f.",dimnames(object$gamma)[[1]],sep=""),
      c("Estimate","Std. Error"))

return(list(coefficients=coef.table,frequencies=freq.table,
            dispersion=dispersion))
}
## Other functions called in summary.EM

########################################################################

momentPhiGamma<-function(object) {
  ans<-0
  uniqueID<-unique(object$ID)
  for(i in 1:length(uniqueID)) {
    ind<-(object$ID==uniqueID[i]) #identify pseudo-individuals for uniqueID[i]
    myy<-(object$response[ind])[1] #response is the same for all such ps-indiv
    mywt<-object$wts[ind]; myfit<-object$fits[ind]
    mubar<-sum(mywt*myfit)
    a<-sum(mywt*(myfit-mubar)^2)
    b<-sum(mywt*myfit^2)
    ans<-ans+((myy-mubar)^2-a)/b
  }
  n<-sum(object$wts); p<-length(object$beta)+length(object$gamma)
  return(ans/(n-p))
}
