\name{EM}
\alias{EM}
\title{EM algorithm to fit maximum likelihood estimates of trait associations with SNP haplotypes}
\description{
This function takes a dataset of haplotypes in which rows for
individuals of uncertain phase have been augmented by "pseudo-individuals"
who carry the possible multilocus genotypes consistent with
the single-locus phenotypes.  The EM algorithm is used to find
MLEs for trait associations with covariates in generalized linear models.
}
\usage{EM(form,haplos.list,baseline = "missing" ,family = binomial(),
gamma = FALSE, maxit = 50, tol = 0.001, \ldots)}
\arguments{
\item{form}{model equation in usual R format}
\item{haplos.list}{list of haplotype data from \code{\link{PreEM}}}
\item{baseline}{optional, haplotype to be used for baseline coding. Default
is the most frequent haplotype.}
\item{family}{binomial, poisson, gaussian or gamma are supported, default=binomial}
\item{gamma}{initial estimates of haplotype frequencies, default values are calculated in \code{\link{PreEM}} using standard haplotype-counting 
(i.e. EM algorithm without adjustment for non-haplotype covariates)}
\item{maxit}{maximum iterations of the EM loop, default=50}
\item{tol}{convergence tolerance in terms of the maximum difference in 
parameter estimates between iterations; default=0.001}
\item{\ldots}{additional arguments to be passed to the glm function such 
as starting values for parameter estimates in the risk model}
}
\value{
\item{it}{number of iterations of the EM algorithm}
\item{beta}{estimated regression coefficients}
\item{gamma}{estimated haplotype frequencies}
\item{fits}{fitted values of the trait}
\item{wts}{final weights calculated in last iteration of the EM loop. These are estimates of the
conditional probabilities of each multilocus genotype given the observed 
single-locus genotypes.}
\item{var}{joint variance-covariance matrix of the estimated regression
coefficients and the estimated haplotype frequencies}
\item{dispersionML}{maximum likelihood estimate of dispersion parameter 
(to get the moment estimate, use \code{\link{summary.EM}})}
\item{family}{family of the generalized linear model (e.g. binomial, 
gaussian, etc.)}
\item{response}{trait value}
\item{converged}{TRUE/FALSE indicator of convergence. If the algorithm 
fails to converge, only the converged indicator is returned.}
}
\examples{
data(hypoDat)
example.preEM<-PreEM(hypoDat, 3)

names(example.preEM$haploDM)
# "h000"   "h001"   "h010"   "h011"   "h100"   "pooled"

# Logistic regression, baseline group: '001/001'

example.regr <- EM(affected ~ attr + h000+ h010 + h011 + h100 + pooled,
                     example.preEM, family=binomial())
}
\seealso{
\code{\link{PreEM}},\code{\link{summary.EM}},\code{\link{glm}},\code{\link{family}}.
}
\keyword{methods}

\eof
\name{PreEM}
\alias{PreEM}
\title{Pre-process the data before fitting it with EM} 
\description{
This function takes as an argument the original dataframe with
non-SNP and SNP data and converts the genotype data at single SNPs
(the single-locus genotypes) into haplotype data.
The rows of the original data frame should correspond to
subjects and each SNP should have two columns, one for each
allele of the single-locus genotype. The SNP data should comprise the
last 2*numSNPs columns.  If the haplotypes for a subject cannot be
inferred from his or her genotype data, "pseudo-individuals"
representing all possible haplotype combinations consistent with
the single-locus genotypes are considered.
Missing single-locus genotypes, up to a maximum of maxMissingGenos (see
below), are allowed, but subjects with missing data in more than
maxMissingGenos, or with missing non-SNP data, are removed.
Initial estimates of haplotype frequencies are then obtained using the 
EM algorithm applied to the multilocus genotype data. 
Haplotypes with frequencies below a user-specified tolerance (zero.tol)
are assumed not to exist and are removed from further consideration.
(Pseudo-individuals having haplotypes of negligible frequency are deleted and 
the column in the design matrix corresponding to that haplotype is deleted.)
For the remaining haplotypes, those with non-negligible frequency below a 
user-defined pooling tolerance (pooling.tol) are pooled into a single 
category called "pooled" in the design matrix for the risk model.
However, the frequencies of each of these pooled haplotypes are 
still calculated separately. 
}

\usage{PreEM(dat, numSNPs, maxMissingGenos = 1, pooling.tol = 0.05,
      zero.tol = 1/(2 * nrow(dat) * 10))}
\arguments{
\item{dat}{the non-SNP and SNP data as a data frame. The SNP data should comprise the last 2*numSNPs columns.}
\item{numSNPs}{number of SNPs per haplotype}
\item{maxMissingGenos}{maximum number of single-locus genotypes with missing data to allow for each subject. (Subjects with more missing data, or with missing non-SNP data are removed.) The default is 1.}
\item{pooling.tol}{pooling tolerance -- by default set to 0.05}
\item{zero.tol}{tolerance for haplotype frequencies below which haplotypes
are assumed not to exist -- by default set to
\eqn{\frac{1}{2 \times N \times 10}}{1/(2*N*10)} where N is the number of subjects}
}
\value{
\item{haplotest}{T/F, true if some haplotypes were pooled in the risk model}
\item{initGamma}{initial estimates of haplotype frequencies}
\item{zeroFreqHaplos}{list of haplos assumed not to exist}
\item{pooledHaplos}{list of haplos pooled into a single category in the design matrix}
\item{nonHaploDM}{non-haplotype portion of the AUGMENTED data frame}
\item{haploDM}{data frame with up to \eqn{2^{numSNPs}}{2^numSNPs} columns (fewer when haplotypes of negligible frequency are removed or rare haplotypes are pooled) scoring number of copies of each haplotype for each pseudo-individual}
\item{haploMat}{matrix with 2 columns giving haplotypes for each pseudo-individual}
\item{wt}{vector giving initial weights for each pseudo-individual for 
the EM algorithm}
\item{ID}{index for each individual in the original data frame. Note that all pseudo-individuals have the same ID value}
\item{unknown}{vector indicating whether the haplotype information was missing for each row in the augmented data}
}
\examples{
data(hypoDat)
example.preEM<-PreEM(hypoDat, numSNPs=3)

# To get the initial haplotype frequencies:
example.preEM$initGamma
#      h000       h001       h010       h011       h100       h101       h110 
#0.25179111 0.26050418 0.23606001 0.09164470 0.10133627 0.02636844 0.01081260 
#      h111 
#0.02148268 
# The '001' haplotype is estimated to be the most frequent

example.preEM$pooledHaplos
# "h101" "h110" "h111"
# These haplotypes are to be pooled in the design matrix for the risk model

names(example.preEM$haploDM)
# "h000"   "h001"   "h010"   "h011"   "h100"   "pooled"
}
\seealso{
\code{\link{EM}},\code{\link{summary.EM}}.
}
\keyword{methods}

\eof
\name{hapassoc-internal}
\title{Internal hapassoc functions}
\alias{EMnull}
\alias{EMvar}
\alias{IPhi}
\alias{IPhiGamma}
\alias{IPhiGaussian}
\alias{SPhi}
\alias{SPhiGamma}
\alias{SPhiGaussian}
\alias{codeHaploDM}
\alias{getHaplos}
\alias{getPhenos}
\alias{handleMissings}
\alias{isHetero}
\alias{isIn}
\alias{isMissing}
\alias{isMultiHetero}
\alias{makeHaploLab}
\alias{makeHaploLabN}
\alias{mlPhi}
\alias{mlPhiGamma}
\alias{momentPhiGamma}
\alias{pYgivenX}
\alias{RecodeHaplos}
\description{Internal hapassoc functions.}
\details{These are not to be called by the user or are undocumented.}
\keyword{internal}

\eof
\name{hypoDat}
\docType{data}
\alias{hypoDat}
\title{Simulated data for a hypothetical binary trait}
\description{
Simulated binary trait data used to illustrate the hapassoc package.
}
\usage{data(hypoDat)}
\format{Matrix with columns:
\tabular{rlll}{
[,1] \tab affected \tab numeric \tab affection status (1=yes, 0=no) \cr
[,2] \tab attr \tab numeric \tab simulated quantitative attribute \cr
[,3] \tab M1.1 \tab numeric \tab the first allele of hypothetical SNP M1 \cr
[,4] \tab M1.2 \tab numeric \tab the second allele of hypothetical SNP M1 \cr
[,5] \tab M2.1 \tab numeric \tab the first allele of hypothetical SNP M2 \cr
[,6] \tab M2.2 \tab numeric \tab the second allele of hypothetical SNP M2 \cr
[,7] \tab M3.1 \tab numeric \tab the first allele of hypothetical SNP M3 \cr
[,8] \tab M3.2 \tab numeric \tab the second allele of hypothetical SNP M3 }
}
\keyword{datasets}

\eof
\name{summary.EM}
\alias{summary.EM}
\title{Summarize results of the EM function}
\description{
Summary function for reporting the results of the EM function in a similar  
style to the lm and glm summaries.
}
\usage{\method{summary}{EM}(object, \ldots)}
\arguments{
\item{object}{a list of class EM}
\item{\ldots}{additional arguments to the summary function (currently unused)}
}
\value{
\item{coefficients}{Table of estimated coefficients, standard errors and Wald tests for each variable}
\item{frequencies}{Table of estimated haplotype frequencies and standard errors}
\item{dispersion}{Estimate of dispersion parameter (Moment estimator for gamma model)}
}
\examples{
data(hypoDat)
example.preEM<-PreEM(hypoDat, 3)
example.regr <- EM(affected ~ attr + h000+ h010 + h011 + h100 + pooled,
                     example.preEM, family=binomial())

# Summarize the results:
summary.EM(example.regr) # or just summary(example.regr)

# Results:
#$coefficients
#               Estimate Std. Error      zscore   Pr(>|z|)
#(Intercept) -1.24114270  0.7820977 -1.58694079 0.11252606
#attr         0.74036920  0.2918205  2.53707057 0.01117844
#h000         1.14968352  0.5942542  1.93466627 0.05303126
#h010        -0.59318434  0.6569672 -0.90291311 0.36657201
#h011        -0.03615243  0.9161959 -0.03945928 0.96852422
#h100        -0.85329292  1.0203105 -0.83630709 0.40298217
#pooled       0.38516864  0.8784283  0.43847478 0.66104215
#
#$frequencies
#         Estimate Std. Error
#f.h000 0.26716394 0.03933158
#f.h001 0.25191674 0.03866739
#f.h010 0.21997138 0.03881578
#f.h011 0.10094795 0.02949617
#f.h100 0.09507014 0.02371878
#f.h101 0.02584918 0.01411881
#f.h110 0.01779455 0.01386080
#f.h111 0.02128613 0.01247265
#
#$dispersion
#[1] 1
}
\seealso{
\code{\link{PreEM}},\code{\link{EM}}.
}
\keyword{methods}

\eof
