#
# $Id: HWE.chisq.Rd,v 1.1 2003/03/07 14:32:48 warnesgr Exp $
#

\name{HWE.chisq}
\alias{HWE.chisq}
\alias{HWE.chisq.genotype}

\title{Perform Chi-Square Test for Hardy-Weinberg Equilibrium}

\description{
  Test the null hypothesis that Hardy-Weinberg equilibrium holds using
  the Chi-Square method.
}

\usage{
HWE.chisq(x, ...)
\method{HWE.chisq}{genotype}(x, simulate.p.value=TRUE, B=10000, ...)
          
}
\arguments{
  \item{x}{genotype or haplotype object.}
  \item{simulate.p.value}{a logical value indicating whether the p-value
    should be computed using simulation instead of using the
    \eqn{\chi^2}{Chi-Square} approximation. Defaults to \code{TRUE}.}
  \item{B}{Number of simulation iterations to use when
    \code{simulate.p.value=TRUE}. Defaults to 10000.}
  \item{...}{ optional parameters passed to \code{chisq.test}}
}
\details{
  This function generates a 2-way table of allele counts, then calls
  \code{\link{chisq.test}} to compute a p-value for Hardy-Weinberg
  Equilibrium.  By default, it requests that the p-value be
  computed using a simulation/permutation method.  When
  \code{simulate.p.value} is false, it computes the p-value using
  the Chi-Square distribution, with the appropriate degrees of freedom.
}
\value{
  An object of class \code{htest}.
}

\seealso{
  \code{\link{HWE.exact}},
  \code{\link{HWE.test}},
  \code{\link{diseq}},
  \code{\link{diseq.ci}},
  \code{\link{allele}},
  \code{\link{chisq.test}},
  \code{\link[boot]{boot}},
  \code{\link[boot]{bootci}}
}


\examples{
\testonly{
set.seed(4657613)
}
example.data   <- c("D/D","D/I","D/D","I/I","D/D",
                    "D/D","D/D","D/D","I/I","")
g1  <- genotype(example.data)
g1

HWE.chisq(g1)
# compare with
HWE.exact(g1)
# and 
HWE.test(g1)

three.data   <- c(rep("A/A",8),
                  rep("C/A",20),
                  rep("C/T",20),
                  rep("C/C",10),
                  rep("T/T",3))

g3  <- genotype(three.data)
g3

HWE.chisq(g3, B=10000)


}
\keyword{ misc }
%\keyword{genetics}

\eof
% $Id: HWE.exact.Rd,v 1.3 2003/03/07 14:52:27 warnesgr Exp $
%
% $Log: HWE.exact.Rd,v $
% Revision 1.3  2003/03/07 14:52:27  warnesgr
% - Modified HWE.exact to return an object of class 'htest'
% - Noted this change in the man file and added HWE.chisq to list of
%   links and to example code.
%
% Revision 1.2  2003/02/03 16:13:56  warnesgr
% - Fixed typos and R CMD check warnings.
% - Updated version number
% - Removed 'data' directory to fix new R CMD check warning.
%
% Revision 1.1  2002/12/02 16:16:44  warnesgr
%
% - Added HWE.exact.Rd.
%
%

\name{HWE.exact}
\alias{HWE.exact}
\title{Exact Test of Hardy-Weinberg Equilibrium for 2-Allele Markers}
\description{
  Exact test of Hardy-Weinberg Equilibrium for 2 Allele Markers.
}
\usage{
HWE.exact(x)
}
\arguments{
  \item{x}{ Genotype object }
}
%\details{
%}
\value{
  Object of class 'htest'.
}
\references{
    Emigh TH. (1980) "Comparison of tests for Hardy-Weinberg Equilibrium",
    Biometrics, 36, 627-642.
  }
\author{
  David Duffy \email{davidD@qimr.edu.au} with modifications by Gregory
  R. Warnes \email{gregory\_r\_warnes@groton.pfizer.com}
  }
\note{ This function only works for genotypes with exactly 2 alleles.}
\seealso{
  \code{\link{HWE.chisq}},
  \code{\link{HWE.test}},
  \code{\link{diseq}},
  \code{\link{diseq.ci}}
}

\examples{
example.data   <- c("D/D","D/I","D/D","I/I","D/D",
                    "D/D","D/D","D/D","I/I","")
g1  <- genotype(example.data)
g1

HWE.exact(g1)
# compare with
HWE.chisq(g1)


\testonly{
set.seed(465764)
}

g2 <- genotype(sample( c("A","C"), 100, p=c(100,10), rep=TRUE),
               sample( c("A","C"), 100, p=c(100,10), rep=TRUE) )
HWE.exact(g2)

}
\keyword{ misc }
%\keyword{genetics}


\eof
#
# $Id: HWE.test.Rd,v 1.13 2004/05/25 19:40:02 warnesgr Exp $
#
# $Log: HWE.test.Rd,v $
# Revision 1.13  2004/05/25 19:40:02  warnesgr
# Many fixex.
#
# Revision 1.12  2003/03/07 14:47:53  warnesgr
#
# - Added links to HWE.chisq, HWE.exact, genotype to see also seciton.
# - Slighlty modified example code.
#
# Revision 1.11  2003/03/07 14:32:48  warnesgr
#
# - Created HWE.chisq, HWE.chisq.genotype and corresponding man page.
# - Moved computation of Chisquare test for HWE from HWE.test.genotype
#   to HWE.chisq.genotype.
# - Added option (on by default) to compute the exact p-value using HWE.exact.
#   This is on by default when nallele=2
#
# Revision 1.10  2002/11/27 15:32:20  warnesgr
# Correct spelling errors and typos.
#
# Revision 1.9  2002/11/12 19:58:44  warnesgr
# - Changes to remove warnings generated by 'R CMD check'.
#
# Revision 1.8  2002/11/08 21:07:28  warnesgr
#
# - DESCRIPTION: Updated version number and date
# - TODO: Updated todo list.
#
# Revision 1.7  2002/09/24 01:32:19  warnesgr
# - 'Un-genericized' diseq()
# - Moved documentation of diseq() and diseq from HWE.test.Rd to diseq.Rd
# - Cleaned up HWE.test.Rd and diseq.Rd
# - Fixed broken class check in diseq() and diseq.ci()
# - Removed allele.count.default() -- this will force the user to
#   explicitly call 'genotype' on the data to use allele.count().
# - Added zzz.R to require package 'boot'
#
# Revision 1.6  2002/09/24 00:02:01  WarnesGR
# - Moved code that computed D-hat to diseq() and diseq.genotype() in diseq.R.
# - Added diseq.ci() to compute confidence interval for D-hat.
# - Added code to call diseq() and diseq.ci() from HWE.test()
# - Added arguments to HWE.test() and print.HWE.test() to control these new features
# - Added text to HWE.test.Rd documenting these new functions and arguments
#
# Revision 1.5  2002/06/18 19:38:41  warnesgr
#
# Changes to fix problems reported by R CMD check.
#
# Revision 1.4  2002/02/14 12:48:56  warnes
#
# - Added 'keywords' to HWE.test
# - Create undocumented.Rd to satisfy documentation requirements for
#   objects not otherwise included in documentation
# - Add 'alias' commands noting that 'genotype' and '[.haplotype' are
#   documented here.
#
# Revision 1.3  2001/06/28 19:15:29  warnes
# - Updated to match v1.4 of HWE.test.R.
#
# - Added CVS id and log fields to file.
#
#

\name{HWE.test}
\alias{HWE.test}
\alias{HWE.test.genotype}
\alias{HWE.test.data.frame}
\alias{print.HWE.test}

\title{Estimate Disequilibrium and Test for Hardy-Weinberg Equilibrium}

\description{
  Estimate disequilibrium parameter and test the null hypothesis that 
  Hardy-Weinberg equilibrium holds. 
}

\usage{
HWE.test(x, ...)
\method{HWE.test}{genotype}(x, exact = nallele(x)==2, simulate.p.value=!exact,
         B=10000, conf=0.95, ci.B=1000, ... )
\method{HWE.test}{data.frame}(x, ..., do.Allele.Freq=TRUE, do.HWE.test=TRUE)
\method{print}{HWE.test}(x, show=c("D","D'","r"), ...)
}
\arguments{
  \item{x}{genotype or haplotype object.}
  \item{exact}{a logical value indicating whether the p-value should be
    computed using the exact method, which is only available for 2
    allele genotypes.}
  \item{simulate.p.value}{a logical value indicating whether the p-value
    should be computed using simulation instead of using the
    \eqn{\chi^2}{Chi-Square} approximation. Defaults to \code{!exact}.}
  \item{B}{Number of simulation iterations to use when
    \code{simulate.p.value=TRUE}. Defaults to 10000.}
  \item{conf}{Confidence level to use when computing the confidence
    interval for D-hat.  Defaults to 0.95, should be in (0,1). }
  \item{ci.B}{Number of bootstrap iterations to use when computing the
    confidence interval. Defaults to 1000.}
%   \item{ci.type}{Method of calculating the confidence interval using the
%     bootstrap sample.  Defaults to \code{"basic"}.  See 
%     \code{\link[boot]{bootci}} for details.}
  \item{show}{a character vector containing the names of HWE test
    statistics to display from the set of "D", "D'", and "r".}
  \item{...}{ optional parameters passed to \code{HWE.test} (data.frame
    method) or \code{chisq.test} (base method).}
  \item{do.Allele.Freq}{logical indicating whether to summarize allele
    frequencies.}
  \item{do.HWE.test}{logical indicating whether to perform HWE tests.}
}
\details{
  HWE.test calls \code{\link{diseq}} to compute the Hardy-Weinberg
  (dis)equilibrium statistics D, D', and r (correlation coefficient).
  Next it calls \code{\link{diseq.ci}} to compute a bootstrap confidence
  interval for these estimates.  Finally, it calls
  \code{\link{chisq.test}} to compute a p-value for Hardy-Weinberg
  Equilibrium using a simulation/permutation method.

  Using bootstrapping for the confidence interval and simulation for the
  p-value avoids reliance on the assumptions underlying the Chi-square
  approximation.  This is particularly important when some allele pairs
  have small counts.

  For details on the definition of D, D', and r, see the help page for
  \code{\link{diseq}}.
}
\value{
  An object of class \code{HWE.test} with components
  \item{diseq}{A \code{\link{diseq}} object providing details on the
    disequilibrium estimates.}
  \item{ci}{A \code{\link{diseq.ci}} object providing details on the
    bootstrap confidence intervals for the disequilibrium estimates.}
  \item{test}{A \code{\link[ctest]{htest}} object providing details on the
    permutation based Chi-square test.}
  \item{call}{function call used to create this object.}
  \item{conf, B, ci.B, simulate.p.value}{values used for these arguments.}
}
\author{ Gregory R. Warnes \email{Gregory\_R\_Warnes@groton.pfizer.com}
  }

\seealso{
  \code{\link{genotype}},
  \code{\link{diseq}},
  \code{\link{diseq.ci}},
  \code{\link{HWE.chisq}},
  \code{\link{HWE.exact}}
}


\examples{
\testonly{
set.seed(4657613)
}
example.data   <- c("D/D","D/I","D/D","I/I","D/D",
                    "D/D","D/D","D/D","I/I","")
g1  <- genotype(example.data)
g1

HWE.test(g1)

#compare with
diseq(g1)
diseq.ci(g1)
HWE.chisq(g1)
HWE.exact(g1)


three.data   <- c(rep("A/A",8),
                  rep("C/A",20),
                  rep("C/T",20),
                  rep("C/C",10),
                  rep("T/T",3))

g3  <- genotype(three.data)
g3

HWE.test(g3, ci.B=10000)


}
\keyword{ misc }
%\keyword{genetics}

\eof
\name{LD}
\alias{LD}
\alias{LD.genotype}
\alias{LD.data.frame}
\title{Pairwise linkage disequilibrium between genetic markers.}
\description{
  Compute pairwise linkage disequilibrium between genetic markers
}
\usage{
LD(g1, ...)
\method{LD}{genotype}(g1,g2,...)
\method{LD}{data.frame}(g1,...)
}
\arguments{
  \item{g1}{ genotype object or dataframe containing genotype objects }
  \item{g2}{ genotype object (ignored if g1 is a dataframe) }
  \item{\dots}{ optional arguments (ignored) }
}
\details{
  Linkage disequilibrium (LD) is the non-random association of
  marker alleles and can arise from marker proximity or from selection
  bias.
  
  \code{LD.genotype} estimates the extent of LD for a single pair of
  genotypes.  \code{LD.data.frame} computes LD for all pairs of
  genotypes contained in a data frame.  Before starting,
  \code{LD.data.frame} checks the class and number of alleles of each
  variable in the dataframe.  If the data frame contains non-genotype
  objects or genotypes with more or less than 2 alleles, these will be
  omitted from the computation and a warning will be generated.

  Three estimators of LD are computed:
  \itemize{

    \item{D}{ raw difference in frequency between the
              observed number of AB pairs and the expected number:
      
      \deqn{%
 	D = p_{AB} - p_A p_B %
      }{%
 	D = p(AB) - p(A)*p(B) %
      }
      
    }
    \item{D'}{ scaled D spanning the range [-1,1] 
      
      
      \deqn{D' = \frac{D}{D_{max} } }{D' = D / Dmax}
      
      where, if D > 0:
      \deqn{%
  	D_{max} = \min( p_A p_b, p_a p_B )  %
      }{%
  	Dmax = min( p(A)p(b), p(a)p(B) )   %
      } 
      or if D < 0:
      \deqn{%
  	D_{max} = \max( -p_A p_B, -p_a p_b )  %
      }{%
  	Dmax = max( -p(A)p(B), -p(a)p(b) )  %
      }
    }
    
    \item{r}{ correlation coefficient between the markers
      
      \deqn{%
 	r = \frac{-D}{\sqrt{ p_A * p_a * p_B * p_b }} %
      }{%
 	r = -D / sqrt( p(A) * p(a) * p(B) * p(b) ) %
      }
    }
  }
  
  where
  \itemize{
    \item{-}{ \eqn{p_A}{p(A)} is defined as the observed probability of
      allele 'A' for marker 1, }
    \item{-}{ \eqn{p_a=1-p_A}{p(a) = 1-p(A)} is defined as the observed probability of
      allele 'a' for marker 1, }
    \item{-}{\eqn{p_B}{p(B)} is defined as the observed probability of
      allele 'B' for marker 2, }
    \item{-}{\eqn{p_b=1-p_B}{p(b) = 1-p(B)} is defined as the observed probability of
      allele 'b' for marker 2, and }
    \item{-}{\eqn{p_{AB}}{p(AB)} is defined as the probability of
      the marker allele pair 'AB'. }
  }

  For genotype data, AB/ab cannot be distinguished from
  aB/Ab. Consequently, we estimate \eqn{p_{AB}}{p(AB)} using maximum
  likelihood and use this value in the computations.
  }
\value{

  \code{LD.genotype} returns a 7 element list:
    \item{call}{the matched call}
    \item{D}{Linkage disequilibrium estimate}
    \item{Dprime }{Scaled linkage disequilibrium estimate}
    \item{corr}{Correlation coefficient} 
    \item{nobs}{Number of observations}
    \item{chisq}{Chi-square statistic for linkage
      equilibrium (i.e., D=D'=corr=0)}
    \item{p.value}{Chi-square p-value for marker independence}

  \code{LD.data.frame} returns a list with the same elements, but each
  element is a matrix where the upper off-diagonal elements contain the
  estimate for the corresponding pair of markers.  The other matrix
  elements are \code{NA}.
}
%\references{ ~put references to the literature/web site here ~ }
\author{ Gregory R. Warnes \email{gregory\_r\_warnes@groton.pfizer.com} }
\seealso{ \code{\link{genotype}}, \code{\link{HWE.test}}  }
\examples{

g1 <- genotype( c('T/A',    NA, 'T/T',    NA, 'T/A',    NA, 'T/T', 'T/A',
                  'T/T', 'T/T', 'T/A', 'A/A', 'T/T', 'T/A', 'T/A', 'T/T',
                     NA, 'T/A', 'T/A',   NA) )

g2 <- genotype( c('C/A', 'C/A', 'C/C', 'C/A', 'C/C', 'C/A', 'C/A', 'C/A',
                  'C/A', 'C/C', 'C/A', 'A/A', 'C/A', 'A/A', 'C/A', 'C/C',
                  'C/A', 'C/A', 'C/A', 'A/A') )


g3 <- genotype( c('T/A', 'T/A', 'T/T', 'T/A', 'T/T', 'T/A', 'T/A', 'T/A',
                  'T/A', 'T/T', 'T/A', 'T/T', 'T/A', 'T/A', 'T/A', 'T/T',
                  'T/A', 'T/A', 'T/A', 'T/T') )

# Compute LD on a single pair

LD(g1,g2)

# Compute LD table for all 3 genotypes

data <- makeGenotypes(data.frame(g1,g2,g3))
LD(data)
}
\keyword{misc}

\eof
\name{binsearch}
\alias{binsearch}
\title{Binary Search}
\description{
  Search within a specified range to locate an integer parameter which
  results in the specified monotonic function obtaining a given value.
}
\usage{
binsearch(fun, range, ..., target = 0, lower = ceiling(min(range)),
          upper = floor(max(range)), maxiter = 100, showiter = FALSE)
}
\arguments{
  \item{fun}{Monotonic function over which the search will be performed.}
  \item{range}{2-element vector giving the range for the search.}
  \item{\dots}{Additional parameters to the function \code{fun}.}
  \item{target}{Target value for \code{fun}.  Defaults to 0.}
  \item{lower}{Lower limit of search range. Defaults to
    \code{ceiling(min(range))}.}
  \item{upper}{Upper limit of search range. Defaults to
    \code{floor(max(range))}.}
  \item{maxiter}{ Maximum number of search iterations. Defaults to 100.}
  \item{showiter}{ Boolean flag indicating whether the algorithm state
    should be printed at each iteration. Defaults to FALSE.}
}
\details{
  This function implements an extension to the standard binary search
  algorithm for searching a sorted list.  The algorithm has been
  extended to cope with cases where an exact match is not possible, to
  detect whether the function is monotonic increasing or
  decreasing and act appropriately, and to detect when the target value
  is outside the specified range.

  The algorithm initializes two variables \code{lo} and
  \code{hi} to the extreme values of \code{range}.  It then generates
  a new value \code{center} halfway between \code{lo} and \code{hi}.  If
  the value of \code{fun} at \code{center} exceeds \code{target}, it
  becomes the new value for \code{lo}, otherwise it becomes the new
  value for \code{hi}.  This process is iterated until \code{lo} and
  \code{hi} are adjacent.  If the function at one or the other equals
  the target, this value is returned, otherwise \code{lo}, \code{hi},
  and the function value at both are returned.

  Note that when the specified target value falls between integers, the
  \emph{two} closest values are returned.  If the specified target falls
  outside of the specified \code{range}, the closest endpoint of the
  range will be returned, and a warning message will be generated.  If
  the maximum number of iterations was reached, the endpoints of the
  current subset of the range under consideration will be returned.
}
\value{
  A list containing:
  \item{call}{How the function was called.}
  \item{numiter}{The number of iterations performed}
  \item{flag }{One of the strings,  "Found", "Between Elements",
    "Maximum number of iterations reached", "Reached lower boundary", or
    "Reached upper boundary."}
  \item{where}{One or two values indicating where the search
    terminated.}
  \item{value}{Value of the function \code{fun} at the values of
    \code{where}.}
}
%\references{ ~put references to the literature/web site here ~ }
\author{Gregory R. Warnes \email{gregory\_r\_warnes@groton.pfizer.com} }
\note{This function often returns two values for \code{where} and
  \code{value}.  Be sure to check the \code{flag} parameter to see what
  these values mean.}
\seealso{ \code{\link[base]{optim}}, \code{\link[base]{optimize}},
  \code{\link[base]{uniroot}} }
\examples{

### Toy examples

# search for x=10
binsearch( function(x) x-10, range=c(0,20) )

# search for x=10.1
binsearch( function(x) x-10.1, range=c(0,20) )

### Classical toy example

# binary search for the index of 'M' among the sorted letters
fun <- function(X) ifelse(LETTERS[X] > 'M', 1,
                          ifelse(LETTERS[X] < 'M', -1, 0 ) )

binsearch( fun, range=1:26 ) 
# returns $where=13
LETTERS[13]

### Substantive example, from genetics

# Determine the necessary sample size to detect all alleles with
# frequency 0.07 or greater with probability 0.95.
power.fun <- function(N) 1 - gregorius(N=N, freq=0.07)$missprob

binsearch( power.fun, range=c(0,100), target=0.95 )

# equivalent to
gregorius( freq=0.07, missprob=0.05)
}
\keyword{optimize}
\keyword{programming}

\eof
\name{ci.balance}
\alias{ci.balance}
\title{Experimental Function to Correct Confidence Intervals At or Near
  Boundaries of the Parameter Space by 'Sliding' the Interval on the
  Quantile Scale.}
\description{Experimental function to correct confidence intervals at or near
  boundaries of the parameter space by 'sliding' the interval on the
  quantile scale.}
\usage{
ci.balance(x, est, confidence=0.95, alpha=1-confidence, minval, maxval,
           na.rm=TRUE)
}
\arguments{
  \item{x}{Bootstrap parameter estimates.}
  \item{est}{Observed value of the parameter.}
  \item{confidence}{Confidence level for the interval. Defaults to
    0.95.}
  \item{alpha}{Type I error rate (size) for the interval. Defaults to
               1-\code{confidence}.}  
  \item{minval}{A numeric value specifying the lower bound of the
                parameter space. Leave unspecified (the default) if
                there is no lower bound.}
  \item{maxval}{A numeric value specifying the upper bound of the
                parameter space. Leave unspecified (the default) if
                there is no upper bound.}
  \item{na.rm}{ logical. Should missing values be removed?}
}
\details{
  EXPERIMENTAL FUNCTION:

  This function attempts to compute a proper \code{confidence}*100\%
  confidence interval for parameters at or near the boundary of the
  parameter space using bootstrapped parameter estimates by 'sliding'
  the confidence interval on the quantile scale.
  
  This is accomplished by attempting to place a \code{confidence}*100\%
  interval symmetrically *on the quantile scale* about the observed
  value.  If a symmetric interval would exceed the observed data at the
  upper (lower) end, a one-sided interval is computed with the upper
  (lower) boundary fixed at the upper (lower) boundary of the
  parameter space.
}
\value{
  A list containing:
  \item{ci}{A 2-element vector containing the lower and upper confidence
    limits.  The names of the elements of the vector give the actual
    quantile values used for the interval or one of the character
    strings "Upper Boundary" or "Lower Boundary".}
  \item{overflow.upper, overflow.lower}{The number of elements beyond
    those observed that would be needed to compute a symmetric (on the
    quantile scale) confidence interval.}
  \item{n.above, n.below}{The number of bootstrap values which are above
    (below) the observed value.}
  \item{lower.n, upper.n}{The index of the value used for the endpoint
    of the confidence interval or the character string "Upper Boundary"
    ("Lower Boundary").}
}
\author{ Gregory R. Warnes \email{Gregory\_R\_Warnes@groton.pfizer.com}
}

\seealso{
  \code{\link[boot]{boot}},
  \code{\link[bootstrap]{bootstrap}},
  Used by \code{\link{diseq.ci}}.
}

\examples{
# These are nonsensical examples which simply exercise the
# computation. See the code to diseq.ci for a real example.
#
# FIXME: Add real example using boot or bootstrap.  

set.seed(7981357)
x <- abs(rnorm(100,1))
ci.balance(x,1, minval=0)
ci.balance(x,1)

x <- rnorm(100,1)
x <- ifelse(x>1, 1, x)
ci.balance(x,1, maxval=1)
ci.balance(x,1)
}
\keyword{misc}
%\keyword{genetics}


\eof
# $Id: diseq.Rd,v 1.7 2003/05/27 18:45:52 warnesgr Exp $
#
# $Log: diseq.Rd,v $
# Revision 1.7  2003/05/27 18:45:52  warnesgr
#
# - Fix typos, update arguments to match changes to code
#
# Revision 1.6  2003/04/07 20:30:19  warnesgr
#
# - Flipped sign of D, D', r.  Now D matches sign given in Weir's book,
#   and r has the proper sign.
#
# Revision 1.5  2002/11/27 15:32:20  warnesgr
# Correct spelling errors and typos.
#
# Revision 1.4  2002/11/12 19:58:45  warnesgr
# - Changes to remove warnings generated by 'R CMD check'.
#
# Revision 1.3  2002/11/12 05:31:21  warnesgr
# - Fix mismatches between documentation and code that we generating
#   warning messages.
#
# Revision 1.2  2002/11/08 19:53:57  warnesgr
#
# - Moved ci.balance() to a separate file and created a documentation file for it.
# - Modified ci.balance to better annotate when it uses boundary values.
# - Modified diseq.ci to better provide warning message when the number of
#   alleles is greater than 3.
#
# Revision 1.1  2002/09/24 01:32:19  warnesgr
# - 'Un-genericized' diseq()
# - Moved documentation of diseq() and diseq from HWE.test.Rd to diseq.Rd
# - Cleaned up HWE.test.Rd and diseq.Rd
# - Fixed broken class check in diseq() and diseq.ci()
# - Removed allele.count.default() -- this will force the user to
#   explicitly call 'genotype' on the data to use allele.count().
# - Added zzz.R to require package 'boot'
#
#

\name{diseq}
\alias{diseq}
\alias{diseq.table}
\alias{diseq.genotype}
\alias{diseq.ci}
\alias{print.diseq}
\title{Estimate or Compute Confidence Interval for the Single-Marker Disequilibrium}
\description{
  Estimate or compute confidence interval for single-marker disequilibrium.
}
\usage{
diseq(x, ...)
\method{print}{diseq}(x, show=c("D","D'","r"), ...)
diseq.ci(x, R=1000, conf=0.95, correct=TRUE, na.rm=TRUE, ...)
}
\arguments{
  \item{x}{genotype or haplotype object.}
  \item{show}{a character value or vector indicating which
    disequilibrium measures should be displayed.  The default is to show
    all of the available measures.}
  \item{conf}{Confidence level to use when computing the confidence
    interval for D-hat.  Defaults to 0.95, should be in (0,1). }
  \item{R}{Number of bootstrap iterations to use when computing the
    confidence interval. Defaults to 1000.}
  \item{correct}{See details.}
  \item{na.rm}{ logical. Should missing values be removed?}
  \item{...}{ optional parameters passed to \code{boot.ci}
    (\code{diseq.ci}) or ignored.}
}
\details{
  For a single-gene marker, \code{diseq} computes the Hardy-Weinberg
  (dis)equilibrium statistics D, D', and r (the correlation coefficient)
  for each pair of allele values, as well as an overall value for
  each. \code{print.diseq} displays the contents of a \code{diseq}
  object. \code{diseq.ci} computes a bootstrap confidence interval for
  this estimate.

  For each allele pair,
  \itemize{
    \item{D}{ is defined as half of the raw difference
      in frequency between
      the observed number of heterozygotes and the expected number:
      
      \deqn{%
 	D = \frac{1}{2} ( p_{ij} - 2  p_i p_j ) %
      }{%
 	D = 1/2 * ( p(ij) - 2 * p(i)*p(j) )  %
      }
      
    }
    \item{D'}{ rescales D to span the range [-1,1] 
      
      
      \deqn{D' = \frac{D}{D_{max} } }{D' = D / Dmax}
      
      where, if D > 0:
      \deqn{%
  	D_{max} = \min( p_i, p_j ) - p_{ij}  %
      }{%
  	Dmax = min( p(i),p(j) ) - p(ij)   %
      } 
      or if D < 0:
      \deqn{%
  	D_{max} = p_{ij} %
      }{%
  	Dmax = p(ij)  %
      }
    }
    
    \item{r}{ is the correlation coefficient between the two alleles ignoring all
      other alleles, and can be computed by
      
      \deqn{%
 	r = \frac{-D}{\sqrt{ p_i (1-p_i) p_j (1-p_j) }} =
	\frac{-D}{p_i p_j}%
      }{%
 	r = -D / sqrt( p(i)*(1-p(i)) * p(j)*(1-p(j)) ) = -D / p(i)p(j) %
      }
    }
    
  }
  
  where
  \itemize{
    \item{-}{ \eqn{p_i}{p(i)} defined as the observed probability of
      allele 'i', }
    \item{-}{\eqn{p_j}{p(j)} defined as the observed probability of
      allele 'j', and }
    \item{-}{\eqn{p_{ij}}{p(ij)} defined as the observed probability of
      the allele pair 'ij'. }
  }
  
  When there are more than two alleles, the summary values for these
  statistics are obtained by computing a weighted average of the
  absolute value of each allele pair, where the weight is determined by
  the expected frequency. For example:

   \deqn{%
     D_{overall} = \sum_{i \ne j}  |D_{ij}| * p_{ij} % 
   }{%
     D.overall = sum |D(ij)| * p(ij) %
   }

  Bootstrapping is used to generate confidence interval in order to
  avoid reliance on parametric assumptions, which will not hold for
  alleles with low frequencies (e.g. \eqn{D'} following a Chi-square 
  distribution).  

  See the function \code{\link[genetics]{HWE.test}} for testing
  Hardy-Weinberg Equilibrium, \eqn{D=0}.
  
}
\value{
  \code{diseq} returns an object of class \code{diseq} with components
  \itemize{
    \item{data}{2-way table of allele pair counts}
    \item{D.hat}{matrix giving the observed count, expected count,
      observed - expected difference, and estimate of disequilibrium for
      each pair of alleles as well as an overall disequilibrium value.}
    \item{call}{function call used to create this object}
  }

  \code{diseq.ci} returns an object of class \code{\link[boot]{bootci}}
}
\author{ Gregory R. Warnes \email{Gregory\_R\_Warnes@groton.pfizer.com}
  }

\seealso{
  \code{\link{genotype}},
  \code{\link{HWE.test}},
  \code{\link[boot]{boot}},
  \code{\link[boot]{bootci}}
}

\examples{
\testonly{
set.seed(7981357)
}
example.data   <- c("D/D","D/I","D/D","I/I","D/D",
                    "D/D","D/D","D/D","I/I","")
g1  <- genotype(example.data)
g1

diseq(g1)
diseq.ci(g1)
HWE.test(g1)  # does the same, plus tests D-hat=0

three.data   <- c(rep("A/A",8),
                  rep("C/A",20),
                  rep("C/T",20),
                  rep("C/C",10),
                  rep("T/T",3))

g3  <- genotype(three.data)
g3

diseq(g3)
diseq.ci(g3, ci.B=10000, ci.type="bca")
}
\keyword{misc}
%\keyword{genetics}

\eof
% $Id: genotype.Rd,v 1.21 2004/05/25 19:40:02 warnesgr Exp $
%
% %Log$
%

\name{genotype}
\alias{genotype}
\alias{haplotype}
\alias{is.genotype}
\alias{is.haplotype}
\alias{as.genotype}
\alias{as.haplotype}

%%
\alias{print.genotype}
\alias{==.genotype}
\alias{==.haplotype}
\alias{[.genotype}
\alias{[.haplotype}
\alias{[<-.genotype}
\alias{[<-.haplotype}
\alias{heterozygote.genotype}
\alias{homozygote.genotype}
\alias{print.allele.count}
\alias{print.allele.genotype}
\alias{allele.count.genotype}
%\alias{allele.genotype}
%\alias{allele.names}
\alias{as.genotype.allele.count}
\alias{as.genotype.character}
\alias{as.genotype.default}
\alias{as.genotype.factor}
\alias{as.genotype.genotype}
\alias{as.genotype.haplotype}
\alias{as.genotype.table}
\alias{nallele}

\title{Genotype or Haplotype Objects.}
\description{
  \code{genotype} creates a genotype object.

  \code{haplotype} creates a haplotype object.

  \code{is.genotype} returns \code{TRUE} if \code{x} is of class
  \code{genotype}

  \code{is.haplotype} returns \code{TRUE} if \code{x} is of class
  \code{haplotype}

  \code{as.genotype} attempts to coerce its argument into an object of
  class \code{genotype}.

  \code{as.genotype.allele.count} converts allele counts (0,1,2) into
  genotype pairs ("A/A", "A/B", "B/B").

  \code{as.haplotype} attempts to coerce its argument into an object of
  class \code{haplotype}.

  \code{nallele} returns the number of alleles in an object of class
  \code{genotype}.
  
}
\usage{
  genotype(a1, a2=NULL, alleles=NULL, sep="/", remove.spaces=TRUE,
           reorder = c("yes", "no", "default", "ascii", "freq"),
           allow.partial.missing=FALSE, locus=NULL)

  haplotype(a1, a2=NULL, alleles=NULL, sep="/", remove.spaces=TRUE,
           reorder="no", allow.partial.missing=FALSE, locus=NULL)

  is.genotype(x)

  is.haplotype(x)

  as.genotype(x, ...)

  as.genotype.allele.count(x, alleles=c("A","B"), ... )

  as.haplotype(x, ...)

  print.genotype(x, ...)

  nallele(x)
}

\arguments{
  \item{x}{ either an object of class \code{genotype} or
    \code{haplotype} or an object to be converted to class \code{genotype} or
    \code{haplotype}.}
  \item{a1,a2}{ vector(s) or matrix containing two alleles
    for each individual. See details, below.}
  \item{alleles}{ names (and order if \code{reorder="yes"}) of possible
    alleles.}
  \item{sep}{ character separator or column number used to divide
    alleles when \code{a1} is a vector of strings where each string
    holds both alleles. See below for details.}
  \item{remove.spaces}{ logical indicating whether spaces and tabs will
    be removed from a1 and a2  before processing.}
  \item{reorder}{how should alleles within an individual be reordered.
    If \code{reorder="no"}, use the order specified by the alleles
    parameter.  If \code{reorder="freq"} or
    \code{reorder="yes"}, sort alleles within each individual by
    observed frequency.  If \code{reorder="ascii"}, reorder alleles in
    ASCII order (alphabetical, with all upper case before
    lower case). The default value for \code{genotype}
    is \code{"yes"} (equivalent to \code{"freq"}).  The default value
    for \code{haplotype} is \code{"no"}.
  }
  \item{allow.partial.missing}{logical indicating whether one allele is
    permitted to be missing.  When set to \code{FALSE} both alleles
    are set to \code{NA} when either is missing.}
  \item{locus}{ object of class locus, gene, or marker, holding
    information about the source of this genotype.}
  \item{...}{optional arguments}
}
\details{
  Genotype objects hold information on which gene or marker alleles were
  observed for different individuals.  For each individual, two alleles
  are recorded.

  The genotype class considers the stored alleles to be unordered, i.e., "C/T"
  is equivalent to "T/C".  The haplotype class considers the order of the
  alleles to be significant so that "C/T" is distinct from "T/C".

  When calling \code{genotype} or \code{haplotype}:

  \itemize{
  \item If only \code{a1} is provided and is a character vector, it is
    assumed that each element encodes both alleles. In this case, if
    \code{sep} is a character string, \code{a1} is assumed to be coded
    as "Allele1<sep>Allele2".  If \code{sep} is a numeric value, it is
    assumed that character locations \code{1:sep} contain allele 1 and
    that remaining locations contain allele 2.

  \item If \code{a1} is a matrix, it is assumed that column 1 contains
  allele 1 and column 2 contains allele 2.

  \item If \code{a1} and \code{a2} are both provided, each is assumed to
  contain one allele value so that the genotype for an individual is
  obtained by \code{paste(a1,a2,sep="/")}.

  }

  If \code{remove.spaces} is TRUE (the default), any whitespace
  contained in \code{a1} and \code{a2} is removed when the genotypes are
  created.  If whitespace is used as the separator (e.g., "C C", "C T",
  ...), be sure to set \code{remove.spaces} to FALSE.

  When the alleles are explicitly specified using the \code{alleles}
  argument, all potential alleles not present in the list will be
  converted to \code{NA}.
  
  NOTE: \code{genotype} assumes that the order of the alleles is not important
  (e.g., "A/C" == "C/A").  Use class \code{haplotype} if order is significant.
}
\value{
  
  The genotype class extends "factor" and haplotype extends
  genotype. Both classes have the following attributes:
  \item{levels}{ character vector of possible genotype/haplotype values
    stored coded by \code{paste( allele1, "/", allele2, sep="")}.}
  \item{allele.names}{ character vector of possible alleles. For a SNP,
    these might be c("A","T").   For a variable length dinucleotide
    repeat this might be c("136","138","140","148"). }
  \item{allele.map}{ matrix encoding how the factor levels correspond to
    alleles.  See the source code to \code{allele.genotype()} for how to
    extract allele values using this matrix.  Better yet, just use
    \code{allele.genotype()}. }
    
}
%\references{ ~put references to the literature/web site here ~ }
\author{Gregory R. Warnes \email{Gregory\_R\_Warnes@groton.pfizer.com} and
  Friedrich Leisch.}
\seealso{
%  \code{\link{genotype}},
  \code{\link{HWE.test}},
  \code{\link{allele}},
  \code{\link{homozygote}},
  \code{\link{heterozygote}}, 
  \code{\link{carrier}},
  \code{\link{summary.genotype}},
  \code{\link{allele.count}},
  \code{\link{locus}},
  \code{\link{gene}},
  \code{\link{marker}}
  }

\examples{
# several examples of genotype data in different formats
example.data   <- c("D/D","D/I","D/D","I/I","D/D",
                    "D/D","D/D","D/D","I/I","")
g1  <- genotype(example.data)
g1

example.data2  <- c("C-C","C-T","C-C","T-T","C-C",
                    "C-C","C-C","C-C","T-T","")
g2  <- genotype(example.data2,sep="-")
g2


example.nosep  <- c("DD", "DI", "DD", "II", "DD",
                    "DD", "DD", "DD", "II", "")
g3  <- genotype(example.nosep,sep="")
g3

example.a1 <- c("D",  "D",  "D",  "I",  "D",  "D",  "D",  "D",  "I",  "")
example.a2 <- c("D",  "I",  "D",  "I",  "D",  "D",  "D",  "D",  "I",  "")
g4  <- genotype(example.a1,example.a2)
g4

example.mat <- cbind(a1=example.a1, a1=example.a2)
g5  <- genotype(example.mat)
g5

example.data5  <- c("D   /   D","D   /   I","D   /   D","I   /   I",
                    "D   /   D","D   /   D","D   /   D","D   /   D",
                    "I   /   I","")
g5  <- genotype(example.data5,rem=TRUE)
g5

# show how genotype and haplotype differ
data1 <- c("C/C", "C/T", "T/C")
data2 <- c("C/C", "T/C", "T/C")

test1  <- genotype( data1 )
test2  <- genotype( data2 )

test3  <-  haplotype( data1 )
test4  <-  haplotype( data2 )

test1==test2
test3==test4

test1=="C/T"
test1=="T/C"

test3=="C/T"
test3=="T/C"

## "Messy" example

m3  <-  c("D D/\t   D D","D\tD/   I",  "D D/   D D","I/   I",
          "D D/   D D","D D/   D D","D D/   D D","D D/   D D",
          "I/   I","/   ","/I")

genotype(m3)
summary(genotype(m3))

m4  <-  c("D D","D I","D D","I I",
          "D D","D D","D D","D D",
          "I I","   ","  I")

genotype(m4,sep=1)
genotype(m4,sep=" ",remove.spaces=FALSE)
summary(genotype(m4,sep=" ",remove.spaces=FALSE))

m5  <-  c("DD","DI","DD","II",
          "DD","DD","DD","DD",
          "II","   "," I")
genotype(m5,sep=1)
haplotype(m5,sep=1,remove.spaces=FALSE)

g5  <- genotype(m5,sep="")
h5  <- haplotype(m5,sep="")

heterozygote(g5) 
homozygote(g5)    
carrier(g5,"D")

g5[9:10]  <- haplotype(m4,sep=" ",remove=FALSE)[1:2]
g5

g5[9:10]
allele(g5[9:10],1)
allele(g5,1)[9:10]

# drop unused alleles 
g5[9:10,drop=TRUE]
h5[9:10,drop=TRUE]

# Convert allele.counts into genotype

x <- c(0,1,2,1,1,2,NA,1,2,1,2,2,2)
g <- as.genotype.allele.count(x, alleles=c("C","T") )
g

}
\keyword{ misc }
%\keyword{genetics}

\eof
\name{gregorius}
\alias{gregorius}
\title{Probability of Observing All Alleles with a Given Frequency in a
  Sample of a Specified Size.}
\description{
  Probability of observing all alleles with a given frequency in a
  sample of a specified size.
}
\usage{
gregorius(freq, N, missprob, tol = 1e-10, maxN = 10000, maxiter=100, showiter = FALSE)
}
\arguments{
  \item{freq}{(Minimum) Allele frequency (required)}
  \item{N}{Number of sampled genotypes}
  \item{missprob}{Desired maximum probability of failing to observe an allele.}
  \item{tol}{Omit computation for terms which contribute less than this value.}
  \item{maxN}{Largest value to consider when searching for N.}
  \item{maxiter}{Maximum number of iterations to use when searching for N.}
  \item{showiter}{Boolean flag indicating whether to show the iterations
    performed when searching for N.}
}
\details{
  If \code{freq} and \code{N} are provided, but \code{missprob} is omitted,
  this function computes the probability of failing to observe all alleles
  with true underlying frequency \code{freq} when \code{N} diploid
  genotypes are sampled.  This is accomplished using the sum provided in
  Corollary 2 of Gregorius (1980), omitting terms which contribute less
  than \code{tol} to the result.

  When \code{freq} and \code{missprob} are provided, but \code{N} is
  omitted, a binary search on the range of [1,\code{maxN}] is performed
  to locate the smallest sample size, \code{N}, for which the
  probability of failing to observe all alleles with true
  underlying frequency \code{freq} is at most \code{missprob}.  In this
  case, \code{maxiter} specifies the largest number of iterations to use
  in the binary search, and \code{showiter} controls whether the
  iterations of the search are displayed.
}
\value{
  A list containing the following values:
  \item{call}{   Function call used to generate this object.}
  \item{method}{ One of the strings, "Compute missprob given N and freq",
    or "Determine minimal N given missprob and freq", indicating which
    type of computation was performed.}
  \item{retval$freq}{ Specified allele frequency.}
  \item{retval$N}{    Specified or computed sample size. }
  \item{retval$missprob}{ Computed probability of failing to observe all
  of the alleles with frequency \code{freq}. }
}
\references{
  Gregorius, H.R. 1980. The probability of losing an allele when
  diploid genotypes are sampled.  Biometrics 36, 643-652.
}
\note{
  This code produces sample sizes that are slightly larger than those
  given in table 1 of Gregorius (1980).  This appears to be due to
  rounding of the computed \code{missprob}s by the authors of that
  paper.
  }
    
\author{ Code submitted by David Duffy \email{davidD@qumr.edu.au},
  substantially enhanced by Gregory R. Warnes
  \email{gregory\_r\_warnes@groton.pfizer.com}. }
%\seealso{ ~~objects to SEE ALSO as \code{\link{~~fun~~}}, ~~~ }
\examples{

# Compute the probability of missing an allele with frequency 0.15 when
# 20 genotypes are sampled:
gregorius(freq=0.15, N=20)

# Determine what sample size is required to observe all alleles with true
# frequency 0.15 with probability 0.95
gregorius(freq=0.15, missprob=1-0.95)


}
\keyword{misc}

\eof
\name{homozygote}
\alias{homozygote}
\alias{heterozygote}
\alias{carrier}
\alias{carrier.genotype}
\alias{allele}
\alias{allele.count}
\alias{allele.names}
\title{Extract Features of Genotype objects}
\description{
  \code{homozygote} creates a vector of logicals that are true when the
  alleles of the corresponding observation are identical.
  
  \code{heterozygote} creates a vector of logicals that are true when the
  alleles of the corresponding observation differ.
  
  \code{carrier} creates a logical vector or matrix of logicals
  indicating whether the specified alleles are present.
  
  \code{allele.count} returns the number of copies of the specified
  alleles carried by each observation.
  
  \code{allele} extracts the specified allele(s) as a character vector
  or a 2 column matrix.

  \code{allele.names} extracts the set of allele names.
 }
\usage{
homozygote(x,  allele.name, ...)
heterozygote(x, allele.name, ...)
carrier(x, allele.name, ...)
\method{carrier}{genotype}(x, allele.name=allele.names(x),
        any=!missing(allele.name), na.rm=FALSE, ...)
allele.count(x, allele.name=allele.names(x),any=!missing(allele.name),
             na.rm=FALSE)
allele(x, which=c(1,2) )
allele.names(x)
}
\arguments{
  \item{x}{ \code{genotype} object }
  \item{\dots}{ optional parameters (ignored) }
  \item{allele.name}{ character value or vector of allele names}
  \item{any}{ logical value.  When \code{TRUE}, a single count or
    indicator is returned by combining the results for all of the
    elements of \code{allele.name}. If \code{FALSE} separate counts or
    indicators should be returned for each element of
    \code{allele.name}.  Defaults to \code{FALSE} if \code{allele.name} is
    missing. Otherwise defaults to \code{TRUE}.}
  \item{na.rm}{ logical value indicating whether to remove missing
    values.  When true, any \code{NA} values will be replaced by
    \code{0} or \code{FALSE} as appropriate.  Defaults to \code{FALSE}.}
  \item{which}{ selects which allele to return. For first allele use
    \code{1}.  For second allele use \code{2}.  For both (the default)
    use \code{c(1,2)}.}
      
}

\details{
  When the \code{allele.name} argument is given, heterozygote and
  homozygote return \code{TRUE} if \emph{exactly} one or both alleles,
  respectively, match the specified allele.name.
}

\value{
  \code{homozygote} and \code{heterozygote} return a vector of
  logicals.

  \code{carrier} returns a logical vector if only one allele is
  specified, or if \code{any} is \code{TRUE}.  Otherwise, it returns
  a matrix of logicals with one row for each element of \code{allele.name}.

  
  \code{allele.count} returns a  vector of counts if only one allele is
  specified, or if \code{any} is \code{TRUE}.  Otherwise, it returns
  a matrix of counts with one row for each element of \code{allele.name}.
  
  \code{allele} returns a character vector when one allele is
  specified.  When 2 alleles are specified, it returns a 2 column
  character matrix.

  \code{allele.names} returns a character vector containing the set of
  allele names.
  
  
}
\author{ Gregory R. Warnes \email{Gregory\_R\_Warnes@groton.pfizer.com} }

\seealso{
  \code{\link{genotype}},
  \code{\link{HWE.test}},
%  \code{\link{allele}},
%  \code{\link{homozygote}},
%  \code{\link{heterozygote}}, 
%  \code{\link{carrier}},
  \code{\link{summary.genotype}},
  %  \code{\link{allele.count}}
  \code{\link{locus}},
  \code{\link{gene}},
  \code{\link{marker}}
  }


  \examples{

example.data   <- c("D/D","D/I","D/D","I/I","D/D","D/D","D/D","D/D","I/I","")
g1  <- genotype(example.data)
g1

heterozygote(g1)
homozygote(g1)

carrier(g1,"D")
carrier(g1,"D",na.rm=TRUE)

# get count of one allele 
allele.count(g1,"D")

# get count of each allele
allele.count(g1)  # equivalent to
allele.count(g1, c("D","I"), any=FALSE)

# get combined count for both alleles
allele.count(g1,c("I","D"))

# get second allele
allele(g1,2)

# get both alleles
allele(g1)

}
\keyword{ misc }
%\keyword{genetics}%-- one or more ...

\eof
\name{library.pos}
\title{Loading and Listing of Packages (backported from 1.8.0)}
\alias{library.pos}
\description{
  \code{library.pos} loads add-on packages, allowing specification of
  position in the search path.
}
\usage{
library.pos(package, help, pos = 2, lib.loc = NULL, character.only = FALSE,
            logical.return = FALSE, warn.conflicts = TRUE,
            keep.source = getOption("keep.source.pkgs"),
            verbose = getOption("verbose"), version)
}
\arguments{
  \item{package, help}{\link{name} or character string giving the name
    of a package.}
  \item{pos}{the position on the search list at which to attach the
    loaded package. Note that \code{.First.lib} may attach other
    packages, and \code{pos} is computed \emph{after} \code{.First.lib}
    has been run.}
  \item{lib.loc}{a character vector describing the location of \R
    library trees to search through, or \code{NULL}.  The default value
    of \code{NULL} corresponds to all libraries currently known.}
  \item{character.only}{a logical indicating whether \code{package} or
    \code{help} can be assumed to be character strings.}
  \item{version}{A character string denoting a version number of the 
     package to be loaded.  If no version is given, a suitable default
     is chosen.}
  \item{logical.return}{logical.  If it is \code{TRUE},  \code{FALSE} or
      \code{TRUE} is returned to indicate success.}
  \item{warn.conflicts}{logical.  If \code{TRUE}, warnings are
    printed about \code{\link{conflicts}} from attaching the new
    package, unless that package contains an object \code{.conflicts.OK}.}
  \item{keep.source}{logical.  If \code{TRUE}, functions ``keep their
    source'' including comments, see argument \code{keep.source} to
    \code{\link{options}}.}
  \item{verbose}{a logical.  If \code{TRUE}, additional diagnostics are
    printed.}
}
\details{

  This function is backported from the R 1.8.0 development tree and is 
  only defined by gregmisc if this package is loaded in an earlier
  version of R.  In 1.8.0, this function is merely an alias for the
  standard \code{library} function.  The only important difference is
  the addition of the "pos" argument which allows an imported package to
  be loaded further down the search path than the top entry.

  See the documentation for \code{library} for more details.
}
\seealso{
  \code{\link[base]{library}}
}
\keyword{data}

\eof
% $Id: locus.Rd,v 1.10 2004/05/25 19:40:02 warnesgr Exp $
%
% %Log$
%

\name{locus}
\alias{locus}
\alias{gene}
\alias{marker}
\alias{is.gene}
\alias{is.locus}
\alias{is.marker}
\alias{print.gene}
\alias{print.locus}
\alias{print.marker}
\alias{as.character.locus}
\alias{as.character.gene}
\alias{as.character.marker}
\alias{getlocus}
\alias{getmarker}
\alias{getgene}
\alias{locus<-}
\alias{marker<-}
\alias{gene<-}


%- Also NEED an `\alias' for EACH other topic documented here.
\title{ Create and Manipulate Locus, Gene, and Marker Objects}
\description{
  \code{locus}, \code{gene}, and \code{marker} create objects to store
  information, respectively, about genetic loci, genes, and markers.

  \code{is.locus}, \code{is.gene}, and \code{is.marker} test whether an
  object is a member of the respective class.

  \code{as.character.locus}, \code{as.character.gene},
  \code{as.character.marker} return a character string containing a
  compact encoding of the object.
  
  \code{getlocus}, \code{getgene}, \code{getmarker} extract locus data
  (if present) from another object.

  \code{locus<-}, \code{marker<-}, and \code{gene<-} adds locus data to
  an object.

}
\usage{
  locus(name, chromosome, arm=c("p", "q", "long", "short", NA),
        index.start, index.end=NULL)

  gene(name, chromosome, arm=c("p", "q", "long", "short"),
       index.start, index.end=NULL)

  marker(name, type, locus.name, bp.start, bp.end = NULL,
         relative.to = NULL, ...)

  is.locus(x)

  is.gene(x)

  is.marker(x)

  as.character.locus(x, ...)

  as.character.gene(x, ...)

  as.character.marker(x, ...)

  getlocus(x, ...)

  locus(x) <- value

  marker(x) <- value

  gene(x) <- value

}
\arguments{
  \item{name}{character string giving locus, gene, or marker name}
  \item{chromosome}{integer specifying chromosome number (1:23 for humans).}
  \item{arm}{character indicating long or short arm of the chromosome.
    Long is specified by "long" or "p".  Short is specified by
    "short" or "q".}
  \item{index.start}{integer specifying location of start of locus or
    gene on the chromosome.  }
  \item{index.end}{optional integer specifying location of end of locus or
    gene on the chromosome.  }
  \item{type}{character string indicating marker type, e.g. "SNP"}
  \item{locus.name}{either a character string giving the name of the
    locus or gene (other details may be specified using \code{...}) or a
    \code{locus} or \code{gene} object.}
  \item{bp.start}{start location of marker, in base pairs}
  \item{bp.end}{end location of marker, in base pairs (optional)}
  \item{relative.to}{location (optional) from which \code{bp.start} and
    \code{bp.end} are calculated. }
  \item{...}{parameters for \code{locus} used to fill in additional
    details on the locus or gene within which the marker is located. }
  \item{x}{an object of class \code{locus}, \code{gene}, or
    \code{marker}, or (for \code{getlocus}, \code{locus<-},
    \code{marker<-}, and \code{gene<-}) an object that may contain a locus
    attribute or field, notably a \code{genotype} object.}
  \item{value}{\code{locus}, \code{marker}, or \code{gene} object}
}
%\details{
%  ~~ If necessary, more details than the __description__  above ~~
%}
\value{
  Objects of class \code{locus} and \code{gene} are lists with the
  elements:
  \item{name}{character string giving locus, gene, or marker name}
  \item{chromosome}{integer specifying chromosome number (1:23 for humans).}
  \item{arm}{character indicating long or short arm of the chromosome.
    Long is specified by "long" or "p".  Short is specified by
    "short" or "q".}
  \item{index.start}{integer specifying location of start of locus or
    gene on the chromosome.  }
  \item{index.end}{optional integer specifying location of end of locus or
    gene on the chromosome.  }

  Objects of class \code{marker} add the additional fields:
  \item{marker.name}{character string giving the name of the marker}
  \item{bp.start}{start location of marker, in base pairs}
  \item{bp.end}{end location of marker, in base pairs (optional)}
  \item{relative.to}{location (optional) from which \code{bp.start} and
    \code{bp.end} are calculated. }
    
}
%\references{ ~put references to the literature/web site here ~ }
\author{Gregory R. Warnes \email{Gregory\_R\_Warnes@groton.pfizer.com} }

\seealso{
  \code{\link{genotype}}
}

\examples{
ar2  <- gene("AR2",chromosome=7,arm="q",index.start=35)
ar2

par  <- locus(name="AR2 Psedogene", 
              chromosome=1, 
              arm="q",
              index.start=32,
              index.end=42)
par

c109t  <- marker(name="C-109T",
                 type="SNP",
                 locus.name="AR2",
                 chromosome=7, 
                 arm="q", 
                 index.start=35,
                 bp.start=-109,
                 relative.to="start of coding region")
c109t

c109t  <- marker(name="C-109T",
                 type="SNP",
                 locus=ar2,
                 bp.start=-109,
                 relative.to="start of coding region")
c109t




example.data   <- c("D/D","D/I","D/D","I/I","D/D",
                    "D/D","D/D","D/D","I/I","")
g1  <- genotype(example.data, locus=ar2)
g1

getlocus(g1)

summary(g1)
HWE.test(g1)

g2  <- genotype(example.data, locus=c109t)
summary(g2)

getlocus(g2)

heterozygote(g2)
homozygote(g1)

allele(g1,1)

carrier(g1,"I")

heterozygote(g2)
}
\keyword{ misc }
%\keyword{genetics}%-- one or more ...

\eof
% $Id: makeGenotypes.Rd,v 1.5 2004/05/25 19:40:02 warnesgr Exp $
%
% $Log: makeGenotypes.Rd,v $
% Revision 1.5  2004/05/25 19:40:02  warnesgr
% Many fixex.
%
% Revision 1.4  2003/05/27 18:45:52  warnesgr
%
% - Fix typos, update arguments to match changes to code
%
% Revision 1.3  2003/05/20 16:46:09  warnesgr
%
% - Reduce stringency for considering a variable a genotype.
%
% Revision 1.2  2003/05/16 18:39:49  warnesgr
%
% - Updated to version 0.7.0
% - Made changes to pass R CMD check
%

\name{makeGenotypes}
\alias{makeGenotypes}
\alias{makeHaplotypes}
\title{Convert columns in a dataframe to genotypes or haplotypes}
\description{ 
  Convert columns in a dataframe to genotypes or haplotypes.
  }
\usage{
makeGenotypes(data, convert, sep = "/", tol = 0.5, ..., method=as.genotype)
makeHaplotypes(data, convert, sep = "/", tol = 0.9, ...)
}
\arguments{
  \item{data}{Dataframe containing columns to be converted}
  \item{convert}{Vector or list of pairs specifying which columns
    contain genotype/haplotype data.   See below for details.}
  \item{sep}{Genotype separator}
  \item{tol}{See below.}
  \item{\dots}{Optional arguments to as.genotype function}
  \item{method}{For internal use only.}
}
\details{
  The functions makeGenotypes and makeHaplotypes allow the conversion of
  all of the genetic variables in a dataset to genotypes or haplotypes
  in a single step.

  The parameter \code{convert} may be missing, a vector of
  column names, indexes or true/false indicators, or a list of column
  name or index pairs.

  When the argument \code{convert} is not provided, the function will
  look for columns where at least \code{tol}*100\% of the records
  contain the separator character \code{sep} ('/' by default).  These
  columns will then be assumed to contain both genotype/haplotype alleles
  and will be converted in-place to genotype variables.
  
  When the argument \code{convert} is a vector of column names, indexes
  or true/false indicators, the corresponding columns will be assumed
  to contain both genotype/haplotype alleles and will be converted
  in-place to genotype variables.

  When the argument \code{convert} is a list containing column name or
  index pairs, the two elements of each pair will be assumed to contain the
  individual alleles of a genotype/haplotype.  The first column
  specified in each pair will be replaced with the new
  genotype/haplotype variable and the column will be renamed to
  name1 + sep + name2.
}
\value{
  Dataframe containing converted genotype/haplotype variables. All other
  variables will be unchanged.
}
\author{ Gregory R. Warnes \email{Gregory\_R\_Warnes@groton.pfizer.com}
  }
\seealso{
  \code{\link{genotype}}
}
\examples{
\dontrun{
# common case
data <- read.csv(file="genotype_data.csv")
data <- makeGenotypes(data)
}


# Create a test data set where there are several genotypes in columns
# of the form "A/T".
test1 <- data.frame(Tmt=sample(c("Control","Trt1","Trt2"),20, replace=TRUE),
                G1=sample(c("A/T","T/T","T/A",NA),20, replace=TRUE),
                N1=rnorm(20),
                I1=sample(1:100,20,replace=TRUE),
                G2=paste(sample(c("134","138","140","142","146"),20,
                                replace=TRUE),
                         sample(c("134","138","140","142","146"),20,
                                replace=TRUE),
                         sep=" / "),
                G3=sample(c("A /T","T /T","T /A"),20, replace=TRUE),
                comment=sample(c("Possible Bad Data/Lab Error",""),20,
                               rep=TRUE)
                )
test1

# now automatically convert genotype columns
geno1 <- makeGenotypes(test1)
geno1

# Create a test data set where there are several haplotypes with alleles
# in adjacent columns.
test2 <- data.frame(Tmt=sample(c("Control","Trt1","Trt2"),20, replace=TRUE),
                    G1.1=sample(c("A","T",NA),20, replace=TRUE),
                    G1.2=sample(c("A","T",NA),20, replace=TRUE),
                    N1=rnorm(20),
                    I1=sample(1:100,20,replace=TRUE),
                    G2.1=sample(c("134","138","140","142","146"),20,
                                replace=TRUE),
                    G2.2=sample(c("134","138","140","142","146"),20,
                                replace=TRUE),
                    G3.1=sample(c("A ","T ","T "),20, replace=TRUE),
                    G3.2=sample(c("A ","T ","T "),20, replace=TRUE),
                    comment=sample(c("Possible Bad Data/Lab Error",""),20,
                                   rep=TRUE)
                   ) 
test2

# specify the locations of the columns to be paired for haplotypes
makeHaplotypes(test2, convert=list(c("G1.1","G1.2"),6:7,8:9))
}
\keyword{ misc }

\eof
\name{print.LD}
\alias{print.LD}
\alias{print.LD.data.frame}
\alias{summary.LD.data.frame}
\alias{print.summary.LD.data.frame}
\alias{plot.LD.data.frame}
\alias{LDtable}
\alias{LDplot}
\title{Textual and graphical display of linkage disequilibrium (LD) objects}
\description{
  Textual and graphical display of linkage disequilibrium (LD) objects
}
\usage{
print.LD(x, digits = getOption("digits"), ...)
print.LD.data.frame(x, ...)

summary.LD.data.frame(object, digits = getOption("digits"),
                      which = c("D", "D'", "r", "X^2", "P-value", "n", " "),
                      rowsep, show.all = FALSE, ...)
print.summary.LD.data.frame(x, digits = getOption("digits"), ...) 

plot.LD.data.frame(x,digits=3, colorcut=c(0,0.01, 0.025, 0.5, 0.1, 1),
                   colors=heat.colors(length(colorcut)), textcol="black",
                   marker, which="D'", distance,  ...)


LDtable(x, colorcut=c(0,0.01, 0.025, 0.5, 0.1, 1),
        colors=heat.colors(length(colorcut)), textcol="black",
        digits=3, show.all=FALSE, which=c("D", "D'", "r", "X^2",
        "P-value", "n"), colorize="P-value", cex, ...)

LDplot(x, digits=3, marker, distance, which=c("D", "D'", "r", "X^2",
       "P-value", "n", " "), ... ) 
}
\arguments{
  \item{x,object}{LD or LD.data.frame object}
  \item{digits}{Number of significant digits to display}
  \item{which}{Name(s) of LD information items to be displayed}
  \item{rowsep}{Separator between rows of data, use \code{NULL} for no
    separator.} 
  \item{colorcut}{P-value cutoff points for colorizing LDtable}
  \item{colors}{Colors for each P-value cutoff given in \code{colorcut} for
    LDtable}
  \item{textcol}{Color for text labels for LDtable}
  \item{marker}{Marker used as 'comparator' on LDplot.  If
    omitted separate lines for each marker will be displayed}
  \item{distance}{Marker location, used for locating markers on
    LDplot.}
  \item{show.all}{If TRUE, show all rows/columns of matrix. Otherwise
    omit completely blank rows/columns.}
  \item{colorize}{LD parameter used for determining table cell colors}
  \item{cex}{Scaling factor for table text. If absent, text will be
    scaled to fit within the table cells.}
  \item{\dots}{Optional arguments (\code{plot.LD.data.frame} passes
    these to \code{LDtable} and \code{LDplot})}
}
%\details{
%}
\value{
  None.
}
%\references{ ~put references to the literature/web site here ~ }
\author{ Gregory R. Warnes \email{gregory\_r\_warnes@groton.pfizer.com} }
\seealso{ \code{LD}, \code{genotype}, \code{HWE.test} }
\examples{


g1 <- genotype( c('T/A',    NA, 'T/T',    NA, 'T/A',    NA, 'T/T', 'T/A',
                  'T/T', 'T/T', 'T/A', 'A/A', 'T/T', 'T/A', 'T/A', 'T/T',
                     NA, 'T/A', 'T/A',   NA) )

g2 <- genotype( c('C/A', 'C/A', 'C/C', 'C/A', 'C/C', 'C/A', 'C/A', 'C/A',
                  'C/A', 'C/C', 'C/A', 'A/A', 'C/A', 'A/A', 'C/A', 'C/C',
                  'C/A', 'C/A', 'C/A', 'A/A') )


g3 <- genotype( c('T/A', 'T/A', 'T/T', 'T/A', 'T/T', 'T/A', 'T/A', 'T/A',
                  'T/A', 'T/T', 'T/A', 'T/T', 'T/A', 'T/A', 'T/A', 'T/T',
                  'T/A', 'T/A', 'T/A', 'T/T') )
data <- makeGenotypes(data.frame(g1,g2,g3))

# Compute & display  LD for one marker pair
ld <- LD(g1,g2)
print(ld)

# Compute LD table for all 3 genotypes
ldt <- LD(data)

# display the results
print(ldt)                               # textual display
LDtable(ldt)                            # graphical color-coded table
LDplot(ldt, distance=c(124, 834, 927))  # LD plot vs distance

# more markers makes prettier plots!
data <- list()
nobs <- 1000
ngene <- 20
s <- seq(0,1,length=ngene)
a1 <- a2 <- matrix("", nrow=nobs, ncol=ngene)
for(i in 1:length(s) )
{

  rallele <- function(p) sample( c("A","T"), 1, p=c(p, 1-p))

  if(i==1)
    {
      a1[,i] <- sample( c("A","T"), 1000, p=c(0.5,0.5), replace=TRUE)
      a2[,i] <- sample( c("A","T"), 1000, p=c(0.5,0.5), replace=TRUE)
    }
  else
    {
      p1 <- pmax( pmin( 0.25 + s[i] * as.numeric(a1[,i-1]=="A"),1 ), 0 )
      p2 <- pmax( pmin( 0.25 + s[i] * as.numeric(a2[,i-1]=="A"),1 ), 0 )
      a1[,i] <- sapply(p1, rallele )
      a2[,i] <- sapply(p2, rallele )
    }

  data[[paste("G",i,sep="")]] <- genotype(a1[,i],a2[,i])
}
data <- data.frame(data)
data <- makeGenotypes(data)

ldt <- LD(data)
plot(ldt, digits=2, marker=19) # do LDtable & LDplot in a single
                               # graphics window
}
\keyword{misc}


\eof
% $Id: summary.genotype.Rd,v 1.12 2003/05/27 18:45:52 warnesgr Exp $
%
% %Log$
%

\name{summary.genotype}

\alias{summary.genotype}
\alias{print.summary.genotype}

\title{ Allele and Genotype Frequency from a Genotype or
  Haplotype Object}

\description{
    \code{summary.genotype} creates an object containing allele and
    genotype frequency from a \code{genotype} or \code{haplotype}
    object.  \code{print.summary.genotype} displays a
    \code{summary.genotype} object.
}

\usage{
  summary.genotype(object, ..., maxsum)
  print.summary.genotype(x,...,round=2)
}

\arguments{
  
  \item{object, x}{ an object of class \code{genotype} or \code{haplotype} (for
    \code{summary.genotype}) or an object of class
    \code{summary.genotype} (for \code{print.summary.genotype}) }
  \item{\dots}{ optional parameters.  Ignored by \code{summary.genotype},
    passed to \code{print.matrix} by \code{print.summary.genotype}.}
  \item{maxsum}{ specifying any value for the parameter
    maxsum will cause \code{summary.genotype} to fall back to
    \code{summary.factor}.}
  \item{round}{ number of digits to use when displaying proportions.}
 }

\details{
 Specifying any value for the parameter \code{maxsum} will cause fallback
 to \code{summary.factor}.  This is so that the function
 \code{summary.data.frame} will give reasonable output when it contains a
 genotype column.  (Hopefully we can figure out something better to do
 in this case.)
}


\value{
  The returned value of \code{summary.genotype} is an object of class
  \code{summary.genotype} which
  is a list with the following components:
  
  \item{locus }{locus information field (if present) from \code{x}.}

  \item{allele.names}{ vector of allele names }
  
  \item{allele.freq }{
    A two column matrix with one row for each allele, plus one row for
    \code{NA} values (if present).  The first column, \code{Count},
    contains the frequency of the corresponding allele value.  The
    second column, \code{Proportion}, contains the fraction of alleles
    with the corresponding allele value.  Note each observation contains
    two alleles, thus the \code{Count} field sums to twice the number of
    observations.
  }
  
  \item{genotype.freq}{
    A two column matrix with one row for each genotype, plus one row for
    \code{NA} values (if present). The first column, \code{Count}, contains the
    frequency of the corresponding genotype.  The second column,
    \code{Proportion}, contains the fraction of genotypes with the
    corresponding value.
  }

  \code{print.summary.genotype} silently returns the object \code{x}.
}
%\references{ ~put references to the literature/web site here ~ }
\author{ Gregory R. Warnes \email{Gregory\_R\_Warnes@groton.pfizer.com} }
%\note{ ~~further notes~~ }
\seealso{
  \code{\link{genotype}},
  \code{\link{HWE.test}},
  \code{\link{allele}},
  \code{\link{homozygote}},
  \code{\link{heterozygote}}, 
  \code{\link{carrier}},
%  \code{\link{summary.genotype}},
  \code{\link{allele.count}},
  \code{\link{locus}},
  \code{\link{gene}},
  \code{\link{marker}}
  }

\examples{

example.data   <- c("D/D","D/I","D/D","I/I","D/D",
                    "D/D","D/D","D/D","I/I","")
g1  <- genotype(example.data)
g1

summary(g1)
}
\keyword{ misc }
%\keyword{genetics}

\eof
# $Id: undocumented.Rd,v 1.7 2004/05/25 19:40:02 warnesgr Exp $
#
\name{undocumented}
\alias{ci.balance}
\alias{as.factor}
\alias{allele.count.2.genotype}
\alias{as.factor.allele.genotype}
\alias{as.factor.default}
\alias{as.factor.genotype}
\alias{shortsummary.genotype}
\alias{geno.as.array}
\alias{mknum}
\alias{hap}
\alias{hapshuffle}
\alias{hapenum}
\alias{hapfreq}
\alias{hapmcmc}
\alias{mourant}
\alias{hapambig}
\title{Undocumented functions}
\description{
  These functions are undocumented.  Some are internal and not intended
  for direct use.  Some are not yet ready for end users.  Others simply
  haven't been documented yet.
}
\usage{
}
\author{Gregory R. Warnes}
\keyword{ misc }
%\keyword{genetics}


\eof
\name{write.pop.file}
\alias{write.pop.file}
\alias{write.pedigree.file}
\alias{write.marker.file}
%- Also NEED an '\alias' for EACH other topic documented here.
\title{Create genetics data files}
\description{
  \code{write.pop.file} creates a 'pop' data file, as used by the
  GenePop (\url{http://wbiomed.curtin.edu.au/genepop/}) and LinkDos
  (\url{http://wbiomed.curtin.edu.au/genepop/linkdos.html}) software
  packages.

  \code{write.pedigree.file} creates a 'pedigree' data file, as used
  by the QTDT software package
  (\url{http://www.sph.umich.edu/statgen/abecasis/QTDT/}).
  
  \code{write.marker.file} creates a 'marker' data file, as used by
  the QTDT software package 
  (\url{http://www.sph.umich.edu/statgen/abecasis/QTDT/}).
}
\usage{
write.pop.file(data, file = "", digits = 2, description = "Data from R")
write.pedigree.file(data, family, pid, father, mother, sex,
                    file="pedigree.txt")
write.marker.file(data, location, file="marker.txt")
}
\arguments{
  \item{data}{Data frame containing genotype objects to be exported}
  \item{file}{Output filename}
  \item{digits}{Number of digits to use in numbering genotypes, either 2
    or 3.}
  \item{description}{Description to use as the first line of the 'pop'
    file.}
  \item{family, pid, father, mother}{Vectors of family, individual,
    father, and mother IDs, respectively.}
  \item{sex}{Vector giving the sex of the individual (1=Male, 2=Female)}
  \item{location}{Location of the marker relative to the gene of
    interest, in base pairs.}
}
\details{
  The format of 'pop' files is documented at
  \url{http://wbiomed.curtin.edu.au/genepop/help_input.html}, the format
  of 'pedigree' files is documented at
  \url{http://www.sph.umich.edu/csg/abecasis/GOLD/docs/pedigree.html},
  and the format of 'marker' files is documented at
  \url{http://www.sph.umich.edu/csg/abecasis/GOLD/docs/map.html}.
}
\value{
  No return value.
}
\author{Gregory R. Warnes \email{gregory\_r\_warnes@groton.pfizer.com}}
\seealso{\code{\link{write.table}}}
\examples{
  # TBA
}
\keyword{IO}

\eof
