\name{Cs}
\alias{Cs}
\title{
Character strings from unquoted names
}
\description{
Makes a vector of character strings from a list of valid S names
}
\usage{
Cs(\dots)
}
\arguments{
\item{\dots}{
any number of names separated by commas
}}
\value{
character string vector
}
\seealso{
\code{\link{sys.frame}}, \code{\link{deparse}}
}
\examples{
Cs(a,cat,dog)
# subset.data.frame <- dataframe[,Cs(age,sex,race,bloodpressure,height)]
}
\keyword{character}
\keyword{utilities}
% Converted by Sd2Rd version 1.21.

\eof
\name{Hmisc-internal}
\title{Internal Hmisc functions}
\alias{dataDensityString}
\alias{as.double.Cbind}
\alias{as.numeric.Cbind}
\alias{format.sep}
\alias{as.category}
\alias{as.data.frame.impute}
\alias{as.data.frame.roundN}
\alias{as.data.frame.special.miss}
\alias{as.data.frame.Surv}
\alias{as.data.frame.substi}
\alias{substi}
\alias{substi.source}
\alias{[.substi}
\alias{[.factor}
\alias{bpx}
\alias{do}
\alias{dot.chart}
\alias{existsFunction}
\alias{expr.tree}
\alias{fillin}
\alias{formatCats}
\alias{formatCons}
\alias{formatDateTime}
\alias{formatTestStats}
\alias{format.timePOSIXt}
\alias{ftuss}
\alias{ftupwr}
\alias{getFunction}
\alias{get2rowHeads}
\alias{groupn}
\alias{gView}
\alias{is.category}
\alias{is.present}
\alias{mask}
\alias{nafitted.delete}
\alias{na.include}
\alias{Names2names}
\alias{naprint.keep}
\alias{naresid.keep}
\alias{naprint.delete}
\alias{naresid.delete}
\alias{oldUnclass}
\alias{oPar}
\alias{optionsCmds}
\alias{ordGridFun}
\alias{parGrid}
\alias{pasteFit}
\alias{print.substi}
\alias{print.timePOSIXt}
\alias{read.xportDataload}
\alias{recode}
\alias{rowsumFast}
\alias{sas.get.macro}
\alias{setParNro}
\alias{stepfun.eval}
\alias{[.terms}
\alias{termsDrop}
\alias{testDateTime}
\alias{uncbind}
\alias{untangle.specials}
\alias{var.inner}
\alias{xInch}
\alias{xySortNoDupNoNA}
\alias{yInch}
\alias{zoom}
\description{Internal Hmisc functions.}
\details{These are not to be called by the user or are undocumented.}
\keyword{internal}

\eof
\name{Lag}
\alias{Lag}
\title{Lag a Numeric, Character, or Factor Vector}
\description{
Shifts a vector \code{shift} elements later.  Character or factor
variables are padded with \code{""}, numerics with \code{NA}.
}
\usage{
Lag(x, shift = 1)
}
\arguments{
  \item{x}{a vector}
  \item{shift}{positive integer specifying the number of observations to
	be shifted to the right}
}
\details{
Attributes of the original object are carried along to the new lagged
one, but factor vectors are converted to character.
}
\value{
a vector like \code{x}
}
\author{Frank Harrell}
\seealso{\code{\link{lag}}}
\examples{
Lag(1:5,2)
Lag(letters[1:4],2)
Lag(factor(letters[1:4]),2)
# Find which observations are the first for a given subject
id <- c('a','a','b','b','b','c')
id != Lag(id)
!duplicated(id)
}
\keyword{manip}


\eof
\name{Misc}
\alias{confbar}
\alias{james.stein}
\alias{km.quick}
\alias{lm.fit.qr.bare}
\alias{matxv}
\alias{nomiss}
\alias{outerText}
\alias{sepUnitsTrans}
\alias{trap.rule}
\alias{trellis.strip.blank}
\alias{under.unix}
\alias{.R.}
\alias{.SV4.}
\alias{unPaste}
\alias{whichClosest}
\alias{whichClosePW}
\alias{xless}
\title{Miscellaneous Functions}
\description{
  This documents miscellaneous small functions in Hmisc that may be of
  interest to users.

  \code{confbar} draws multi-level confidence bars using small rectangles
  that may be of different colors.

  \code{james.stein} computes James-Stein shrunken estimates of cell
  means given a response variable (which may be binary) and a grouping
  indicator.

  \code{km.quick} provides a fast way to invoke \code{survfit.km} in the
  \code{survival} package to get Kaplan-Meier estimates for a
  single stratum for a vector of time points (if \code{times} is given) or to
  get a vector of survival time quantiles (if \code{q} is given).

  \code{lm.fit.qr.bare} is a fast stripped-down function for computing
  regression coefficients, residuals, \eqn{R^2}, and fitted values.  It
  uses the Fortran routine \code{dqrls}.

  \code{matxv} multiplies a matrix by a vector, handling automatic
  addition of intercepts if the matrix does not have a column of ones.
  If the first argument is not a matrix, it will be converted to one.

  \code{nomiss} returns a data frame (if its argument is one) with rows
  corresponding to \code{NA}s removed, or it returns a matrix with rows
  with any element missing removed.

  \code{outerText} uses \code{text()} to put text strings in left or
  right margins. It temporarily sets \code{par(xpd=NA)} if using \R.

  \code{sepUnitsTrans} converts character vectors containing values such
  as \code{c("3 days","3day","4month","2 years","7")} to numeric vectors
  (here \code{c(3,3,122,730,7)}) in a flexible fashion.  The user can specify a
  vector of units of measurements and conversion factors.  The units
  with a conversion factor of \code{1} are taken as the target units,
  and if those units are present in the character strings they are
  ignored.  The target units are added to the resulting vector as the
  \code{"units"} attribute.
  
  \code{trap.rule} computes the area under a curve using the trapezoidal
  rule, assuming \code{x} is sorted.

  \code{trellis.strip.blank} sets up Trellis or Lattice graphs to have a
  clear background on the strips for panel labels.
  
  \code{under.unix} is a scalar logical value that is \code{TRUE} if you
  are running Linux or Unix.

  \code{.R.} is a logical value set to \code{TRUE} if running \R,
  \code{FALSE} for S-Plus.

  \code{.SV4.} is a logical value set to \code{TRUE} if running version
  4 of the S language under S-Plus, \code{FALSE} otherwise.
  
  \code{unPaste} provides a version of the S-Plus \code{unpaste} that
  works for \R and S-Plus.

  \code{whichClosePW} is a very fast function using weighted multinomial
  sampling to determine which element of a vector is "closest" to each
  element of another vector.  \code{whichClosest} quickly finds the closest
  element without any randomness.

  \code{xless} is a function for Linux/Unix users to invoke the system
  \code{xless} command to pop up a window to display the result of
  \code{print}ing an object.
}
\usage{
confbar(at, est, se, width, q = c(0.7, 0.8, 0.9, 0.95, 0.99), 
        col = if (.R.) gray(c(0, 0.25, 0.5, 0.75, 1)) else
              if (under.unix) c(1, 0.8, 0.5, 0.2, 0.065) else
              c(1, 4, 3, 2, 5),
        type = c("v", "h"), labels = TRUE, ticks = FALSE,
        cex = 0.5, side = "l", lwd = 5, clip = c(-1e+30, 1e+30),
        fun = function(x) x,
        qfun = function(x) ifelse(x == 0.5, qnorm(x),
                            ifelse(x < 0.5, qnorm(x/2),
                            qnorm((1 +  x)/2))))
james.stein(y, group)
km.quick(S, times, q)
lm.fit.qr.bare(x, y, tolerance, intercept=TRUE, xpxi=FALSE)
matxv(a, b, kint=1)
nomiss(x)
outerText(string, y, setAside=string[1], side=4, space=1,
          adj=1, cex=par('cex'))
sepUnitsTrans(x, conversion=c(day=1, month=365.25/12, year=365.25),
              round=FALSE, digits=0)
trap.rule(x, y)
trellis.strip.blank()
unPaste(str, sep="/", extended=FALSE)
whichClosest(x, w)
whichClosePW(x, w, f=0.2)
xless(x, \dots, title)
}
\arguments{
  \item{at}{x-coordinate for vertical confidence intervals, y-coordinate
	for horizontal}
  \item{est}{vector of point estimates for confidence limits}
  \item{se}{vector of standard errors}
  \item{width}{width of confidence rectangles in user units}
  \item{q}{vector of confidence coefficients or quantiles}
  \item{col}{vector of colors}
  \item{type}{\code{"v"} for vertical, \code{"h"} for horizontal}
  \item{labels}{set to \code{FALSE} to omit drawing confidence
	coefficients}
  \item{ticks}{set to \code{TRUE} to draw lines between rectangles}
  \item{cex}{character expansion factor}
  \item{side}{for \code{confbar} is \code{"b","l","t","r"} for bottom,
	left, top, right.  For \code{outerText} it is specified using
	integers 1-4 corresponding to these.}
  \item{lwd}{line widths}
  \item{clip}{interval to truncate limits}
  \item{fun}{function to transform scale}
  \item{qfun}{quantiles on transformed scale}
  \item{group}{a categorical grouping variable}
  \item{S}{a \code{\link{Surv}} object}
  \item{times}{a numeric vector of times}
  \item{tolerance}{tolerance for judging singularity in matrix}
  \item{intercept}{set to \code{FALSE} to not automatically add a column
	of ones to the \code{x} matrix}
  \item{xpxi}{set to \code{TRUE} to add an element to the result
	containing the inverse of \eqn{X'X}}
  \item{a}{a numeric matrix or vector}
  \item{b}{a numeric vector}
  \item{kint}{which element of \code{b} to add to the result if \code{a}
	does not contain a column for intercepts}
  \item{string}{a character string vector}
  \item{setAside}{for \code{adj=1 side=4}, is a character string used to
	determine the space to set aside for all strings.}
  \item{space}{the number of extra characters to leave to the left of
	the string(s) (\code{adj=0}) or to the right (\code{adj=1})}
  \item{adj}{0 for left justification, 0.5 for centered, 1 for right}
  \item{conversion}{a named numeric vector}
  \item{round}{set to \code{TRUE} to round converted values}
  \item{digits}{number of digits used for \code{round}}
  \item{str}{a character string vector}
  \item{w}{a numeric vector}
  \item{x}{a numeric vector (matrix for \code{lm.fit.qr.bare}) or data
	frame.  For \code{xless} may be any object that is sensible to
	\code{print}.  For \code{sepUnitsTrans} is a character or factor
	variable.}
  \item{y}{a numeric vector}
  \item{sep}{a single character string specifying the delimiter}
  \item{extended}{see \code{\link{strsplit}} in \R}
  \item{f}{a scaling constant}
  \item{title}{a character string to title a window or plot}
  \item{\dots}{arguments passed through to another function}
}
\author{Frank Harrell}
\examples{
trap.rule(1:100,1:100)

unPaste(c('a;b or c','ab;d','qr;s'), ';')

sepUnitsTrans(c('3 days','4 months','2 years','7'))

set.seed(1)
whichClosest(1:100, 3:5)
whichClosest(1:100, rep(3,20))

whichClosePW(1:100, rep(3,20))
whichClosePW(1:100, rep(3,20), f=.05)
whichClosePW(1:100, rep(3,20), f=1e-10)
}
\keyword{programming}
\keyword{utilities}

\eof
\name{Overview}
\alias{Overview}
\alias{Hmisc.Overview}
\title{
	Overview of Hmisc Library
}
\description{
The Hmisc library contains many functions
useful for data analysis, high-level
graphics, utility operations, functions for
computing sample size and power, translating
SAS datasets into S, imputing missing
values, advanced table making, variable
clustering, character string manipulation,
conversion of S objects to LaTeX code,
recoding variables, and bootstrap repeated
measures analysis.  Most of these functions
were written by F Harrell, but a few were
collected from statlib and from s-news;
other authors are indicated below.  This
collection of functions includes all of 
Harrell's submissions to statlib other than 
the functions in the Design and display 
libraries.  A few of the functions do not 
have "Help" documentation.}

\section{Functions}{
\tabular{ll}{
\bold{Function Name} \tab  \bold{Purpose} \cr
abs.error.pred  \tab Computes various indexes of predictive accuracy based\cr
\tab    on absolute errors, for linear models\cr
all.is.numeric  \tab Check if character strings are legal numerics\cr
approxExtrap    \tab Linear extrapolation\cr
aregImpute      \tab Multiple imputation based on additive regression,\cr
                \tab     bootstrapping, and predictive mean matching\cr
areg.boot       \tab Nonparametrically estimate transformations for both\cr
                \tab     sides of a multiple additive regression, and\cr
                \tab     bootstrap these estimates and \eqn{R^2}\cr
ballocation     \tab Optimum sample allocations in 2-sample proportion test\cr
binconf         \tab Exact confidence limits for a proportion and more accurate\cr
                \tab     (narrower!) score stat.-based Wilson interval\cr
                \tab     (Rollin Brant, mod. FEH)\cr
bootkm          \tab Bootstrap Kaplan-Meier survival or quantile estimates\cr
bpower          \tab Approximate power of 2-sided test for 2 proportions\cr
                \tab     Includes bpower.sim for exact power by simulation\cr
bpplot          \tab Box-Percentile plot \cr
                \tab     (Jeffrey Banfield, \email{umsfjban@bill.oscs.montana.edu})\cr
bsamsize        \tab Sample size requirements for test of 2 proportions\cr
bystats         \tab Statistics on a single variable by levels of >=1 factors\cr
bystats2        \tab 2-way statistics\cr
calltree        \tab Calling tree of functions \cr
                \tab     (David Lubinsky, \email{david@hoqax.att.com})\cr
character.table \tab Shows numeric equivalents of all latin characters\cr
                \tab     Useful for putting many special chars. in graph titles\cr
                \tab     (Pierre Joyet, \email{pierre.joyet@bluewin.ch})\cr
ciapower        \tab Power of Cox interaction test\cr
cleanup.import  \tab More compactly store variables in a data frame, and clean up\cr
                \tab     problem data when e.g. Excel spreadsheet had a non-\cr
                \tab     numeric value in a numeric column\cr
combine.levels  \tab Combine infrequent levels of a categorical variable\cr
comment         \tab Attach a comment attribute to an object:\cr
                \tab     comment(fit) <- 'Used old data'\cr
                \tab     comment(fit)    (prints comment)\cr
confbar         \tab Draws confidence bars on an existing plot using multiple\cr
                \tab     confidence levels distinguished using color or gray scale\cr
contents        \tab Print the contents (variables, labels, etc.) of a data frame\cr
cpower          \tab Power of Cox 2-sample test allowing for noncompliance\cr
Cs              \tab Vector of character strings from list of unquoted names\cr
csv.get         \tab Enhanced importing of comma separated files labels\cr
cut2            \tab Like cut with better endpoint label construction and allows\cr
                \tab     construction of quantile groups or groups with given n\cr
datadensity     \tab Snapshot graph of distributions of all variables in\cr
                \tab     a data frame.  For continuous variables uses scat1d.\cr
dataRep         \tab Quantify representation of new observations in a database\cr
ddmmmyy         \tab SAS "date7" output format for a chron object\cr
deff            \tab Kish design effect and intra-cluster correlation\cr
describe        \tab Function to describe different classes of objects.\cr
                \tab     Invoke by saying describe(object). It calls one of the\cr
                \tab     following:\cr
describe.data.frame
                \tab Describe all variables in a data frame (generalization\cr
                \tab     of SAS UNIVARIATE)\cr
describe.default
                \tab Describe a variable (generalization of SAS UNIVARIATE)\cr
do              \tab Assists with batch analyses\cr
dot.chart       \tab Dot chart for one or two classification variables\cr
Dotplot         \tab Enhancement of Trellis dotplot allowing for matrix\cr
                \tab     x-var., auto generation of Key function, superposition\cr
drawPlot        \tab Simple mouse-driven drawing program, including a function\cr
                \tab     for fitting Bezier curves\cr
ecdf            \tab Empirical cumulative distribution function plot\cr
eip             \tab Edit an object "in-place" (may be dangerous!), e.g.\cr
                \tab     eip(sqrt) will replace the builtin sqrt function\cr
errbar          \tab Plot with error bars (Charles Geyer, U. Chi., mod FEH)\cr
event.chart     \tab Plot general event charts (Jack Lee, \email{jjlee@mdanderson.org}, \cr
                \tab     Ken Hess, Joel Dubin; Am Statistician 54:63-70,2000)\cr
event.history	\tab Event history chart with time-dependent cov. status\cr
                \tab     (Joel Dubin, \email{joel.dubin@yale.edu})\cr
find.matches    \tab Find matches (with tolerances) between columns of 2 matrices\cr
first.word      \tab Find the first word in an S expression (R Heiberger)\cr
fit.mult.impute \tab Fit most regression models over multiple transcan imputations,\cr
                \tab     compute imputation-adjusted variances and avg. betas\cr
format.df       \tab Format a matrix or data frame with much user control\cr
                \tab     (R Heiberger and FE Harrell)\cr
ftupwr          \tab Power of 2-sample binomial test using Fleiss, Tytun, Ury\cr
ftuss           \tab Sample size for 2-sample binomial test using  "  "  "  "\cr
                \tab     (Both by Dan Heitjan, \email{dheitjan@biostats.hmc.psu.edu})\cr
gbayes          \tab Bayesian posterior and predictive distributions when both\cr
                \tab the prior and the likelihood are Gaussian\cr
getHdata        \tab Fetch and list datasets on our web site\cr
gs.slide        \tab Sets nice defaults for graph sheets for S-Plus 2000 for\cr
                \tab copying graphs into Microsoft applications\cr
hdquantile      \tab Harrell-Davis nonparametric quantile estimator with s.e.\cr
histbackback    \tab Back-to-back histograms (Pat Burns, Salomon Smith\cr
                \tab     Barney, London, \email{pburns@dorado.sbi.com})\cr
hist.data.frame \tab Matrix of histograms for all numeric vars. in data frame\cr
                \tab     Use hist.data.frame(data.frame.name)\cr
histSpike       \tab Add high-resolution spike histograms or density estimates\cr
                \tab     to an existing plot\cr
hoeffd          \tab Hoeffding's D test (omnibus test of independence of X and Y)\cr
impute          \tab Impute missing data (generic method)\cr
\%in\%          \tab Find out which elements of a are in b : a \%in\% b\cr
interaction     \tab More flexible version of builtin function\cr
is.present      \tab Tests for non-blank character values or non-NA numeric values\cr
james.stein     \tab James-Stein shrinkage estimates of cell means from raw data\cr
labcurve        \tab Optimally label a set of curves that have been drawn on\cr
                \tab     an existing plot, on the basis of gaps between curves.\cr
                \tab     Also position legends automatically at emptiest rectangle.\cr
label           \tab Set or fetch a label for an S-object\cr
Lag             \tab Lag a vector, padding on the left with NA or ''\cr
latex           \tab Convert an S object to LaTeX (R Heiberger & FE Harrell)\cr
ldBands         \tab Lan-DeMets bands for group sequential tests\cr
list.tree       \tab Pretty-print the structure of any data object\cr
                \tab     (Alan Zaslavsky, \email{zaslavsk@hcp.med.harvard.edu})\cr
mask            \tab 8-bit logical representation of a short integer value\cr
                \tab     (Rick Becker)\cr
matchCases      \tab Match each case on one continuous variable\cr
matxv           \tab Fast matrix * vector, handling intercept(s) and NAs\cr
mem             \tab mem() types quick summary of memory used during session\cr
mgp.axis        \tab Version of axis() that uses appropriate mgp from \cr
                \tab     mgp.axis.labels and gets around bug in axis(2, ...)\cr
                \tab     that causes it to assume las=1\cr
mgp.axis.labels
                \tab Used by survplot and plot in Design library (and other\cr
                \tab     functions in the future) so that different spacing\cr
                \tab     between tick marks and axis tick mark labels may be\cr
                \tab     specified for x- and y-axes.  ps.slide, win.slide,\cr
                \tab     gs.slide set up nice defaults for mgp.axis.labels.\cr
                \tab     Otherwise use mgp.axis.labels('default') to set defaults.\cr
                \tab     Users can set values manually using \cr
                \tab     mgp.axis.labels(x,y) where x and y are 2nd value of\cr
                \tab     par('mgp') to use.  Use mgp.axis.labels(type=w) to\cr
                \tab     retrieve values, where w='x', 'y', 'x and y', 'xy',\cr
                \tab     to get 3 mgp values (first 3 types) or 2 mgp.axis.labels.\cr
minor.tick      \tab Add minor tick marks to an existing plot\cr
mtitle          \tab Add outer titles and subtitles to a multiple plot layout\cr
\%nin\%         \tab Opposite of \%in\%\cr
nomiss          \tab Return a matrix after excluding any row with an NA\cr
panel.bpplot    \tab Panel function for trellis bwplot - box-percentile plots\cr
panel.plsmo     \tab Panel function for trellis xyplot - uses plsmo\cr
pc1             \tab Compute first prin. component and get coefficients on\cr\tab     original scale of variables\cr
plotCorrPrecision  \tab Plot precision of estimate of correlation coefficient\cr
plsmo           \tab Plot smoothed x vs. y with labeling and exclusion of NAs\cr
                \tab     Also allows a grouping variable and plots unsmoothed data\cr
popower         \tab Power and sample size calculations for ordinal responses\cr
                \tab     (two treatments, proportional odds model)\cr
prn             \tab prn(expression) does print(expression) but titles the\cr
                \tab     output with 'expression'.  Do prn(expression,txt) to add\cr
                \tab     a heading ('txt') before the 'expression' title\cr
p.sunflowers    \tab Sunflower plots (Andreas Ruckstuhl, Werner Stahel,\cr
                \tab     Martin Maechler, Tim Hesterberg)\cr
ps.slide        \tab Set up postscript() using nice defaults for different types\cr
                \tab     of graphics media\cr
pstamp          \tab Stamp a plot with date in lower right corner (pstamp())\cr
                \tab     Add ,pwd=T and/or ,time=T to add current directory \cr
                \tab      name or time\cr
                \tab     Put additional text for label as first argument, e.g.\cr
                \tab     pstamp('Figure 1')  will draw 'Figure 1  date'\cr
putKey          \tab Different way to use key()\cr
putKeyEmpty     \tab Put key at most empty part of existing plot\cr
rcorr           \tab Pearson or Spearman correlation matrix with pairwise deletion\cr
                \tab     of missing data\cr
rcorr.cens      \tab Somers' Dyx rank correlation with censored data\cr
rcorrp.cens     \tab Assess difference in concordance for paired predictors\cr
rcspline.eval   \tab Evaluate restricted cubic spline design matrix\cr
rcspline.plot   \tab Plot spline fit with nonparametric smooth and grouped estimates\cr
rcspline.restate
                \tab Restate restricted cubic spline in unrestricted form, and\cr
                \tab     create TeX expression to print the fitted function\cr
recode          \tab Recodes variables\cr
reShape         \tab Reshape a matrix into 3 vectors, reshape serial data\cr
rm.boot         \tab Bootstrap spline fit to repeated measurements model,\cr
                \tab     with simultaneous confidence region - least\cr
                \tab     squares using spline function in time\cr
rMultinom       \tab Generate multinomial random variables with varying prob.\cr
samplesize.bin  \tab Sample size for 2-sample binomial problem\cr
                \tab     (Rick Chappell, \email{chappell@stat.wisc.edu})\cr
sas.get         \tab Convert SAS dataset to S data frame\cr
sasxport.get    \tab Enhanced importing of SAS transport dataset in R\cr
scat1d          \tab Add 1-dimensional scatterplot to an axis of an existing plot\cr
                \tab     (like bar-codes, FEH/Martin Maechler, \cr
                \tab     \email{maechler@stat.math.ethz.ch}/Jens Oehlschlaegel-Akiyoshi,\cr
                \tab     \email{oehl@psyres-stuttgart.de})\cr
score.binary    \tab Construct a score from a series of binary variables or\cr
                \tab     expressions\cr
sedit           \tab A set of character handling functions written entirely\cr
                \tab     in S.  sedit() does much of what the UNIX sed\cr
                \tab     program does.  Other functions included are\cr
                \tab     substring.location, substring<-, replace.string.wild,\cr
                \tab     and functions to check if a string is numeric or\cr
                \tab     contains only the digits 0-9\cr
setpdf          \tab Adobe PDF graphics setup for including graphics in books\cr
                \tab     and reports with nice defaults, minimal wasted space\cr
setps           \tab Postscript graphics setup for including graphics in books\cr
                \tab     and reports with nice defaults, minimal wasted space\cr
                \tab     Internally uses psfig function by\cr
                \tab     Antonio Possolo (\email{antonio@atc.boeing.com}).\cr
                \tab     setps works with Ghostscript to convert .ps to .pdf\cr
setTrellis      \tab Set Trellis graphics to use blank conditioning panel strips,\cr
                \tab     line thickness 1 for dot plot reference lines: \cr
                \tab     setTrellis(); 3 optional arguments\cr
show.col        \tab Show colors corresponding to col=0,1,...,99\cr
show.pch        \tab Show all plotting characters specified by pch=.\cr
                \tab     Just type show.pch() to draw the table on the \cr
                \tab     current device.  \cr
showPsfrag      \tab Use LaTeX to compile, and dvips and ghostview to\cr
                \tab    display a postscript graphic containing psfrag strings\cr
solvet          \tab Version of solve with argument tol passed to qr\cr
somers2         \tab Somers' rank correlation and c-index for binary y\cr
spearman        \tab Spearman rank correlation coefficient  spearman(x,y)\cr
spearman.test   \tab Spearman 1 d.f. and 2 d.f. rank correlation test\cr
spearman2       \tab Spearman multiple d.f. \eqn{\rho^2}{rho^2}, adjusted \eqn{\rho^2}{rho^2}, Wilcoxon-Kruskal-\cr
                \tab     Wallis test, for multiple predictors\cr
spower          \tab Simulate power of 2-sample test for survival under\cr
                \tab     complex conditions\cr
                \tab     Also contains the Gompertz2,Weibull2,Lognorm2
functions.\cr
spss.get        \tab Enhanced importing of SPSS files using read.spss
function \cr
src             \tab src(name) = source("name.s") with memory\cr
store           \tab store an object permanently (easy interface to assign function)\cr
strmatch        \tab Shortest unique identifier match \cr
                \tab     (Terry Therneau, \email{therneau@mayo.edu})\cr
subset          \tab More easily subset a data frame\cr
substi          \tab Substitute one var for another when observations NA\cr
summarize       \tab Generate a data frame containing stratified summary\cr
                \tab     statistics.  Useful for passing to trellis.\cr
summary.formula \tab General table making and plotting functions for summarizing\cr
                \tab     data\cr
symbol.freq     \tab X-Y Frequency plot with circles' area prop. to frequency\cr
sys             \tab Execute unix() or dos() depending on what's running\cr
tex             \tab Enclose a string with the correct syntax for using\cr
                \tab    with the LaTeX psfrag package, for postscript graphics\cr
transace        \tab ace() packaged for easily automatically transforming all\cr
                \tab     variables in a matrix\cr
transcan        \tab automatic transformation and imputation of NAs for a\cr
                \tab     series of predictor variables\cr
trap.rule       \tab Area under curve defined by arbitrary x and y vectors,\cr
                \tab using trapezoidal rule\cr
trellis.strip.blank
                \tab To make the strip titles in trellis more visible, you can \cr
                \tab     make the backgrounds blank by saying trellis.strip.blank().\cr
                \tab     Use before opening the graphics device.\cr
t.test.cluster  \tab 2-sample t-test for cluster-randomized observations\cr
uncbind         \tab Form individual variables from a matrix\cr
upData          \tab Update a data frame (change names, labels, remove vars, etc.)\cr
units           \tab Set or fetch "units" attribute - units of measurement for var.\cr
varclus         \tab Graph hierarchical clustering of variables using squared\cr
                \tab Pearson or Spearman correlations or Hoeffding D as similarities\cr
                \tab Also includes the naclus function for examining similarities in\cr
                \tab patterns of missing values across variables.\cr
xy.group        \tab Compute mean x vs. function of y by groups of x\cr
xYplot          \tab Like trellis xyplot but supports error bars and multiple\cr
                \tab     response variables that are connected as separate lines\cr
win.slide       \tab Setup win.graph or win.printer using nice defaults for\cr
                \tab presentations/slides/publications\cr
wtd.mean \tab \cr wtd.var \tab \cr  wtd.quantile \tab \cr  wtd.ecdf \tab
\cr wtd.table \tab \cr  wtd.rank \tab \cr wtd.loess.noiter \tab \cr
num.denom.setup \tab Set of functions for obtaining weighted estimates\cr
zoom            \tab Zoom in on any graphical display\cr \tab     (Bill Dunlap, \email{bill@statsci.com})
}}

\section{System Overrides}{
Hmisc overrides the system function model.frame.default
to allow for more elegant handling of NAs by allowing 
the user to specify a global method for handling NAs
using options(na.action='na.methodname').  Hmisc
overrides the system subscripting method for factor
vectors and date vectors, and it defines functions
is.na.dates and is.na.times to check for NAs in date
and time vectors.  The [.factor redefinition by Hmisc
causes by default unused levels to be dropped from the
factor vector's levels attribute when the vector is
subscripted.  This can be overridden by using for example
\code{x <- x[,drop=FALSE]} or by specifying a system option as
follows: \code{options(drop.factor.levels=FALSE)}.\cr

Hmisc also overrides the trellis shingle function, which
has a bug when its sole argument has a class (such as
the "labelled" class created by the Hmisc label function).
The shingle replacement has the default intervals argument
set to sort(unique(unclass(x))) instead of sort(unique(x)).
}

\references{
See Alzola CF, Harrell FE (2002): An Introduction to S and the
Hmisc and Design Libraries at
\url{http://hesweb1.med.virginia.edu/biostat/s/doc/splus.pdf} for extensive 
documentation and examples for the Hmisc library.
}

\section{Copyright Notice}{
\bold{GENERAL DISCLAIMER}\cr
This program is free software; you can redistribute it
and/or modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 2, or (at your option) any later version.\cr

This program is distributed in the hope that it will be
useful, but WITHOUT ANY WARRANTY; without even the implied
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE.  See the GNU General Public License for more
details.\cr

In short: You may use it any way you like, as long as you
don't charge money for it, remove this notice, or hold anyone liable
for its results.  Also, please acknowledge the source and communicate
changes to the author.\cr

If this software is used in work presented for publication, kindly
reference it using for example:\cr
 Harrell FE (2003): Hmisc S function library.
  Programs available from \url{http://hesweb1.med.virginia.edu/biostat/s/Hmisc.html}.\cr
  Be sure to reference S-Plus or \R itself and other libraries used.
  }


\section{Acknowledgements}{This work was supported by grants
  from the Agency for Health Care Policy and Research
  (US Public Health Service) and the Robert Wood
  Johnson Foundation.}

\author{
Frank E Harrell Jr\cr
Professor of Biostatistics\cr
Chair, Department of Biostatistics\cr
Vanderbilt University School of Medicine\cr
Nashville, Tennessee\cr
\email{f.harrell@vanderbilt.edu}
}
\keyword{misc}
\concept{overview}

\eof
\name{abs.error.pred}
\alias{abs.error.pred}
\alias{print.abs.error.pred}
\title{
Indexes of Absolute Prediction Error for Linear Models
}
\description{
Computes the mean and median of various absolute errors related to
ordinary multiple regression models.  The mean and median absolute errors 
correspond to the mean square due to regression, error, and total.
The absolute errors computed are derived from Yhat - median(Yhat),
Yhat - Y, and Y - median(Y).  The function also computes ratios that
correspond to Rsquare and 1 - Rsquare (but these ratios do not add to
1.0); the Rsquare measure is the ratio of mean or median absolute Yhat
- median(Yhat) to the mean or median absolute Y - median(Y).  The 1 -
Rsquare or SSE/SST measure is the mean or median absolute Yhat - Y
divided by the mean or median absolute Y - median(Y).
}
\usage{
abs.error.pred(fit, lp=NULL, y=NULL)

\method{print}{abs.error.pred}(x, \dots)
}
\arguments{
\item{fit}{
a fit object typically from \code{lm} or \code{ols} that contains a \code{y} vector
(i.e., you should have specified \code{y=TRUE} to the fitting function) unless
the \code{y} argument is given to \code{abs.error.pred}.  If you do not specify
the \code{lp} argument, \code{fit} must contain \code{fitted.values} or
\code{linear.predictors}.  You must specify \code{fit} or both of \code{lp} and \code{y}.
}
\item{lp}{
a vector of predicted values (Y hat above) if \code{fit} is not given
}
\item{y}{
a vector of response variable values if \code{fit} (with \code{y=TRUE} in effect)
is not given
}
\item{x}{an object created by \code{abs.error.pred}}
\item{\dots}{unused}
}
\value{
a list of class \code{abs.error.pred} (used by \code{print.abs.error.pred})
containing two matrices: \code{differences} and \code{ratios}.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University School of Medicine
\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{lm}}, \code{\link[Design]{ols}}, \code{\link{cor}}, \code{\link[Design]{validate.ols}}
}
\references{
  Schemper M (2003): Stat in Med 22:2299-2308.
  }
\examples{
set.seed(1)         # so can regenerate results
x1 <- rnorm(100)
x2 <- rnorm(100)
y  <- exp(x1+x2+rnorm(100))
f <- lm(log(y) ~ x1 + poly(x2,3), y=TRUE)
abs.error.pred(lp=exp(fitted(f)), y=y)
rm(x1,x2,y,f)
}
\keyword{robust}
\keyword{regression}
\keyword{models}
\concept{predictive accuracy}

\eof
\name{all.is.numeric}
\alias{all.is.numeric}
\title{Check if All Elements in Character Vector are Numeric}
\description{
Tests, without issuing warnings, whether all elements of a character
vector are legal numeric values, or optionally converts the vector to a
numeric vector.
}
\usage{
all.is.numeric(x, what = c("test", "vector"))
}
\arguments{
  \item{x}{a character vector}
  \item{what}{specify \code{what="vector"} to return a numeric vector if
  it passes the test, or the original character vector otherwise}
}
\value{a logical value if \code{what="test"} or a vector otherwise}
\author{Frank Harrell}
\seealso{\code{\link{as.numeric}}}
\examples{
all.is.numeric(c('1','1.2','3'))
all.is.numeric(c('1','1.2','3a'))
all.is.numeric(c('1','1.2','3'),'vector')
all.is.numeric(c('1','1.2','3a'),'vector')
}
\keyword{character}

\eof
\name{approxExtrap}
\alias{approxExtrap}
\title{Linear Extrapolation}
\description{
Works in conjunction with the \code{approx} function to do linear
extrapolation.  \code{approx} in R does not support extrapolation at
all, and it is buggy in S-Plus 6. 
}
\usage{
approxExtrap(x, y, xout, method = "linear", n = 50, rule = 2, f = 0, ties = "ordered", na.rm = FALSE)
}
\arguments{
  \item{x}{}
  \item{y}{}
  \item{xout}{}
  \item{method}{}
  \item{n}{}
  \item{rule}{}
  \item{f}{see \code{\link{approx}}}
  \item{ties}{applies only to R.  See \code{\link{approx}}}
  \item{na.rm}{set to \code{TRUE} to remove \code{NA}s in \code{x} and
	\code{y} before proceeding}
}
\details{
Duplicates in \code{x} (and corresponding \code{y} elements) are removed
before using \code{approx}.
}
\value{
a vector the same length as \code{xout}
}
\author{Frank Harrell}
\seealso{\code{\link{approx}}}
\examples{
approxExtrap(1:3,1:3,xout=c(0,4))
}
\keyword{arith}
\keyword{dplot}

\eof
\name{aregImpute}
\alias{aregImpute}
\alias{print.aregImpute}
\alias{plot.aregImpute}
\title{
Multiple Imputation using Additive Regression, Bootstrapping, and
Predictive Mean Matching
}
\description{
The \code{transcan} function creates flexible additive imputation models
but provides only an approximation to true multiple imputation as the
imputation models are fixed before all multiple imputations are
drawn.  This ignores variability caused by having to fit the
imputation models.  \code{aregImpute} takes all aspects of uncertainty in
the imputations into account by using the bootstrap to approximate the
process of drawing predicted values from a full Bayesian predictive
distribution.  Different bootstrap resamples are used for each of the
multiple imputations, i.e., for the \code{i}th imputation of a sometimes
missing variable, \code{i=1,2,\dots n.impute}, a flexible additive
model is fitted on a sample with replacement
from the original data and this model is used to predict all of the
original missing and non-missing values for the target variable.


Two methods are used to fit the imputation models, \code{ace} and
\code{avas}.  Unless the identity transformation is specified, these
methods simultaneously find transformations of the target variable and
of all of the predictors, to get a good fit assuming additivity.
\code{ace} maximizes R-squared, and \code{avas} attempts to maximize
R-squared while stabilizing the variance of residuals.  When a
categorical variable is being predicted, only \code{ace} is used.  Like
\code{transcan}'s use of canonical regression, this is Fisher's optimum
scoring method for categorical variables.  For continuous variables,
monotonic transformations of the target variable are assumed when
\code{avas} is used.  For \code{ace}, the default allows nonmonotonic
transformations of target variables.  When variables are used as
predictors, the nonparametric transformations derived by \code{ace} or
\code{avas} can be restricted by the user to be monotonic.


Instead of taking random draws from fitted imputation models using
random residuals as is done by \code{transcan}, \code{aregImpute} uses
predictive mean matching with optional weighted probability sampling of
donors rather than using only the closest match.  Predictive mean
matching works for binary, categorical, and continuous variables without
the need for iterative maximum likelihood fitting for binary and
categorical variables, and without the need for computing residuals or
for curtailing imputed values to be in the range of actual data.
Predictive mean matching is especially attractive when the variable
being imputed is also being transformed automatically.  See Details
below for more information about the algorithm.


A \code{print} method summarizes the results, and a \code{plot} method plots
distributions of imputed values.
Typically, \code{fit.mult.impute} will be called after \code{aregImpute}.
}
\usage{
aregImpute(formula, data, subset, n.impute=5, group=NULL,
           method=c('ace','avas'), type=c('pmm','regression'),
           match=c('weighted','closest'), fweighted=0.2,
           defaultLinear=FALSE, x=FALSE, pr=TRUE, plotTrans=FALSE)
\method{print}{aregImpute}(x, \dots)
\method{plot}{aregImpute}(x, nclass=NULL, type=c('ecdf','hist'), 
     diagnostics=FALSE, maxn=10, \dots)
}
\arguments{
\item{formula}{
an S model formula.  You can specify restrictions for transformations
of variables.  The function automatically determines which variables
are categorical (i.e., \code{factor}, \code{category}, or character vectors).
Binary variables are automatically restricted to be linear.  Force
linear transformations of continuous variables by enclosing variables
by the identity function (\code{I()}), and specify monotonicity by using
\code{monotone(variable)}.
}
\item{x}{
  an object created by \code{aregImpute}.  For \code{aregImpute}, set
  \code{x} to \code{TRUE} to save the data matrix containing the final (number
  \code{n.impute}) imputations in the result.  This
  is needed if you want to later do out-of-sample imputation.
  Categorical variables are coded as integers in this matrix.
}
\item{data}{
}
\item{subset}{
These may also be specified.  You may not specify \code{na.action} as
\code{na.retain} is always used.
}
\item{n.impute}{
number of multiple imputations.  \code{n.impute=5} is frequently
recommended but 10 or more doesn't hurt.
}
\item{group}{a character or factor variable the same length as the
  number of observations in \code{data} and containing no \code{NA}s.
  When \code{group} is present, causes a bootstrap sample of the
  observations corresponding to non-\code{NA}s of a target variable to
  have the same frequency distribution of \code{group} as
  that in the non-\code{NA}s of the original sample.  This can handle
  k-sample problems as well as lower the chance that a bootstrap sample
  will have a missing cell when the original cell frequency was low.
  }
\item{method}{
method (\code{"ace"}, the default, or \code{"avas"}) for modeling a variable to
be imputed.  As \code{avas} does not allow the response variable to be
categorical, \code{"ace"} is always used for such variables.
}
\item{type}{
  The default is \code{"pmm"} for predictive mean matching,
  which is a more nonparametric approach that will work for categorical
  as well as continuous predictors.  Alternatively, use
  \code{"regression"} when all variables that are sometimes missing are
  continuous and the missingness mechanism is such that entire intervals
  of population values are unobserved.  See the Details section for more
  information.  For the \code{plot} method, 
  specify \code{type="hist"} to draw histograms of imputed values with rug
  plots at the top, or
  \code{type="ecdf"} (the default) to draw empirical CDFs with spike
  histograms at the bottom.
}
\item{match}{
  Defaults to \code{match="weighted"} to do weighted multinomial
  probability sampling using the tricube function (similar to lowess)
  as the weights.  The argument of the tricube function is the absolute
  difference in transformed predicted values of all the donors and of
  the target predicted value, divided by a scaling factor.
  The scaling factor in the tricube function is \code{fweighted} times
  the mean absolute difference between the target predicted value and
  all the possible donor predicted values.  Set \code{match="closest"}
  to find as the donor the observation having the closest predicted
  transformed value, even if that same donor is found repeatedly.}
\item{fweighted}{
  Smoothing parameter (multiple of mean absolute difference) used when
  \code{match="weighted"}, with a default value of 0.2.  Set
  \code{fweighted} to a number between 0.02 and 0.2 to force the donor
  to have a predicted value closer to the target, and set
  \code{fweighted} to larger values (but seldom larger than 1.0) to allow
  donor values to be less tightly matched.  See the examples below to
  learn how to study the relationship between \code{fweighted} and the
  standard deviation of multiple imputations within individuals.}
\item{defaultLinear}{
set to \code{TRUE} to force all continuous variables to be linear in any
model.  This is recommended when the sample size is small.
}
\item{pr}{
set to \code{FALSE} to suppress printing of iteration messages
}
\item{plotTrans}{
  set to \code{TRUE} to plot \code{ace} or \code{avas} transformations
  for each variable for each of the multiple imputations.  This is
  useful for determining whether transformations are reasonable.  If
  transformations are too noisy or have long flat sections (resulting in
  "lumps" in the distribution of imputed values), it may be advisable to
  place restrictions on the transformations (monotonicity or linearity).
  }
\item{nclass}{
number of bins to use in drawing histogram
}
\item{diagnostics}{
Specify \code{diagnostics=TRUE} to draw plots of imputed values against
sequential imputation numbers, separately for each missing
observation and variable.
}
\item{maxn}{
Maximum number of observations shown for diagnostics.  Default is
\code{maxn=10}, which limits the number of observations plotted to at most
the first 10.
}
\item{...}{
other arguments that are ignored
}}
\value{
a list of class \code{"aregImpute"} containing the following elements:

\item{call}{
the function call expression
}
\item{formula}{
the formula specified to \code{aregImpute}
}
\item{method}{
the \code{method} argument
}
\item{n}{
total number of observations in input dataset
}
\item{p}{
number of variables
}
\item{na}{
list of subscripts of observations for which values were originally missing
}
\item{nna}{
named vector containing the numbers of missing values in the data
}
\item{linear}{
vector of names of variables restricted to be linear
}
\item{categorical}{
vector of names of categorical variables
}
\item{monotone}{
vector of names of variables restricted to be monotonic
}
\item{cat.levels}{
list containing character vectors specifying the \code{levels} of
categorical variables
}
\item{n.impute}{
number of multiple imputations per missing value
}
\item{imputed}{
a list containing matrices of imputed values in the same format as
those created by \code{transcan}.  Categorical variables are coded using
their integer codes.  Variables having no missing values will have
\code{NULL} matrices in the list.
}
\item{rsq}{
for the last round of imputations, a vector containing the R-squares
with which each sometimes-missing variable could be predicted from the
others by \code{ace} or \code{avas}.
}}
\details{
The sequence of steps used by the \code{aregImpute} algorithm is the
following.
\cr
(1) For each variable containing m \code{NA}s where m > 0, initialize the
\code{NA}s to values from a random sample (without replacement if
a sufficient number of non-missing values exist) of size m from the
non-missing values.
\cr
(2) For \code{3+n.impute} iterations do the following steps.  The first 3
iterations provide a burn-in, and imputations are saved only from the last
\code{n.impute} iterations.
\cr
(3) For each variable containing any \code{NA}s, draw a sample with
replacement from the observations in the entire dataset in which the
current variable being imputed is non-missing.  Fit a flexible
additive model to predict this target variable while finding the
optimum transformation of it (unless the identity
transformation is forced).  Use this fitted semiparametric model to
predict the target variable in all of the original observations.
Impute each missing value of the target variable with the observed
value whose predicted transformed value is closest to the predicted
transformed value of the missing value (if \code{match="closest"} and
\code{type="pmm"}), 
or use a draw from a multinomial distribution with probabilities derived
from distance weights, if \code{match="weighted"} (the default).
\cr
(4) After these imputations are computed, use these random draw
imputations the next time the current target variable is used as a
predictor of other sometimes-missing variables.

When \code{match="closest"}, predictive mean matching does not work well
when fewer than 3 variables are used to predict the target variable,
because many of the multiple imputations for an observation will be
identical.  In the extreme case of one right-hand-side variable and
assuming that only monotonic transformations of left and right-side
variables are allowed, every bootstrap resample will give predicted
values of the target variable that are monotonically related to
predicted values from every other bootstrap resample.  The same is true
for Bayesian predicted values.  This causes predictive mean matching to
always match on the same donor observation.

When the missingness mechanism for a variable is so systematic that the
distribution of observed values is truncated, predictive mean matching
does not work.  It will only yield imputed values that are near
observed values, so intervals in which no values are observed will not
be populated by imputed values.  For this case, the only hope is to make
regression assumptions and use extrapolation.  With
\code{type="regression"}, \code{aregImpute} will use linear
extrapolation to obtain a (hopefully) reasonable distribution of imputed
values.  The \code{"regression"} option causes \code{aregImpute} to
impute missing values by adding a random sample of residuals (with
replacement if there are more \code{NA}s than measured values) on the
scale of the \code{ace} or \code{avas} transformed target variable.
After random residuals are added, predicted random draws are obtained on
the original untransformed scale using reverse linear interpolation on
the table of original and \code{ace} or \code{avas} transformed target
values (linear extrapolation when a random residual is large enough to
put the random draw prediction outside the range of observed values).
The bootstrap is used as with \code{type="pmm"} to factor in the
uncertainty of the imputation model.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{fit.mult.impute}}, \code{\link{transcan}}, \code{\link[acepack]{ace}}, \code{\link{naclus}}, \code{\link{naplot}}, \code{\link[mice]{mice}},
\code{\link{dotchart2}}, \code{\link{ecdf}}
}
\examples{
# Multiple imputation and estimation of variances and covariances of
# regression coefficient estimates accounting for imputation
# Example 1: large sample size, much missing data, no overlap in
# NAs across variables
set.seed(3)
x1 <- factor(sample(c('a','b','c'),1000,TRUE))
x2 <- (x1=='b') + 3*(x1=='c') + rnorm(1000,0,2)
x3 <- rnorm(1000)
y  <- x2 + 1*(x1=='c') + .2*x3 + rnorm(1000,0,2)
orig.x1 <- x1[1:250]
orig.x2 <- x2[251:350]
x1[1:250] <- NA
x2[251:350] <- NA
d <- data.frame(x1,x2,x3,y)

# Use 100 imputations to better check against individual true values
f <- aregImpute(~y + x1 + x2 + x3, n.impute=100, data=d)
f
par(mfrow=c(2,1))
plot(f)
modecat <- function(u) {
 tab <- table(u)
 as.numeric(names(tab)[tab==max(tab)][1])
}
table(orig.x1,apply(f$imputed$x1, 1, modecat))
par(mfrow=c(1,1))
plot(orig.x2, apply(f$imputed$x2, 1, mean))
fmi <- fit.mult.impute(y ~ x1 + x2 + x3, lm, f, 
                       data=d)
sqrt(diag(Varcov(fmi)))
fcc <- lm(y ~ x1 + x2 + x3)
summary(fcc)   # SEs are larger than from mult. imputation


# Example 2: Very discriminating imputation models,
# x1 and x2 have some NAs on the same rows, smaller n
set.seed(5)
x1 <- factor(sample(c('a','b','c'),100,TRUE))
x2 <- (x1=='b') + 3*(x1=='c') + rnorm(100,0,.4)
x3 <- rnorm(100)
y  <- x2 + 1*(x1=='c') + .2*x3 + rnorm(100,0,.4)
orig.x1 <- x1[1:20]
orig.x2 <- x2[18:23]
x1[1:20] <- NA
x2[18:23] <- NA
#x2[21:25] <- NA
d <- data.frame(x1,x2,x3,y)
n <- naclus(d)
plot(n); naplot(n)  # Show patterns of NAs
# 100 imputations to study them; normally use 5 or 10
f  <- aregImpute(~y + x1 + x2 + x3, n.impute=100, defaultLinear=TRUE, data=d)
par(mfrow=c(2,3))
plot(f, diagnostics=TRUE, maxn=2)
# Note: diagnostics=TRUE makes graphs similar to those made by:
# r <- range(f$imputed$x2, orig.x2)
# for(i in 1:6) {  # use 1:2 to mimic maxn=2
#   plot(1:100, f$imputed$x2[i,], ylim=r,
#        ylab=paste("Imputations for Obs.",i))
#   abline(h=orig.x2[i],lty=2)
# }


table(orig.x1,apply(f$imputed$x1, 1, modecat))
par(mfrow=c(1,1))
plot(orig.x2, apply(f$imputed$x2, 1, mean))


fmi <- fit.mult.impute(y ~ x1 + x2, lm, f, 
                       data=d)
sqrt(diag(Varcov(fmi)))
fcc <- lm(y ~ x1 + x2)
summary(fcc)   # SEs are larger than from mult. imputation

# Study relationship between smoothing parameter for weighting function
# (multiplier of mean absolute distance of transformed predicted
# values, used in tricube weighting function) and standard deviation
# of multiple imputations.  SDs are computed from average variances
# across subjects.  match="closest" same as match="weighted" with
# small value of fweighted.
# This example also shows problems with predictive mean
# matching almost always giving the same imputed values when there is
# only one predictor (regression coefficients change over multiple
# imputations but predicted values are virtually 1-1 functions of each
# other)

set.seed(23)
x <- runif(200)
y <- x + runif(200, -.05, .05)
r <- resid(lsfit(x,y))
rmse <- sqrt(sum(r^2)/(200-2))   # sqrt of residual MSE

y[1:20] <- NA
d <- data.frame(x,y)
f <- aregImpute(~ x + y, n.impute=10, match='closest', data=d)
sd <- sqrt(mean(apply(f$imputed$y, 1, var)))

ss <- c(0, .01, .02, seq(.05, 1, length=20))
sds <- ss; sds[1] <- sd

for(i in 2:length(ss)) {
  f <- aregImpute(~ x + y, n.impute=10, fweighted=ss[i])
  sds[i] <- sqrt(mean(apply(f$imputed$y, 1, var)))
}

plot(ss, sds, xlab='Smoothing Parameter', ylab='SD of Imputed Values',
     type='b')
abline(v=.2,  lty=2)  # default value of fweighted
abline(h=rmse, lty=2)  # root MSE of residuals from linear regression

\dontrun{
# Do a similar experiment for the Titanic dataset
getHdata(titanic3)
h <- lm(age ~ sex + pclass + survived, data=titanic3)
rmse <- summary(h)$sigma
set.seed(21)
f <- aregImpute(~ age + sex + pclass + survived, n.impute=10,
                data=titanic3, match='closest')
sd <- sqrt(mean(apply(f$imputed$age, 1, var)))

ss <- c(0, .01, .02, seq(.05, 1, length=20))
sds <- ss; sds[1] <- sd

for(i in 2:length(ss)) {
  f <- aregImpute(~ age + sex + pclass + survived, data=titanic3,
                  n.impute=10, fweighted=ss[i])
  sds[i] <- sqrt(mean(apply(f$imputed$age, 1, var)))
}

plot(ss, sds, xlab='Smoothing Parameter', ylab='SD of Imputed Values',
     type='b')
abline(v=.2,   lty=2)  # default value of fweighted
abline(h=rmse, lty=2)  # root MSE of residuals from linear regression
}
}
\keyword{smooth}
\keyword{regression}
\keyword{multivariate}
\keyword{methods}
\keyword{models}
\concept{bootstrap}
\concept{predictive mean matching}
\concept{imputation}
\concept{NA}

\eof
\name{binconf}
\alias{binconf}
\title{
Confidence Intervals for Binomial Probabilities
}
\description{
Produces 1-alpha confidence intervals for binomial probabilities.
}
\usage{
binconf(x, n, alpha=0.05,
        method=c("wilson","exact","asymptotic","all"),
        include.x=FALSE, include.n=FALSE, return.df=FALSE)
}
\arguments{
\item{x}{
vector containing the number of "successes" for binomial variates
}
\item{n}{
vector containing the numbers of corresponding observations 
}
\item{alpha}{
probability of a type I error, so confidence coefficient = 1-alpha
}
\item{method}{
character string specifying which method to use.  The "all" method only
works when 
x and n are length 1.  The "exact" method uses the F distribution
to compute exact (based on the binomial cdf) intervals; the
"wilson" interval is score-test-based; and the "asymptotic" is the
text-book, asymptotic normal interval.  Following Agresti and
Coull, the Wilson interval is to be preferred and so is the
default.
}
\item{include.x}{
logical flag to indicate whether \code{x} should be included in the
returned matrix or data frame 
}
\item{include.n}{
logical flag to indicate whether \code{n} should be included in the
returned matrix or data frame 
}
\item{return.df}{
logical flag to indicate that a data frame rather than a matrix be
returned
}}
\value{
a matrix or data.frame containing the computed intervals and,
optionally, \code{x} and \code{n}.  
}
\author{
Rollin Brant, Modified by Frank Harrell and
\cr
Brad Biggerstaff
\cr
Centers for Disease Control and Prevention
\cr
National Center for Infectious Diseases
\cr
Division of Vector-Borne Infectious Diseases
\cr
P.O. Box 2087, Fort Collins, CO, 80522-2087, USA
\cr
bkb5@cdc.gov
}
\references{
A. Agresti and B.A. Coull, Approximate is better than "exact" for
interval estimation of binomial proportions,  
\emph{American Statistician,}
\bold{52}:119--126, 1998.


R.G. Newcombe, Logit confidence intervals and the inverse sinh
transformation,
\emph{American Statistician,}
\bold{55}:200--202, 2001.


L.D. Brown, T.T. Cai and A. DasGupta, Interval estimation for
a binomial proportion (with discussion),
\emph{Statistical Science,}
\bold{16}:101--133, 2001.
}
\examples{
binconf(0:10,10,include.x=TRUE,include.n=TRUE)
binconf(46,50,method="all")
}
\keyword{category}
\keyword{htest}
% Converted by Sd2Rd version 1.21.

\eof
\name{bootkm}
\alias{bootkm}
\title{
Bootstrap Kaplan-Meier Estimates
}
\description{
Bootstraps Kaplan-Meier estimate of the probability of survival to at
least a fixed time (\code{times} variable) or the estimate of the \code{q}
quantile of the survival distribution (e.g., median survival time, the
default).
}
\usage{
bootkm(S, q=0.5, B=500, times, pr=TRUE)
}
\arguments{
\item{S}{
a \code{Surv} object for possibly right-censored survival time
}
\item{q}{
quantile of survival time, default is 0.5 for median
}
\item{B}{
number of bootstrap repetitions (default=500)
}
\item{times}{
time vector (currently only a scalar is allowed) at which to compute
survival estimates.  You may specify only one of \code{q} and \code{times}, and
if \code{times} is specified \code{q} is ignored.
}
\item{pr}{
set to \code{FALSE} to suppress printing the iteration number every 10 iterations
}}
\value{
a vector containing \code{B} bootstrap estimates
}
\section{Side Effects}{
updates \code{.Random.seed}, and, if \code{pr=TRUE}, prints progress of simulations
}
\details{
\code{bootkm} uses Therneau's \code{survfit.km} function to efficiently compute
Kaplan-Meier estimates.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University School of Medicine
\cr
f.harrell@vanderbilt.edu
}
\references{
Akritas MG (1986): Bootstrapping the Kaplan-Meier estimator.  JASA
81:1032--1038.
}
\seealso{
\code{\link[survival]{survfit}}, \code{\link[survival]{survfit.km}}, \code{\link[survival]{Surv}}, \code{\link[Design]{Survival.cph}}, \code{\link[Design]{Quantile.cph}}
}
\examples{
# Compute 0.95 nonparametric confidence interval for the difference in
# median survival time between females and males (two-sample problem)
set.seed(1)
library(survival)
S <- Surv(runif(200))      # no censoring
sex <- c(rep('female',100),rep('male',100))
med.female <- bootkm(S[sex=='female',], B=100) # normally B=500
med.male   <- bootkm(S[sex=='male',],   B=100)
describe(med.female-med.male)
quantile(med.female-med.male, c(.025,.975), na.rm=TRUE)
# na.rm needed because some bootstrap estimates of median survival
# time may be missing when a bootstrap sample did not include the
# longer survival times
}
\keyword{survival}
\keyword{nonparametric}
\concept{bootstrap}

\eof
\name{bpower}
\alias{bpower}
\alias{bsamsize}
\alias{ballocation}
\alias{bpower.sim}
\title{
Power and Sample Size for Two-Sample Binomial Test
}
\description{
Uses method of Fleiss, Tytun, and Ury (but without the continuity
correction) to estimate the power (or the sample size to achieve a given
power) of a two-sided test for the difference in two proportions.  The two
sample sizes are allowed to be unequal, but for \code{bsamsize} you must specify
the fraction of observations in group 1.  For power calculations, one
probability (\code{p1}) must be given, and either the other probability (\code{p2}),
an \code{odds.ratio}, or a \code{percent.reduction} must be given.  For \code{bpower} or
\code{bsamsize}, any or all of the arguments may be vectors, in which case they
return a vector of powers or sample sizes.  All vector arguments must have
the same length.


Given \code{p1, p2}, \code{ballocation} uses the method of Brittain and Schlesselman
to compute the optimal fraction of observations to be placed in group 1
that either (1) minimize the variance of the difference in two proportions,
(2) minimize the variance of the ratio of the two proportions, 
(3) minimize the variance of the log odds ratio, or
(4) maximize the power of the 2-tailed test for differences.  For (4)
the total sample size must be given, or the fraction optimizing
the power is not returned.  The fraction for (3) is one minus the fraction
for (1).


\code{bpower.sim} estimates power by simulations, in minimal time.  By using
\code{bpower.sim} you can see that the formulas without any continuity correction
are quite accurate, and that the power of a continuity-corrected test
is significantly lower.  That's why no continuity corrections are implemented
here.
}
\usage{
bpower(p1, p2, odds.ratio, percent.reduction, 
       n, n1, n2, alpha=0.05)


bsamsize(p1, p2, fraction=.5, alpha=.05, power=.8)


ballocation(p1, p2, n, alpha=.05)


bpower.sim(p1, p2, odds.ratio, percent.reduction, 
           n, n1, n2, 
           alpha=0.05, nsim=10000)
}
\arguments{
\item{p1}{
population probability in group 1
}
\item{p2}{
probability for group 2
}
\item{odds.ratio}{
}
\item{percent.reduction}{
}
\item{n}{
total sample size over the two groups.  If you omit this for
\code{ballocation}, the \code{fraction} which optimizes power will not be
returned.
}
\item{n1}{
}
\item{n2}{
the individual group sample sizes.  For \code{bpower}, if \code{n} is given,
\code{n1} and \code{n2} are set to \code{n/2}.
}
\item{alpha}{
type I error
}
\item{fraction}{
fraction of observations in group 1
}
\item{power}{
the desired probability of detecting a difference
}
\item{nsim}{
number of simulations of binomial responses
}}
\value{
for \code{bpower}, the power estimate; for \code{bsamsize}, a vector containing
the sample sizes in the two groups; for \code{ballocation}, a vector with
4 fractions of observations allocated to group 1, optimizing the four
criteria mentioned above.  For \code{bpower.sim}, a vector with three
elements is returned, corresponding to the simulated power and its
lower and upper 0.95 confidence limits.
}
\details{
For \code{bpower.sim}, all arguments must be of length one.
}
\section{AUTHOR}{
Frank Harrell


Department of Biostatistics


Vanderbilt University


f.harrell@vanderbilt.edu
}
\references{
Fleiss JL, Tytun A, Ury HK (1980): A simple approximation for calculating
sample sizes for comparing independent proportions.  Biometrics 36:343--6.


Brittain E, Schlesselman JJ (1982): Optimal allocation for the comparison
of proportions.  Biometrics 38:1003--9.


Gordon I, Watson R (1996): The myth of continuity-corrected sample size
formulae.  Biometrics 52:71--6.
}
\seealso{
\code{\link{samplesize.bin}}, \code{\link{chisq.test}}, \code{\link{binconf}}
}
\examples{
bpower(.1, odds.ratio=.9, n=1000, alpha=c(.01,.05))
bpower.sim(.1, odds.ratio=.9, n=1000)
bsamsize(.1, .05, power=.95)
ballocation(.1, .5, n=100)


# Plot power vs. n for various odds ratios  (base prob.=.1)
n  <- seq(10, 1000, by=10)
OR <- seq(.2,.9,by=.1)
plot(0, 0, xlim=range(n), ylim=c(0,1), xlab="n", ylab="Power", type="n")
for(or in OR) {
  lines(n, bpower(.1, odds.ratio=or, n=n))
  text(350, bpower(.1, odds.ratio=or, n=350)-.02, format(or))
}


# Another way to plot the same curves, but letting labcurve do the
# work, including labeling each curve at points of maximum separation
pow <- lapply(OR, function(or,n)list(x=n,y=bpower(p1=.1,odds.ratio=or,n=n)),
              n=n)
names(pow) <- format(OR)
labcurve(pow, pl=TRUE, xlab='n', ylab='Power')


# Contour graph for various probabilities of outcome in the control
# group, fixing the odds ratio at .8 ([p2/(1-p2) / p1/(1-p1)] = .8)
# n is varied also
p1 <- seq(.01,.99,by=.01)
n  <- seq(100,5000,by=250)
pow <- outer(p1, n, function(p1,n) bpower(p1, n=n, odds.ratio=.8))
# This forms a length(p1)*length(n) matrix of power estimates
contour(p1, n, pow)
}
\keyword{htest}
\keyword{category}
\concept{power}
\concept{study design}

\eof
\name{bpplot}
\alias{bpplot}
\title{
Box-percentile plots
}
\description{
Produces side-by-side box-percentile plots from several vectors or a
list of vectors.  
}
\usage{
bpplot(\dots, name=TRUE, main="Box-Percentile Plot", 
       xlab="", ylab="", srtx=0)
}
\arguments{
\item{...}{
vectors or lists containing 
numeric components (e.g., the output of \code{split}).
}
\item{name}{
character vector of names for the groups.  
Default is \code{TRUE} to put names on the x-axis.  Such names are taken from the 
data vectors or the \code{names} attribute of the first argument if it is a list.
Set \code{name} to \code{FALSE} to suppress names.
If a character vector is supplied the names in the vector are
used to label the groups.
}
\item{main}{
main title for the plot.
}
\item{xlab}{
x axis label.
}
\item{ylab}{
y axis label.
}
\item{srtx}{
rotation angle for x-axis labels.  Default is zero.
}}
\value{
There are no returned values
}
\section{Side Effects}{
A plot is created on the current graphics device.
}
\section{BACKGROUND}{
Box-percentile plots are similar to boxplots, except box-percentile plots
supply more information about the univariate distributions.  At any height
the width of the irregular "box" is proportional to the percentile of that
height, up to the 50th percentile, and above the 50th percentile the width
is proportional to 100 minus the percentile.  Thus, the width at any given
height is proportional to the percent of observations that are more 
extreme in that direction.  As in boxplots, the median, 25th and 75th 
percentiles are marked with line segments across the box.
}
\author{
Jeffrey Banfield
\cr
umsfjban@bill.oscs.montana.edu
\cr
Modified by F. Harrell 30Jun97
}
\references{
Esty, W. W. and Banfield, J. D. (1992)
"The Box-Percentile Plot,"
Technical Report (May 15, 1992),
Department of Mathematical Sciences,
Montana State University.
}
\seealso{
\code{\link{panel.bpplot}}, \code{\link{boxplot}}, \code{\link{ecdf}},
\code{\link{bwplot}} 
}
\examples{
set.seed(1)
x1 <- rnorm(500)
x2 <- runif(500, -2, 2)
x3 <- abs(rnorm(500))-2
bpplot(x1, x2, x3)
g <- sample(1:2, 500, replace=TRUE)
bpplot(split(x2, g), name=c('Group 1','Group 2'))
rm(x1,x2,x3,g)
}
\keyword{nonparametric}
\keyword{hplot}
% Converted by Sd2Rd version 1.21.

\eof
\name{bystats}
\alias{bystats}
\alias{print.bystats}
\alias{latex.bystats}
\alias{bystats2}
\alias{print.bystats2}
\alias{latex.bystats2}
\title{
Statistics by Categories
}
\description{

  For any number of cross-classification variables, \code{bystats}
  returns a matrix with the sample size, number missing \code{y}, and
  \code{fun(non-missing y)}, with the cross-classifications designated
  by rows. Uses Harrell's modification of the \code{interaction}
  function to produce cross-classifications.  The default \code{fun} is
  \code{mean}, and if \code{y} is binary, the mean is labeled as
  \code{Fraction}.  There is a \code{print} method as well as a
  \code{latex} method for objects created by \code{bystats}.
  \code{bystats2} handles the special case in which there are 2
  classification variables, and places the first one in rows and the
  second in columns.  The \code{print} method for \code{bystats2} uses
  the S-Plus \code{print.char.matrix} function to organize statistics
  for cells into boxes. }

\usage{
bystats(y, \dots, fun, nmiss, subset)
\method{print}{bystats}(x, \dots)
\method{latex}{bystats}(object, title, caption, rowlabel, \dots)
bystats2(y, v, h, fun, nmiss, subset)
\method{print}{bystats2}(x, abbreviate.dimnames=FALSE,
   prefix.width=max(nchar(dimnames(x)[[1]])), \dots)
\method{latex}{bystats2}(object, title, caption, rowlabel, \dots)
}
\arguments{
\item{y}{
a binary, logical, or continuous variable or a matrix or data frame of
such variables.  If \code{y} is a data frame it is converted to a matrix.
If \code{y} is a data frame or matrix, computations are done on subsets of
the rows of \code{y}, and you should specify \code{fun} so as to be able to operate
on the matrix.  For matrix \code{y}, any column with a missing value causes
the entire row to be considered missing, and the row is not passed to
\code{fun}.
}
\item{...}{
For \code{bystats}, one or more classification variables separated by commas.
For \code{print.bystats}, options passed to \code{print.default} such as \code{digits}.
For \code{latex.bystats}, and \code{latex.bystats2},
options passed to \code{latex.default} such as \code{digits}.
If you pass \code{cdec} to \code{latex.default}, keep in mind that the first one or
two positions (depending on \code{nmiss}) should have zeros since these
correspond with frequency counts. 
}
\item{v}{
vertical variable for \code{bystats2}.  Will be converted to \code{factor}.
}
\item{h}{
horizontal variable for \code{bystats2}.  Will be converted to \code{factor}.
}
\item{fun}{
a function to compute on the non-missing \code{y} for a given subset.
You must specify \code{fun=} in front of the function name or definition.
\code{fun} may return a single number or a vector or matrix of any length.
Matrix results are rolled out into a vector, with names preserved.
When \code{y} is a matrix, a common \code{fun} is \code{function(y) apply(y, 2, ff)}
where \code{ff} is the name of a function which operates on one column of
\code{y}.
}
\item{nmiss}{
A column containing a count of missing values is included if \code{nmiss=TRUE}
or if there is at least one missing value.
}
\item{subset}{
a vector of subscripts or logical values indicating the subset of
data to analyze
}
\item{abbreviate.dimnames}{set to \code{TRUE} to abbreviate
  \code{dimnames} in output}
\item{prefix.width}{see \code{\link{print.char.matrix}} if using S-Plus}
\item{title}{
\code{title} to pass to \code{latex.default}.  Default is the first word of
the character string version of the first calling argument.
}
\item{caption}{
caption to pass to \code{latex.default}.  Default is the \code{heading}
attribute from the object produced by \code{bystats}.
}
\item{rowlabel}{
\code{rowlabel} to pass to \code{latex.default}.  Default is the \code{byvarnames}
attribute from the object produced by \code{bystats}.  For \code{bystats2} the
default is \code{""}.
}
\item{x}{an object created by \code{bystats} or \code{bystats2}}
\item{object}{an object created by \code{bystats} or \code{bystats2}}
}
\value{
for \code{bystats}, a matrix with row names equal to the classification labels and column
names \code{N, Missing, funlab}, where \code{funlab} is determined from \code{fun}.
A row is added to the end with the summary statistics computed 
on all observations combined.  The class of this matrix is \code{bystats}.
For \code{bystats2}, returns a 3-dimensional array with the last dimension
corresponding to statistics being computed.  The class of the array is
\code{bystats2}.
}
\section{Side Effects}{
\code{latex} produces a \code{.tex} file.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{interaction}}, \code{\link{cut}}, \code{\link{cut2}}, \code{\link{latex}}, \code{\link{print.char.matrix}},
\code{\link{translate}}
}
\examples{
\dontrun{
bystats(sex==2, county, city)
bystats(death, race)
bystats(death, cut2(age,g=5), race)
bystats(cholesterol, cut2(age,g=4), sex, fun=median)
bystats(cholesterol, sex, fun=quantile)
bystats(cholesterol, sex, fun=function(x)c(Mean=mean(x),Median=median(x)))
latex(bystats(death,race,nmiss=FALSE,subset=sex=="female"), digits=2)
f <- function(y) c(Hazard=sum(y[,2])/sum(y[,1]))
# f() gets the hazard estimate for right-censored data from exponential dist.
bystats(cbind(d.time, death), race, sex, fun=f)
bystats(cbind(pressure, cholesterol), age.decile, 
        fun=function(y) c(Median.pressure   =median(y[,1]),
                          Median.cholesterol=median(y[,2])))
y <- cbind(pressure, cholesterol)
bystats(y, age.decile, 
        fun=function(y) apply(y, 2, median))   # same result as last one
bystats(y, age.decile, fun=function(y) apply(y, 2, quantile, c(.25,.75)))
# The last one computes separately the 0.25 and 0.75 quantiles of 2 vars.
latex(bystats2(death, race, sex, fun=table))
}
}
\keyword{category}
\concept{grouping}

\eof
\name{ciapower}
\alias{ciapower}
\title{
Power of Interaction Test for Exponential Survival
}
\description{
Uses the method of Peterson and George to compute the power of an
interaction test in a 2 x 2 setup in which all 4 distributions are
exponential.  This will be the same as the power of the Cox model
test if assumptions hold.  The test is 2-tailed.  
The duration of accrual is specified
(constant accrual is assumed), as is the minimum follow-up time.
The maximum follow-up time is then \code{accrual + tmin}.  Treatment
allocation is assumed to be 1:1.
}
\usage{
ciapower(tref, n1, n2, m1c, m2c, r1, r2, accrual, tmin, 
         alpha=0.05, pr=TRUE)
}
\arguments{
\item{tref}{
time at which mortalities are estimated
}
\item{n1}{
total sample size, stratum 1
}
\item{n2}{
total sample size, stratum 2
}
\item{m1c}{
tref-year mortality, stratum 1 control
}
\item{m2c}{
tref-year mortality, stratum 2 control
}
\item{r1}{
\% reduction in \code{m1c} by intervention, stratum 1
}
\item{r2}{
\% reduction in \code{m2c} by intervention, stratum 2
}
\item{accrual}{
duration of accrual period
}
\item{tmin}{
minimum follow-up time
}
\item{alpha}{
type I error probability
}
\item{pr}{
set to \code{FALSE} to suppress printing of details
}}
\value{
power
}
\section{Side Effects}{
prints
}
\section{AUTHOR}{
Frank Harrell


Department of Biostatistics


Vanderbilt University


f.harrell@vanderbilt.edu
}
\references{
Peterson B, George SL: Controlled Clinical Trials 14:511--522; 1993.
}
\seealso{
\code{\link{cpower}}, \code{\link{spower}}
}
\examples{
# Find the power of a race x treatment test.  25\% of patients will
# be non-white and the total sample size is 14000.  
# Accrual is for 1.5 years and minimum follow-up is 5y.
# Reduction in 5-year mortality is 15\% for whites, 0\% or -5\% for
# non-whites.  5-year mortality for control subjects is assumed to
# be 0.18 for whites, 0.23 for non-whites.
n <- 14000
for(nonwhite.reduction in c(0,-5)) {
  cat("\n\n\n\% Reduction in 5-year mortality for non-whites:",
      nonwhite.reduction, "\n\n")
  pow <- ciapower(5,  .75*n, .25*n,  .18, .23,  15, nonwhite.reduction,  
                  1.5, 5)
  cat("\n\nPower:",format(pow),"\n")
}
}
\keyword{survival}
\keyword{htest}
\concept{power}
\concept{study design}

\eof
\name{contents}
\alias{contents}
\alias{contents.data.frame}
\alias{print.contents.data.frame}
\alias{html.contents.data.frame}
\alias{contents.list}
\alias{print.contents.list}
\title{
Metadata for a Data Frame
}
\description{
\code{contents} is a generic function with methods for data frames and
for lists of data frames.  \code{contents.data.frame} creates an
object containing the following attributes of the variables 
from a data frame: names, labels (if any), units (if any), number of
factor levels (if any), factor levels,
class, storage mode, and number of NAs.  \code{print.contents.data.frame}
will print the results, with options for sorting the variables.
\code{html.contents.data.frame} creates HTML code for displaying the
results.  This code has hyperlinks so that if the user clicks on the
number of levels the browser jumps to the correct part of a table of
factor levels for all the \code{factor} variables.

\code{contents.list} prints a directory of datasets when
\code{\link{sasxport.get}} imported more than one SAS dataset.
}
\usage{
contents(object, \dots)
\method{contents}{data.frame}(object, \dots)
\method{print}{contents.data.frame}(x,
    sort=c('none','names','labels','NAs'), prlevels=TRUE, \dots) 
\method{html}{contents.data.frame}(object,  sort=c('none','names','labels','NAs'), prlevels=TRUE,
           file=paste('contents',object$dfname,'html',sep='.'),
           append=FALSE, \dots)
\method{contents}{list}(object, dslabels, \dots)
\method{print}{contents.list}(x,
    sort=c('none','names','labels','NAs','vars'), \dots)
}
\arguments{
  \item{object}{a data frame.  For \code{html} is an object created by
	\code{contents}.  For \code{contents.list} is a list of data frames.}
  \item{x}{an object created by \code{contents}}
  \item{sort}{
	Default is to print the variables in their original order in the
    data frame.  Specify one of 
	\code{"names"}, \code{"labels"}, or \code{"NAs"} to sort the variables by,
	respectively, alphabetically by names, alphabetically by labels, or by
	increasing order of number of missing values.  For
	\code{contents.list}, \code{sort} may also be the value
	\code{"vars"} to cause sorting by the number of variables in the dataset.
  }
  \item{prlevels}{
	set to \code{FALSE} to not print all levels of \code{factor} variables
  }
  \item{file}{file to which to write the html code.  Default is
   \code{"contents.dfname.html"} where \code{dfname} is the name of the data
   frame processed by \code{contents}.}
 \item{append}{set to \code{TRUE} to add html code to an existing file}
 \item{\dots}{arguments passed from \code{html} to \code{format.df},
unused otherwise}
 \item{dslabels}{named vector of SAS dataset labels, created for
    example by \code{\link{sasdsLabels}}}
}
\value{an object of class \code{"contents.data.frame"} or
\code{"contents.list"}}

\author{
Frank Harrell
\cr
Vanderbilt University
\cr
\email{f.harrell@vanderbilt.edu}
}
\seealso{
  \code{\link{describe}}, \code{\link{html}}
}
\examples{
set.seed(1)
dfr <- data.frame(x=rnorm(400),y=sample(c('male','female'),400,TRUE))
contents(dfr)
k <- contents(dfr)
print(k, sort='names', prlevels=FALSE)
\dontrun{
html(k)
html(contents(dfr))            # same result
w <- html(k, file='my.html')   # create my.html, don't display
}
}
\keyword{data}
\keyword{interface}


\eof
\name{cpower}
\alias{cpower}
\title{
Power of Cox/log-rank Two-Sample Test
}
\description{
Assumes exponential distributions for both treatment groups.
Uses the George-Desu method along with
formulas of Schoenfeld that allow estimation of the expected number of
events in the two groups.  
To allow for drop-ins (noncompliance to control therapy, crossover to
intervention) and noncompliance of the intervention, the method of
Lachin and Foulkes is used.
}
\usage{
cpower(tref, n, mc, r, accrual, tmin, noncomp.c=0, noncomp.i=0, 
       alpha=0.05, nc, ni, pr=TRUE)
}
\arguments{
\item{tref}{
time at which mortalities are estimated
}
\item{n}{
total sample size (both groups combined).  If allocation is unequal
so that there are not \code{n/2} observations in each group, you may specify
the sample sizes in \code{nc} and \code{ni}.
}
\item{mc}{
tref-year mortality, control
}
\item{r}{
\% reduction in \code{mc} by intervention
}
\item{accrual}{
duration of accrual period
}
\item{tmin}{
minimum follow-up time
}
\item{noncomp.c}{
\% non-compliant in control group (drop-ins)
}
\item{noncomp.i}{
\% non-compliant in intervention group (non-adherers)
}
\item{alpha}{
type I error probability.  A 2-tailed test is assumed.
}
\item{nc}{
number of subjects in control group
}
\item{ni}{
number of subjects in intervention group.  \code{nc} and \code{ni} are specified
exclusive of \code{n}.
}
\item{pr}{
set to \code{FALSE} to suppress printing of details
}}
\value{
power
}
\section{Side Effects}{
prints
}
\details{
For handling noncompliance, uses a modification of formula (5.4) of
Lachin and Foulkes.  Their method is based on a test for the difference
in two hazard rates, whereas \code{cpower} is based on testing the difference
in two log hazards.  It is assumed here that the same correction factor
can be approximately applied to the log hazard ratio as Lachin and Foulkes applied to
the hazard difference.


Note that Schoenfeld approximates the variance
of the log hazard ratio by \code{4/m}, where \code{m} is the total number of events,
whereas the George-Desu method uses the slightly better \code{1/m1 + 1/m2}.
Power from this function will thus differ slightly from that obtained with
the SAS \code{samsizc} program.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\references{
Peterson B, George SL: Controlled Clinical Trials 14:511--522; 1993.


Lachin JM, Foulkes MA: Biometrics 42:507--519; 1986.


Schoenfeld D: Biometrics 39:499--503; 1983.
}
\seealso{
\code{\link{spower}}, \code{\link{ciapower}}, \code{\link{bpower}}
}
\examples{
#In this example, 4 plots are drawn on one page, one plot for each
#combination of noncompliance percentage.  Within a plot, the
#5-year mortality \% in the control group is on the x-axis, and
#separate curves are drawn for several \% reductions in mortality
#with the intervention.  The accrual period is 1.5y, with all
#patients followed at least 5y and some 6.5y.


par(mfrow=c(2,2),oma=c(3,0,3,0))


morts <- seq(10,25,length=50)
red <- c(10,15,20,25)


for(noncomp in c(0,10,15,-1)) {
  if(noncomp>=0) nc.i <- nc.c <- noncomp else {nc.i <- 25; nc.c <- 15}
  z <- paste("Drop-in ",nc.c,"\%, Non-adherence ",nc.i,"\%",sep="")
  plot(0,0,xlim=range(morts),ylim=c(0,1),
           xlab="5-year Mortality in Control Patients (\%)",
           ylab="Power",type="n")
  title(z)
  cat(z,"\n")
  lty <- 0
  for(r in red) {
        lty <- lty+1
        power <- morts
        i <- 0
        for(m in morts) {
          i <- i+1
          power[i] <- cpower(5, 14000, m/100, r, 1.5, 5, nc.c, nc.i, pr=FALSE)
        }
        lines(morts, power, lty=lty)
  }
  if(noncomp==0)legend(18,.55,rev(paste(red,"\% reduction",sep="")),
           lty=4:1,bty="n")
}
mtitle("Power vs Non-Adherence for Main Comparison",
           ll="alpha=.05, 2-tailed, Total N=14000",cex.l=.8)
#
# Point sample size requirement vs. mortality reduction
# Root finder (uniroot()) assumes needed sample size is between
# 1000 and 40000
#
nc.i <- 25; nc.c <- 15; mort <- .18
red <- seq(10,25,by=.25)
samsiz <- red


i <- 0
for(r in red) {
  i <- i+1
  samsiz[i] <- uniroot(function(x) cpower(5, x, mort, r, 1.5, 5,
                                          nc.c, nc.i, pr=FALSE) - .8,
                       c(1000,40000))$root
}


samsiz <- samsiz/1000
par(mfrow=c(1,1))
plot(red, samsiz, xlab='\% Reduction in 5-Year Mortality',
	 ylab='Total Sample Size (Thousands)', type='n')
lines(red, samsiz, lwd=2)
title('Sample Size for Power=0.80\nDrop-in 15\%, Non-adherence 25\%')
title(sub='alpha=0.05, 2-tailed', adj=0)
}
\keyword{htest}
\keyword{survival}
\concept{power}
\concept{study design}

\eof
\name{cut2}
\alias{cut2}
\title{
Cut a Numeric Variable into Intervals
}
\description{
Function like cut but left endpoints are inclusive and labels are of
the form \code{[lower, upper)}, except that last interval is \code{[lower,upper]}.  
If cuts are given, will by default make sure that cuts include entire
range of \code{x}.
Also, if cuts are not given, will cut \code{x} into quantile groups 
(\code{g} given) or groups
with a given minimum number of observations (\code{m}).  Whereas cut creates a
category object, \code{cut2} creates a factor object.
}
\usage{
cut2(x, cuts, m, g, levels.mean, digits, minmax=TRUE, oneval=TRUE)
}
\arguments{
\item{x}{
numeric vector to classify into intervals
}
\item{cuts}{
cut points
}
\item{m}{
desired minimum number of observations in a group
}
\item{g}{
number of quantile groups
}
\item{levels.mean}{
set to \code{TRUE} to make the new categorical vector have levels attribute that is
the group means of \code{x} instead of interval endpoint labels
}
\item{digits}{
number of significant digits to use in constructing levels.  Default is 3
(5 if \code{levels.mean=TRUE})
}
\item{minmax}{
if cuts is specified but \code{min(x)<min(cuts)} or \code{max(x)>max(cuts)}, augments
cuts to include min and max \code{x}
}
\item{oneval}{
if an interval contains only one unique value, the interval will be
labeled with the formatted version of that value instead of the
interval endpoints, unless \code{oneval=FALSE}
}}
\value{
a factor variable with levels of the form \code{[a,b)} or formatted means
(character strings)
}
\seealso{
\code{\link{cut}}, \code{\link{quantile}}
}
\examples{
set.seed(1)
x <- runif(1000, 0, 100)
z <- cut2(x, c(10,20,30))
table(z)
table(cut2(x, g=10))      # quantile groups
table(cut2(x, m=50))      # group x into intervals with at least 50 obs.
}
\keyword{category}
\keyword{nonparametric}
\concept{grouping}
\concept{categorization}
\concept{discretization}

\eof
\name{data.frame.create.modify.check}
\alias{data.frame.create.modify.check}
\title{
Tips for Creating, Modifying, and Checking Data Frames
}
\description{
This help file contains a template for importing data to create an
S-Plus data frame, correcting some problems resulting from the import
and making the data frame be stored more efficiently, modifying the
data frame (including better annotating it and changing the names of
some of its variables), and checking and inspecting the data frame for
reasonableness of the values of its variables and to describe patterns
of missing data.  Various built-in functions and functions in the
Hmisc library are used.  At the end some methods for creating data
frames "from scratch" within S-Plus are presented.


The examples below attempt to clarify the separation of operations
that are done on a data frame as a whole, operations that are done on
a small subset of its variables without attaching the whole data
frame, and operations that are done on many variables after attaching
the data frame in search position one.  It also tries to clarify that
for analyzing several separate variables using S-Plus commands that do
not support a \code{data=} argument, it is helpful to attach the data frame
in a search position later than position one.


It is often useful to create, modify, and process datasets in the
following order.
\cr
- Import external data into a data frame (if the raw data do not
contain column names, provide these during the import if possible)
\cr
- Make global changes to a data frame (e.g., changing variable names)
\cr
- Change attributes or values of variables within a data frame
\cr
- Do analyses involving the whole data frame (without attaching it)
\cr
  (Data frame still in .Data)
\cr
- Do analyses of individual variables (after attaching the data frame
in search position two or later)
}
\details{
The examples below use the \code{FEV} dataset from Rosner B (1995):
\emph{Fundamentals of Biostatistics, 4th Edition.  }
New York: Duxbury Press.
Almost any dataset would do.  The jcetable data are taken from
Galobardes, et al. (1998), 
\emph{J Clin Epi}
51:875--881.


Presently, giving a variable the \code{"units"} attribute (using the Hmisc
\code{units} function) only benefits the Hmisc \code{describe} function and the Design
library's version of the \code{Surv} function.  Variable labels defined
with the Hmisc \code{label} function are used by \code{describe},
\code{summary.formula},  and many of the plotting functions in Hmisc and Design.
}
\references{
Alzola CF, Harrell FE (2001):
\emph{An Introduction to S-Plus and the Hmisc and Design Libraries.}
Chapters 3 and 4,
hesweb1.med.virginia.edu/biostat/s/doc/splus.pdf.
}
\seealso{
\code{\link{scan}}, \code{\link{read.table}}, \code{\link{cleanup.import}}, \code{\link{sas.get}}, \code{\link{data.frame}},
\code{\link{attach}}, \code{\link{detach}},
\code{\link{describe}}, \code{\link{datadensity}}, \code{\link{plot.data.frame}}, \code{\link{hist.data.frame}},
\code{\link{naclus}}, \code{\link{factor}}, \code{\link{label}}, \code{\link{units}}, \code{\link{names}}, \code{\link{expand.grid}},
\code{\link{summary.formula}}, \code{\link{summary.data.frame}}, \code{\link{casefold}}, \code{\link{edit}}, \code{\link{page}},
\code{\link{plot.data.frame}}, \code{\link{Cs}}, \code{\link{combine.levels}}, \code{\link{upData}}
}
\examples{
\dontrun{
# First, we do steps that create or manipulate the data
# frame in its entirety.  These are done with .Data
# in search position one (the S-Plus default at the
# start of the session).
#
# -----------------------------------------------------------------------
# Step 1: Create initial draft of data frame
# 
# We usually begin by importing a dataset from
# another application.  ASCII files may be imported
# using the scan and read.table functions.  SAS
# datasets may be imported using the Hmisc sas.get
# function (which will carry more attributes from
# SAS than using File \dots  Import) from the GUI
# menus.  But for most applications (especially
# Excel), File \dots Import will suffice.  If using
# the GUI, it is often best to provide variable
# names during the import process, using the Options
# tab, rather than renaming all fields later.  Of
# course, if the data to be imported already have
# field names (e.g., in Excel), let S-Plus use those
# automatically.  If using S-Plus 4.x on Windows/NT,
# you can use a command to execute File \dots  Import,
# e.g.:


import.data(FileName = "/windows/temp/fev.asc",
            FileType = "ASCII", DataFrame = "FEV")


# Here we name the new data frame FEV rather than
# fev, because we wanted to distinguish a variable
# in the data frame named fev from the data frame
# name.  For S-Plus 6.x the command will look
# instead like the following:


FEV <- importData("/tmp/fev.asc")




# -----------------------------------------------------------------------
# Step 2: Clean up data frame / make it be more
# efficiently stored
# 
# Unless using sas.get to import your dataset
# (sas.get already stores data efficiently), it is
# usually a good idea to run the data frame through
# the Hmisc cleanup.import function to change
# numeric variables that are always whole numbers to
# be stored as integers, the remaining numerics to
# single precision, strange values from Excel to
# NAs, and character variables that always contain
# legal numeric values to numeric variables.
# cleanup.import typically halves the size of the
# data frame.  If you do not specify any parameters
# to cleanup.import, the function assumes that no
# numeric variable needs more than 7 significant
# digits of precision, so all non-integer-valued
# variables will be converted to single precision.


FEV <- cleanup.import(FEV)




# -----------------------------------------------------------------------
# Step 3: Make global changes to the data frame
# 
# A data frame has attributes that are "external" to
# its variables.  There are the vector of its
# variable names ("names" attribute), the
# observation identifiers ("row.names"), and the
# "class" (whose value is "data.frame").  The
# "names" attribute is the one most commonly in need
# of modification.  If we had wanted to change all
# the variable names to lower case, we could have
# specified lowernames=TRUE to the cleanup.import
# invocation above, or type


names(FEV) <- casefold(names(FEV))


# The upData function can also be used to change
# variable names in two ways (see below).
# To change names in a non-systematic way we use
# other options.  Under Windows/NT the most
# straightforward approach is to change the names
# interactively.  Click on the data frame in the
# left panel of the Object Browser, then in the
# right pane click twice (slowly) on a variable.
# Use the left arrow and other keys to edit the
# name.  Click outside that name field to commit the
# change.  You can also rename columns while in a
# Data Sheet.  To instead use programming commands
# to change names, use something like:


names(FEV)[6] <- 'smoke'   # assumes you know the positions!  
names(FEV)[names(FEV)=='smoking'] <- 'smoke' 
names(FEV) <- edit(names(FEV))


# The last example is useful if you are changing
# many names.  But none of the interactive
# approaches such as edit() are handy if you will be
# re-importing the dataset after it is updated in
# its original application.  This problem can be
# addressed by saving the new names in a permanent
# vector in .Data:


new.names <- names(FEV)


# Then if the data are re-imported, you can type


names(FEV) <- new.names


# to rename the variables.




# -----------------------------------------------------------------------
# Step 4: Delete unneeded variables
# 
# To delete some of the variables, you can
# right-click on variable names in the Object
# Browser's right pane, then select Delete.  You can
# also set variables to have NULL values, which
# causes the system to delete them.  We don't need
# to delete any variables from FEV but suppose we
# did need to delete some from mydframe.


mydframe$x1 <- NULL 
mydframe$x2 <- NULL
mydframe[c('age','sex')] <- NULL   # delete 2 variables 
mydframe[Cs(age,sex)]    <- NULL   # same thing


# The last example uses the Hmisc short-cut quoting
# function Cs.  See also the drop parameter to upData.




# -----------------------------------------------------------------------
# Step 5: Make changes to individual variables
#         within the data frame
# 
# After importing data, the resulting variables are
# seldom self-documenting, so we commonly need to
# change or enhance attributes of individual
# variables within the data frame.
# 
# If you are only changing a few variables, it is
# efficient to change them directly without
# attaching the entire data frame.


FEV$sex   <- factor(FEV$sex,   0:1, c('female','male')) 
FEV$smoke <- factor(FEV$smoke, 0:1, 
                    c('non-current smoker','current smoker')) 
units(FEV$age)    <- 'years'
units(FEV$fev)    <- 'L' 
label(FEV$fev)    <- 'Forced Expiratory Volume' 
units(FEV$height) <- 'inches'


# When changing more than one or two variables it is
# more convenient to change the data frame using the
# Hmisc upData function.


FEV2 <- upData(FEV,
  rename=c(smoking='smoke'), 
  # omit if renamed above
  drop=c('var1','var2'),
  levels=list(sex  =list(female=0,male=1),
              smoke=list('non-current smoker'=0,
                         'current smoker'=1)),
  units=list(age='years', fev='L', height='inches'),
  labels=list(fev='Forced Expiratory Volume'))


# An alternative to levels=list(\dots) is for example
# upData(FEV, sex=factor(sex,0:1,c('female','male'))).
# 
# Note that we saved the changed data frame into a
# new data frame FEV2.  If we were confident of the
# correctness of our changes we could have stored
# the new data frame on top of the old one, under
# the original name FEV.


# -----------------------------------------------------------------------
# Step 6:  Check the data frame
# 
# The Hmisc describe function is perhaps the first
# function that should be used on the new data
# frame.  It provides documentation of all the
# variables, and its frequency tabulations, counts
# of NAs, and 5 largest and smallest values are
# helpful in detecting data errors.  Typing
# describe(FEV) will write the results to the
# current output window.  To put the results in a
# new window that can persist, even upon exiting
# S-Plus, we use the page function.  The describe
# output can be minimized to an icon but kept ready
# for guiding later steps of the analysis.


page(describe(FEV2), multi=TRUE) 
# multi=TRUE allows that window to persist while
# control is returned to other windows


# The new data frame is OK.  Store it on top of the
# old FEV and then use the graphical user interface
# to delete FEV2 (click on it and hit the Delete
# key) or type rm(FEV2) after the next statement.


FEV <- FEV2


# Next, we can use a variety of other functions to
# check and describe all of the variables.  As we
# are analyzing all or almost all of the variables,
# this is best done without attaching the data
# frame.  Note that plot.data.frame plots inverted
# CDFs for continuous variables and dot plots
# showing frequency distributions of categorical
# ones.


summary(FEV)
# basic summary function (summary.data.frame) 


plot(FEV)                # plot.data.frame 
datadensity(FEV)         
# rug plots and freq. bar charts for all var.


hist.data.frame(FEV)     
# for variables having > 2 values 


by(FEV, FEV$smoke, summary)  
# use basic summary function with stratification




# -----------------------------------------------------------------------
# Step 7:  Do detailed analyses involving individual
#          variables
# 
# Analyses based on the formula language can use
# data= so attaching the data frame may not be
# required.  This saves memory.  Here we use the
# Hmisc summary.formula function to compute 5
# statistics on height, stratified separately by age
# quartile and by sex.


options(width=80) 
summary(height ~ age + sex, data=FEV,
        fun=function(y)c(smean.sd(y),
                         smedian.hilow(y,conf.int=.5)))
# This computes mean height, S.D., median, outer quartiles


fit <- lm(height ~ age*sex, data=FEV) 
summary(fit)


# For this analysis we could also have attached the
# data frame in search position 2.  For other
# analyses, it is mandatory to attach the data frame
# unless FEV$ prefixes each variable name.
# Important: DO NOT USE attach(FEV, 1) or
# attach(FEV, pos=1, \dots) if you are only analyzing
# and not changing the variables, unless you really
# need to avoid conflicts with variables in search
# position 1 that have the same names as the
# variables in FEV.  Attaching into search position
# 1 will cause S-Plus to be more of a memory hog.


attach(FEV)
# Use e.g. attach(FEV[,Cs(age,sex)]) if you only
# want to analyze a small subset of the variables
# Use e.g. attach(FEV[FEV$sex=='male',]) to
# analyze a subset of the observations


summary(height ~ age + sex,
        fun=function(y)c(smean.sd(y),
          smedian.hilow(y,conf.int=.5)))
fit <- lm(height ~ age*sex)


# Run generic summary function on height and fev, 
# stratified by sex
by(data.frame(height,fev), sex, summary)


# Cross-classify into 4 sex x smoke groups
by(FEV, list(sex,smoke), summary)


# Plot 5 quantiles
s <- summary(fev ~ age + sex + height,
              fun=function(y)quantile(y,c(.1,.25,.5,.75,.9)))


plot(s, which=1:5, pch=c(1,2,15,2,1), #pch=c('=','[','o',']','='), 
     main='A Discovery', xlab='FEV')


# Use the nonparametric bootstrap to compute a 
# 0.95 confidence interval for the population mean fev
smean.cl.boot(fev)    # in Hmisc


# Use the Statistics \dots Compare Samples \dots One Sample 
# keys to get a normal-theory-based C.I.  Then do it 
# more manually.  The following method assumes that 
# there are no NAs in fev


sd <- sqrt(var(fev))
xbar <- mean(fev)
xbar
sd
n <- length(fev)
qt(.975,n-1)     
# prints 0.975 critical value of t dist. with n-1 d.f.


xbar + c(-1,1)*sd/sqrt(n)*qt(.975,n-1)   
# prints confidence limits


# Fit a linear model
# fit <- lm(fev ~ other variables \dots)


detach()


# The last command is only needed if you want to
# start operating on another data frame and you want
# to get FEV out of the way.




# -----------------------------------------------------------------------
# Creating data frames from scratch
# 
# Data frames can be created from within S-Plus.  To
# create a small data frame containing ordinary
# data, you can use something like


dframe <- data.frame(age=c(10,20,30), 
                     sex=c('male','female','male'))


# You can also create a data frame using the Data
# Sheet.  Create an empty data frame with the
# correct variable names and types, then edit in the
# data.


dd <- data.frame(age=numeric(0),sex=character(0))


# The sex variable will be stored as a factor, and
# levels will be automatically added to it as you
# define new values for sex in the Data Sheet's sex
# column.
# 
# When the data frame you need to create is defined
# by systematically varying variables (e.g., all
# possible combinations of values of each variable),
# the expand.grid function is useful for quickly
# creating the data.  Then you can add
# non-systematically-varying variables to the object
# created by expand.grid, using programming
# statements or editing the Data Sheet.  This
# process is useful for creating a data frame
# representing all the values in a printed table.
# In what follows we create a data frame
# representing the combinations of values from an 8
# x 2 x 2 x 2 (event x method x sex x what) table,
# and add a non-systematic variable percent to the
# data.


jcetable <- expand.grid(
 event=c('Wheezing at any time',
         'Wheezing and breathless',
         'Wheezing without a cold',
         'Waking with tightness in the chest',
         'Waking with shortness of breath',
         'Waking with an attack of cough',
         'Attack of asthma',
         'Use of medication'),
 method=c('Mail','Telephone'), 
 sex=c('Male','Female'),
 what=c('Sensitivity','Specificity'))


jcetable$percent <- 
c(756,618,706,422,356,578,289,333,
  576,421,789,273,273,212,212,212,
  613,763,713,403,377,541,290,226,
  613,684,632,290,387,613,258,129,
  656,597,438,780,732,679,938,919,
  714,600,494,877,850,703,963,987,
  755,420,480,794,779,647,956,941,
  766,423,500,833,833,604,955,986) / 10


# In jcetable, event varies most rapidly, then
# method, then sex, and what.
}
}
\keyword{data}
\keyword{manip}
\keyword{programming}
\keyword{interface}
\keyword{htest}
\concept{overview}

\eof
\name{dataRep}
\alias{dataRep}
\alias{print.dataRep}
\alias{predict.dataRep}
\alias{print.predict.dataRep}
\alias{roundN}
\alias{[.roundN}
\title{
Representativeness of Observations in a Data Set
}
\description{
These functions are intended to be used to describe how well a given
set of new observations (e.g., new subjects) were represented in a
dataset used to develop a predictive model.
The \code{dataRep} function forms a data frame that contains all the unique
combinations of variable values that existed in a given set of
variable values.  Cross-classifications of values are created using
exact values of variables, so for continuous numeric variables it is
often necessary to round them to the nearest \code{v} and to possibly
curtail the values to some lower and upper limit before rounding.
Here \code{v} denotes a numeric constant specifying the matching tolerance
that will be used.  \code{dataRep} also stores marginal distribution
summaries for all the variables.  For numeric variables, all 101
percentiles are stored, and for all variables, the frequency
distributions are also stored (frequencies are computed after any
rounding and curtailment of numeric variables).  For the purposes of
rounding and curtailing, the \code{roundN} function is provided.  A \code{print}
method will summarize the calculations made by \code{dataRep}, and if
\code{long=TRUE} all unique combinations of values and their frequencies in
the original dataset are printed.

The \code{predict} method for \code{dataRep} takes a new data frame having
variables named the same as the original ones (but whose factor levels
are not necessarily in the same order) and examines the collapsed
cross-classifications created by \code{dataRep} to find how many
observations were similar to each of the new observations after any
rounding or curtailment of limits is done.  \code{predict} also does some
calculations to describe how the variable values of the new
observations "stack up" against the marginal distributions of the
original data.  For categorical variables, the percent of observations
having a given variable with the value of the new observation (after
rounding for variables that were passed through \code{roundN} in the formula given
to \code{dataRep}) is computed.  For numeric variables, the percentile of
the original distribution in which the current value falls will be
computed.  For this purpose, the data are not rounded because the 101
original percentiles were retained; linear interpolation is used to
estimate percentiles for values between two tabulated percentiles.
The lowest marginal frequency of matching values across all variables
is also computed.  For example, if an age, sex combination matches 10
subjects in the original dataset but the age value matches 100 ages
(after rounding) and the sex value matches the sex code of 300
observations, the lowest marginal frequency is 100, which is a "best
case" upper limit for multivariable matching.  I.e., matching on all
variables has to result in a frequency no higher than this amount.
A \code{print} method for the output of \code{predict.dataRep} prints all
calculations done by \code{predict} by default.  Calculations can be
selectively suppressed.
}
\usage{
dataRep(formula, data, subset, na.action)

roundN(x, tol=1, clip=NULL)

\method{print}{dataRep}(x, long=FALSE, \dots)

\method{predict}{dataRep}(object, newdata, \dots)

\method{print}{predict.dataRep}(x, prdata=TRUE, prpct=TRUE, \dots)
}
\arguments{
\item{formula}{
a formula with no left-hand-side.  Continuous numeric variables in
need of rounding should appear in the formula as e.g. \code{roundN(x,5)} to
have a tolerance of e.g. +/- 2.5 in matching.  Factor or character
variables as well as numeric ones not passed through \code{roundN} are
matched on exactly.
}
\item{x}{
a numeric vector or an object created by \code{dataRep}
}
\item{object}{
the object created by \code{dataRep} or \code{predict.dataRep}
}
\item{data, subset, na.action}{
standard modeling arguments.  Default \code{na.action} is \code{na.delete},
i.e., observations in the original dataset having any variables
missing are deleted up front.
}
\item{tol}{
rounding constant (tolerance is actually \code{tol/2} as values are rounded
to the nearest \code{tol})
}
\item{clip}{
a 2-vector specifying a lower and upper limit to curtail values of \code{x}
before rounding
}
\item{long}{
set to \code{TRUE} to see all unique combinations and frequency count
}
\item{newdata}{
a data frame containing all the variables given to \code{dataRep} but not
necessarily in the same order or having factor levels in the same order
}
\item{prdata}{
set to \code{FALSE} to suppress printing \code{newdata} and the count of matching
observations (plus the worst-case marginal frequency). 
}
\item{prpct}{set to \code{FALSE} to not print percentiles and percents}
\item{\dots}{unused}
}
\value{
\code{dataRep} returns a list of class \code{"dataRep"} containing the collapsed
data frame and frequency counts along with marginal distribution
information.  \code{predict} returns an object of class \code{"predict.dataRep"}
containing information determined by matching observations in
\code{newdata} with the original (collapsed) data.
}
\section{Side Effects}{
\code{print.dataRep} prints.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University School of Medicine
\cr
\email{f.harrell@vanderbilt.edu}
}
\seealso{
\code{\link{round}}, \code{\link{table}}
}
\examples{
# Simulate 1000 subjects: a symptom count, a sex factor, and a
# continuous variable x drawn from runif (default range 0 to 1)
set.seed(13)
num.symptoms <- sample(1:4, 1000,TRUE)
sex <- factor(sample(c('female','male'), 1000,TRUE))
x    <- runif(1000)
# Introduce one missing value; with the default na.action (na.delete)
# dataRep deletes observations having any missing variable
x[1] <- NA
# For comparison, cross-tabulate using x rounded to the nearest 0.25
table(num.symptoms, sex, .25*round(x/.25))


# Collapse the data to its unique combinations of values, rounding x
# to the nearest 0.25 (i.e., a matching tolerance of 0.25/2)
d <- dataRep(~ num.symptoms + sex + roundN(x,.25))
# long=TRUE also prints every unique combination and its frequency
print(d, long=TRUE)


# See how well three new observations were represented in the original
# data.  The third has x=1.5, which lies above every simulated x.
predict(d, data.frame(num.symptoms=1:3, sex=c('male','male','female'),
                      x=c(.03,.5,1.5)))
}
\keyword{datasets}
\keyword{category}
\keyword{cluster}
\keyword{manip}
\keyword{models}
% Converted by Sd2Rd version 1.21.

\eof
\name{deff}
\alias{deff}
\title{
Design Effect and Intra-cluster Correlation
}
\description{
Computes the Kish design effect and corresponding intra-cluster correlation
for a single cluster-sampled variable
}
\usage{
deff(y, cluster)
}
\arguments{
\item{y}{
variable to analyze
}
\item{cluster}{
a variable whose unique values indicate cluster membership.  Any
type of variable is allowed.
}}
\value{
a vector with named elements \code{n} (total number of non-missing observations),
 \code{clusters} (number of clusters after deleting missing data), \code{rho}
(intra-cluster correlation), and \code{deff} (design effect).
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
\email{f.harrell@vanderbilt.edu}
}
\seealso{
\code{\link[Design]{bootcov}}, \code{\link[Design]{robcov}}
}
\examples{
# Simulate 1000 blood pressures, each assigned at random to a clinic
# labelled by a lower-case letter; clinic is the cluster variable
set.seed(1)
blood.pressure <- rnorm(1000, 120, 15)
clinic <- sample(letters, 1000, replace=TRUE)
# Result is a vector with named elements n, clusters, rho, and deff
deff(blood.pressure, clinic)
}
\keyword{htest}
\concept{study design}
\concept{cluster sampling}

\eof
\name{describe}
\alias{describe}
\alias{describe.default}
\alias{describe.vector}
\alias{describe.matrix}
\alias{describe.formula}
\alias{describe.data.frame}
\alias{print.describe}
\alias{print.describe.single}
\alias{[.describe}
\alias{latex.describe}
\alias{latex.describe.single}
\title{
Concise Statistical Description of a Vector, Matrix, Data Frame, or Formula
}
\description{
\code{describe} is a generic method that invokes \code{describe.data.frame},
\code{describe.matrix}, \code{describe.vector}, or
\code{describe.formula}. \code{describe.vector} is the basic 
function for handling a single variable.
This function determines whether the variable is character, factor,
category, binary, discrete numeric, or continuous numeric, and prints
a concise statistical summary according to each. A numeric variable is
deemed discrete if it has <= 10 unique values. In this case,
quantiles are not printed. A frequency table is printed 
for any non-binary variable if it has no more than 20 unique
values.  For any variable with at least 20 unique values, the 5 lowest
and highest values are printed.  \code{describe} is especially useful for
describing data frames created by \code{sas.get}, as SAS labels, formats,
value labels, and frequencies of special missing values are printed.

For a binary variable, the sum (number of 1's) and mean (proportion of
1's) are printed. If the first argument is a formula, a model frame
is created and passed to \code{describe.data.frame}.  If a variable
is of class \code{"impute"}, a count of the number of imputed values is
printed.  If a date variable has an attribute \code{partial.date}
(this is set up by \code{sas.get}), counts of how many partial dates are
actually present (missing month, missing day, missing both) are also presented.
If a variable was created by the special-purpose function \code{substi} (which
substitutes values of a second variable if the first variable is NA),
the frequency table of substitutions is also printed.  

A latex method
exists for converting the \code{describe} object to a LaTeX file.  For
numeric variables having at least 20 unique values, \code{describe} saves
in its returned object the frequencies of 100 evenly spaced bins
running from minimum observed value to the maximum.  \code{latex} inserts a
spike histogram displaying these frequency counts in the tabular
material using the LaTeX picture environment.  For example output see
\url{hesweb1.med.virginia.edu/s/doc/describe.example.pdf}.

Sample weights may be specified to any of the functions, resulting
in weighted means, quantiles, and frequency tables.
}
\usage{
\method{describe}{vector}(x, descript, exclude.missing=TRUE, digits=4,
         weights, normwt, \dots)
\method{describe}{matrix}(x, descript, exclude.missing=TRUE, digits=4, \dots)
\method{describe}{data.frame}(x, descript, exclude.missing=TRUE,
    digits=4, \dots)
\method{describe}{formula}(x, descript, data, subset, na.action,
    digits=4, weights, \dots)
\method{print}{describe}(x, condense=TRUE, \dots)
\method{latex}{describe}(object, title=NULL, condense=TRUE, 
      file=paste('describe',first.word(expr=attr(object,'descript')),'tex',sep='.'),
      append=FALSE, size='small', tabular=TRUE, \dots)
\method{latex}{describe.single}(object, title=NULL, condense=TRUE, vname,
      file, append=FALSE, size='small', tabular=TRUE, \dots)
}
\arguments{
\item{x}{
  a data frame, matrix, vector, or formula.  For a data frame, the 
  \code{describe.data.frame}
  function is automatically invoked.  For a matrix, \code{describe.matrix} is
  called.  For a formula, \code{describe.data.frame(model.frame(x))}
  is invoked. The formula may or may not have a response variable.  For
  \code{print} or \code{latex}, \code{x} is an object created by
  \code{describe}.
}
\item{descript}{
  optional title to print for x. The default is the name of the argument
  or the "label" attributes of individual variables. When the first argument
  is a formula, \code{descript} defaults to a character representation of
  the formula.
}
\item{exclude.missing}{
  set to \code{TRUE} to print the names of variables that contain only missing values.
  This list appears at the bottom of the printout, and no space is taken
  up for such variables in the main listing.
}
\item{digits}{
  number of significant digits to print
}
\item{weights}{
  a numeric vector of frequencies or sample weights.  Each observation
  will be treated as if it were sampled \code{weights} times.
}
\item{normwt}{
  The default, \code{normwt=FALSE} results in the use of \code{weights} as
  weights in computing various statistics.  In this case the sample size
  is assumed to be equal to the sum of \code{weights}.  Specify
  \code{normwt=TRUE} to divide 
  \code{weights} by a constant so that \code{weights} sum to the number of
  observations (length of vectors specified to \code{describe}).  In this
  case the number of observations is taken to be the actual number of
  records given to \code{describe}.
}
\item{object}{a result of \code{describe}}
\item{title}{unused}
\item{condense}{
  default is \code{TRUE} to condense the output with regard to the 5 lowest and
  highest values and the frequency table
}
\item{data}{
}
\item{subset}{
}
\item{na.action}{
  These are used if a formula is specified.  \code{na.action} defaults to
  \code{na.retain} which does not delete any \code{NA}s from the data frame.
  Use \code{na.action=na.omit} or \code{na.delete} to drop any observation with
  any \code{NA} before processing.
}
\item{\dots}{
  arguments passed to \code{describe.default} which are passed to calls
  to \code{format} for numeric variables.  For example if using R
  \code{POSIXct} date/time formats, specifying
  \code{describe(d,format='\%d\%b\%y')} will print date/time variables as
  \code{"01Jan2000"}.  This is useful for omitting the time
  component.  See the help file for \code{format.POSIXct} for more
  information.  For \code{latex} methods, \dots is ignored.}
\item{file}{
name of output file (should have a suffix of .tex).  Default name is
formed from the first word of the \code{descript} element of the
\code{describe} object, prefixed by \code{"describe"}.  Set
\code{file=""} to send LaTeX code to standard output instead of a file.
}
\item{append}{
set to \code{TRUE} to have \code{latex} append text to an existing file
named \code{file}
}
\item{size}{
LaTeX text size (\code{"small"}, the default, or \code{"normalsize"}, \code{"tiny"},
\code{"scriptsize"}, etc.) for the \code{describe} output in LaTeX.
}
\item{tabular}{
  set to \code{FALSE} to use verbatim rather than tabular environment
  for the summary statistics output.  By default, tabular is used if the
  output is not too wide.}
\item{vname}{unused argument in \code{latex.describe.single}}
}
\value{
a list containing elements \code{descript}, \code{counts},
\code{values}.  The list  is of class \code{describe}.  If the input
object was a matrix or a data 
frame, the list is a list of lists, one list for each variable
analyzed. \code{latex} returns a standard \code{latex} object.  For numeric
variables having at least 20 unique values, an additional component
\code{intervalFreq} is returned.  This component is a list with two elements, \code{range}
(containing two values) and \code{count}, a vector of 100 integer frequency
counts.
}
\details{
If \code{options(na.detail.response=TRUE)}
has been set and \code{na.action} is \code{"na.delete"} or
\code{"na.keep"}, summary  statistics on
the response variable are printed separately for missing and non-missing
values of each predictor.  The default summary function returns
the number of non-missing response values and the mean of the last
column of the response values, with a \code{names} attribute of \code{c("N","Mean")}.
When the response is a \code{Surv} object and the mean is used, this will
result in the crude proportion of events being used to summarize
the response.  The actual summary function can be designated through
\code{options(na.fun.response = "function name")}.
}
\author{
Frank Harrell
\cr
Vanderbilt University
\cr
\email{f.harrell@vanderbilt.edu}
}
\seealso{
\code{\link{sas.get}}, \code{\link{quantile}}, \code{\link{table}}, \code{\link{summary}},  \code{\link{model.frame.default}},
\code{\link{naprint}}, \code{\link{lapply}}, \code{\link{tapply}}, \code{\link{Surv}}, \code{\link{na.delete}}, \code{\link{na.keep}},
\code{\link{na.detail.response}}, \code{\link{latex}}
}
\examples{
set.seed(1)
# dig=2 is matched to the digits argument (number of significant digits)
describe(runif(200),dig=2)    #single variable, continuous
                              #get quantiles .05,.10,\dots

# A data frame with one continuous variable and one two-valued variable;
# for a data frame, describe.data.frame is invoked automatically
dfr <- data.frame(x=rnorm(400),y=sample(c('male','female'),400,TRUE))
describe(dfr)

\dontrun{
# Not run: these examples rely on data and objects that are not created here
d <- sas.get(".","mydata",special.miss=TRUE,recode=TRUE)
describe(d)      #describe entire data frame
attach(d, 1)
describe(relig)  #Has special missing values .D .F .M .R .T
                 #attr(relig,"label") is "Religious preference"

# Sample output from describe(relig):
#relig : Religious preference  Format:relig
#    n missing  D  F M R T unique 
# 4038     263 45 33 7 2 1      8
#
#0:none (251, 6\%), 1:Jewish (372, 9\%), 2:Catholic (1230, 30\%) 
#3:Jehovah's Witnes (25, 1\%), 4:Christ Scientist (7, 0\%) 
#5:Seventh Day Adv (17, 0\%), 6:Protestant (2025, 50\%), 7:other (111, 3\%) 


# Method for describing part of a data frame:
 describe(death.time ~ age*sex + rcs(blood.pressure))
 describe(~ age+sex)
 describe(~ age+sex, weights=freqs)  # weighted analysis

# Describe the variables used in a fitted model, via its formula
 fit <- lrm(y ~ age*sex + log(height))
 describe(formula(fit))
 describe(y ~ age*sex, na.action=na.delete)   
# report on number deleted for each variable
 options(na.detail.response=TRUE)  
# keep missings separately for each x, report on dist of y by x=NA
 describe(y ~ age*sex)
 options(na.fun.response="quantile")
 describe(y ~ age*sex)   # same but use quantiles of y by x=NA

# Save the describe object, then print selected parts of it
 d <- describe(my.data.frame)
 d$age                   # print description for just age
 d[c('age','sex')]       # print description for two variables
 d[sort(names(d))]       # print in alphabetic order by var. names
 d2 <- d[20:30]          # keep variables 20-30
 page(d2)                # pop-up window for these variables

# Test date/time formats and suppression of times when they don't vary
 library(chron)
 d <- data.frame(a=chron((1:20)+.1),
                 b=chron((1:20)+(1:20)/100),
                 d=ISOdatetime(year=rep(2003,20),month=rep(4,20),day=1:20,
                               hour=rep(11,20),min=rep(17,20),sec=rep(11,20)),
                 f=ISOdatetime(year=rep(2003,20),month=rep(4,20),day=1:20,
                               hour=1:20,min=1:20,sec=1:20),
                 g=ISOdate(year=2001:2020,month=rep(3,20),day=1:20))
 describe(d)

}
}
\keyword{interface}
\keyword{nonparametric}
\keyword{category}
\keyword{distribution}
\keyword{robust}
\keyword{models}


\eof
\name{dotchart2}
\alias{dotchart2}
\title{
Enhanced Dot Chart
}
\description{
\code{dotchart2} is an enhanced version of the \code{dotchart} function 
with several new options.
}
\usage{
dotchart2(data, labels, groups, gdata, horizontal=TRUE, pch=16,
          xlab='', ylab='', auxdata, auxgdata=NULL, auxtitle,
          lty=if(.R.) 1 else 2, lines=TRUE, dotsize = .8,
          cex = par("cex"), cex.labels = cex,
          cex.group.labels = cex.labels*1.25, sort.=TRUE, 
	      add=FALSE, dotfont=par('font'), groupfont=if(under.unix)5 else 1, 
	      reset.par=add, xaxis=TRUE, width.factor=if(.R.)1.5 else 1,
          lcolor=if(.R.)'gray' else par('col'), ...)
}
\arguments{
  \item{data}{a numeric vector whose values are shown on the x-axis}
  \item{labels}{a vector of labels for each point, corresponding to
	\code{data}.  If omitted, \code{names(data)} are used, and if there are
	no \code{names}, integers prefixed by \code{"#"} are used.}
  \item{groups}{an optional categorical variable indicating how
	\code{data} values are grouped}
  \item{gdata}{data values for groups, typically summaries such as group
	medians}
  \item{horizontal}{set to \code{FALSE} to make the chart vertical
	instead of the default}
  \item{pch}{
	default character number or value for plotting dots in dot charts.
	The default is 16.}
  \item{xlab}{x-axis title}
  \item{ylab}{y-axis title}
  \item{auxdata}{
	a vector of auxiliary data given to \code{dotchart2}, of the same length
	as the first (\code{data}) argument.  If present, this
	vector of values will be printed outside the right margin of the dot
	chart.  Usually \code{auxdata} represents cell sizes.
  }
  \item{auxgdata}{
	similar to \code{auxdata} but corresponding to the \code{gdata}
	argument.  These usually represent overall sample sizes for each
	group of lines.}
  \item{auxtitle}{
	if \code{auxdata} is given, \code{auxtitle} specifies a column
	heading for the extra printed data in the chart, e.g., \code{"N"}}
  \item{lty}{line type for horizontal lines.  Default is 1 for R, 2 for S-Plus}
  \item{lines}{set to \code{FALSE} to suppress drawing of reference
	lines}
  \item{dotsize}{
	\code{cex} value for drawing dots.  Default is 0.8.  Note that the original
	\code{dotchart} function used a default of 1.2.}
  \item{cex}{see \code{\link{par}}}
  \item{cex.labels}{
	\code{cex} parameter that applies only to the line labels for the
	dot chart (the \code{cex} parameter for major grouping labels for
	\code{dotchart2} is \code{cex.group.labels}).  Defaults to \code{cex}.}
  \item{cex.group.labels}{value of \code{cex} corresponding to \code{gdata}}
  \item{sort.}{
	set to \code{FALSE} to keep \code{dotchart2} from sorting the input
	data, i.e., it will assume that the data are already properly
	arranged.  This is especially useful when you are using \code{gdata}
	and \code{groups} and you want to control the
	order that groups appear on the chart (from top to bottom).}
  \item{add}{set to \code{TRUE} to add to an existing plot}
  \item{dotfont}{
	font number of plotting dots.  Default is one.  Use \code{-1} to
	use "outline" fonts.  For example, \code{pch=183, dotfont=-1}
	plots an open circle for UNIX on postscript.  \code{pch=1} makes
	an open octagon under Windows.}
  \item{groupfont}{
	font number to use in drawing \code{group} labels for \code{dotchart2}.
	Default is \code{5} for UNIX, which is usually Helvetica bold.  For
	Microsoft Windows, the default is \code{1}.  The font number 
	corresponding to some bold font is recommended, if you can figure this
	out for Windows S-Plus.
  }
  \item{reset.par}{
	set to \code{FALSE} to cause \code{dotchart2} to not reset the \code{par}
	parameters when finished.  This is useful when \code{add=TRUE} is about to
	be used in another call.  The default is to reset the \code{par}
	parameters if \code{add=TRUE} and not if \code{add=FALSE}, i.e., the
	program assumes that only one set of points will be added to an
	existing set.  If you fail to use \code{reset.par=TRUE} for the 
	first of a series of plots, the next call to \code{plot} with
	\code{add=TRUE} will result in distorted x-axis scaling.}
  \item{xaxis}{set to \code{FALSE} to suppress drawing x-axis}
  \item{width.factor}{
	When the calculated left margin turns out to be faulty, specify a
	factor by which to multiply the left margin as \code{width.factor} to get
	the appropriate space for labels on horizontal charts.}
  \item{lcolor}{
	color for horizontal reference lines.  Default is \code{"gray"} for R,
	\code{par("col")} for S-Plus.}
  \item{...}{arguments passed to \code{plot.default}}
}
\section{Side Effects}{
\code{dotchart2} will leave \code{par} altered if \code{reset.par=FALSE}.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
\email{f.harrell@vanderbilt.edu}
}
\seealso{
\code{\link{dotchart}}
}
\keyword{hplot}

\eof
\name{ecdf}
\alias{ecdf}
\alias{ecdf.default}
\alias{ecdf.data.frame}
\alias{ecdf.formula}
\alias{panel.ecdf}
\alias{prepanel.ecdf}
\title{
Empirical Cumulative Distribution Plot
}
\description{
Computes coordinates of cumulative distribution function of x, and by default
plots it as a step function.  A grouping variable may be specified so that
stratified estimates are computed and (by default) plotted.  If there is
more than one group, the \code{labcurve} function is used (by default) to label
the multiple step functions or to draw a legend defining line types, colors,
or symbols by linking them with group labels.  A \code{weights} vector may
be specified to get weighted estimates.  Specify \code{normwt} to make
\code{weights} sum to the length of \code{x} (after removing NAs).  Otherwise
the total sample size is taken to be the sum of the weights.

\code{ecdf} is actually a method, and \code{ecdf.default} is what's
called for a vector argument.  \code{ecdf.data.frame} is called when the
first argument is a data frame.  This function can automatically set up
a matrix of ECDFs and wait for a mouse click if the matrix requires more
than one page.  Categorical variables, character variables, and
variables having fewer than a set number of unique values are ignored.
If \code{par(mfrow=..)} is not set up before \code{ecdf.data.frame} is
called, the function will try to figure the best layout depending on the
number of variables in the data frame.  Upon return the original
\code{mfrow} is left intact.

When the first argument to \code{ecdf} is a formula, a Trellis/Lattice function
\code{ecdf.formula} is called.  This allows for multi-panel
conditioning, superposition using a \code{groups} variable, and other
Trellis features, along with the ability to easily plot transformed
ECDFs using the \code{fun} argument.  For example, if \code{fun=qnorm},
the inverse normal transformation will be used for the y-axis.  If the
transformed curves are linear this indicates normality.  Like the
\code{xYplot} function, \code{ecdf} will create a function \code{Key} if
the \code{groups} variable is used.  This function can be invoked by the
user to define the keys for the groups.
}

\usage{
ecdf(x, \dots)

\method{ecdf}{default}(x, what=c('F','1-F','f'), weights, normwt=FALSE,
     xlab, ylab, q, pl=TRUE, add=FALSE, lty=1, 
     col=1, group=rep(1,length(x)), label.curves=TRUE, xlim, 
     subtitles=TRUE, datadensity=c('none','rug','hist','density'),
     side=1, 
     frac=switch(datadensity,none=NA,rug=.03,hist=.1,density=.1),
     dens.opts=NULL, lwd, \dots)


\method{ecdf}{data.frame}(x, group=rep(1,nrows), weights, normwt,
     label.curves=TRUE, n.unique=10, na.big=FALSE, subtitles=TRUE, 
     vnames=c('labels','names'),\dots)

\method{ecdf}{formula}(x, data, groups, prepanel=prepanel.ecdf,
     panel=panel.ecdf, \dots, xlab, ylab, fun=function(x)x, subset=TRUE)
}
\arguments{
\item{x}{a numeric vector, data frame, or Trellis/Lattice formula}
\item{what}{
The default is \code{"F"} which results in plotting the fraction of values
<= x.  Set to \code{"1-F"} to plot the fraction > x or \code{"f"} to plot the
cumulative frequency of values <= x.
}
\item{weights}{
numeric vector of weights.  Omit or specify a zero-length vector or
NULL to get unweighted estimates.
}
\item{normwt}{
see above
}
\item{xlab}{
x-axis label.  Default is label(x) or name of calling argument.  For
\code{ecdf.formula}, \code{xlab} defaults to the \code{label} attribute
of the x-axis variable.
}
\item{ylab}{
y-axis label.  Default is \code{"Proportion <= x"}, \code{"Proportion > x"}, 
or \code{"Frequency <= x"} depending on value of \code{what}.
}
\item{q}{
a vector for quantiles for which to draw reference lines on the plot.
Default is not to draw any.
}
\item{pl}{
set to \code{FALSE} to omit the plot, to just return estimates.
}
\item{add}{
set to \code{TRUE} to add the cdf to an existing plot.
}
\item{lty}{
integer line type for plot.  If \code{group} is specified, this can be a vector.
}
\item{lwd}{
  line width for plot.  Can be a vector corresponding to \code{group}s.
  }
\item{col}{
color for step function.  Can be a vector.
}
\item{group}{
a numeric, character, or \code{factor} categorical variable used for stratifying
estimates.  If \code{group} is present, as many ECDFs are drawn as there are
non-missing group levels.
}
\item{label.curves}{
applies if more than one \code{group} exists.
Default is \code{TRUE} to use \code{labcurve} to label curves where they are farthest
apart.  Set \code{label.curves} to a \code{list} to specify options to
\code{labcurve}, e.g., \code{label.curves=list(method="arrow", cex=.8)}.
These option names may be abbreviated in the usual way arguments
are abbreviated.  Use for example \code{label.curves=list(keys=1:5)}
to draw symbols periodically (as in \code{pch=1:5} - see \code{points})
on the curves and automatically position a legend
in the most empty part of the plot.  Set \code{label.curves=FALSE} to
suppress drawing curve labels.  The \code{col}, \code{lty}, and \code{type}
parameters are automatically passed to \code{labcurve}, although you
can override them here.  You can set \code{label.curves=list(keys="lines")} to
have different line types defined in an automatically positioned key.
}
\item{xlim}{
x-axis limits.  Default is entire range of \code{x}.
}
\item{subtitles}{
set to \code{FALSE} to suppress putting a subtitle at the bottom left of each
plot.  The subtitle indicates the numbers of
non-missing and missing observations, which are labeled \code{n}, \code{m}.
}
\item{datadensity}{
If \code{datadensity} is not \code{"none"}, either \code{scat1d} or \code{histSpike} is called to
add a rug plot (\code{datadensity="rug"}), spike histogram
(\code{datadensity="hist"}), or smooth density estimate (\code{"density"}) to
the bottom or top of the ECDF.
}
\item{side}{
If \code{datadensity} is not \code{"none"}, the default is to place the additional
information on top of the x-axis (\code{side=1}).  Use \code{side=3} to place at
the top of the graph.
}
\item{frac}{
passed to \code{histSpike}
}
\item{dens.opts}{
a list of optional arguments for \code{histSpike}
}
\item{...}{
other parameters passed to plot if add=F.  For data frames, other
parameters to pass to \code{ecdf.default}.
For \code{ecdf.formula}, if \code{groups} is not used, you can also add
data density information to each panel's ECDF by specifying the
\code{datadensity} and optional \code{frac}, \code{side},
\code{dens.opts} arguments. 
}
\item{n.unique}{
minimum number of unique values before an ECDF is drawn for a variable
in a data frame.  Default is 10.
}
\item{na.big}{
set to \code{TRUE} to draw the number of NAs in larger letters in the middle of
the plot for \code{ecdf.data.frame}
}
\item{vnames}{
By default, variable labels are used to label x-axes.  Set \code{vnames="names"}
to instead use variable names.
}
\item{method}{
method for computing the empirical cumulative distribution.  See
\code{wtd.ecdf}.  The default is to use the standard \code{"i/n"} method as is
used by the non-Trellis versions of \code{ecdf}.
}
\item{fun}{
a function to transform the cumulative proportions, for the
Trellis-type usage of \code{ecdf}
}
\item{data, groups, subset, prepanel, panel}{the usual Trellis/Lattice
  parameters, with \code{groups}
  causing \code{ecdf.formula} to overlay multiple ECDFs on one panel.}
}
\value{
for \code{ecdf.default} an invisible list with elements x and y giving the
coordinates of the cdf.  If there is more than one \code{group}, a list of
such lists is returned.  An attribute, \code{N}, is in the returned
object.  It contains the elements \code{n} and \code{m}, the number of
non-missing and missing observations, respectively.
}
\author{
Frank Harrell
\cr
Department of Biostatistics, Vanderbilt University
\cr
\email{f.harrell@vanderbilt.edu}
}
\section{Side Effects}{
plots
}
\seealso{
\code{\link{wtd.ecdf}}, \code{\link{label}}, \code{\link{table}}, \code{\link{cumsum}}, \code{\link{labcurve}}, \code{\link{xYplot}}, \code{\link{histSpike}}
}
\examples{
set.seed(1)
ch <- rnorm(1000, 200, 40)
ecdf(ch, xlab="Serum Cholesterol")
scat1d(ch)                       # add rug plot
histSpike(ch, add=TRUE, frac=.15)   # add spike histogram
# Better: add a data density display automatically:
ecdf(ch, datadensity='density')


label(ch) <- "Serum Cholesterol"
ecdf(ch)
other.ch <- rnorm(500, 220, 20)
ecdf(other.ch,add=TRUE,lty=2)


sex <- factor(sample(c('female','male'), 1000, TRUE))
ecdf(ch, q=c(.25,.5,.75))  # show quartiles
ecdf(ch, group=sex,
     label.curves=list(method='arrow'))


# Example showing how to draw multiple ECDFs from paired data
pre.test <- rnorm(100,50,10)
post.test <- rnorm(100,55,10)
x <- c(pre.test, post.test)
g <- c(rep('Pre',length(pre.test)),rep('Post',length(post.test)))
ecdf(x, group=g, xlab='Test Results', label.curves=list(keys=1:2))
# keys=1:2 causes symbols to be drawn periodically on top of curves


# Draw a matrix of ECDFs for a data frame
m <- data.frame(pre.test, post.test, 
                sex=sample(c('male','female'),100,TRUE))
ecdf(m, group=m$sex, datadensity='rug')


freqs <- sample(1:10, 1000, TRUE)
ecdf(ch, weights=freqs)  # weighted estimates


# Trellis/Lattice examples:


region <- factor(sample(c('Europe','USA','Australia'),1000,TRUE))
year <- factor(sample(2001:2002,1000,TRUE))
ecdf(~ch | region*year, groups=sex)
Key()           # draw a key for sex at the default location
# Key(locator(1)) # user-specified positioning of key
age <- rnorm(1000, 50, 10)
ecdf(~ch | equal.count(age), groups=sex)  # use overlapping shingles
ecdf(~ch | sex, datadensity='hist', side=3)  # add spike histogram at top
}
\keyword{nonparametric}
\keyword{hplot}
\keyword{methods}
\keyword{distribution}
\concept{trellis}
\concept{lattice}

\eof
\name{eip}
\alias{eip}
\title{Edit In Place}
\description{
Invokes \code{edit( )} on the object \code{name} and stores the
resulting edited object in place of the original, even if this is a
remote place (as long as the user has write access).  This is more
useful in S-Plus than in \R.
}
\usage{
eip(name)
}
\arguments{
  \item{name}{an object, usually a function}
}
\value{
none
}
\author{Frank Harrell}
\seealso{\code{\link{edit}}}
\examples{
\dontrun{
eip(summary.formula)  # make temporary bug fix in central area
}}
\keyword{utilities}


\eof
\name{errbar}
\alias{errbar}
\title{
Plot Error Bars
}
\description{
\code{errbar} adds vertical error bars to an existing plot or makes a new
plot with error bars.  It can also make a horizontal error bar plot
that shows error bars for group differences as well as bars for
groups.  For the latter type of plot, the lower x-axis scale
corresponds to group estimates and the upper scale corresponds to
differences.  The spacings of the two scales are identical but the
scale for differences has its origin shifted so that zero may be
included.  If at least one of the confidence intervals includes zero,
a vertical dotted reference line at zero is drawn.
}
\usage{
errbar(x, y, yplus, yminus, cap, xlab, ylab, add=FALSE, 
       lty=1, ylim, lwd=1, Type=rep(1,length(y)), \dots )
}
\arguments{
\item{x}{
vector of numeric x values (for vertical error bars) or a factor or
character variable (for horizontal error bars, \code{x} representing the
group labels)
}
\item{y}{
vector of y values.
}
\item{yplus}{
vector of y values: the tops of the error bars.
}
\item{yminus}{
vector of y values: the bottoms of the error bars.
}
\item{cap}{
The width of the little lines at the tops and bottoms of the error bars
in units of the width of the plot.  Default is .015.
}
\item{xlab, ylab}{
optional axis labels if \code{add=FALSE}.  Defaults to blank for horizontal charts.
}
\item{add}{
Set to \code{TRUE} to add bars to an existing plot (available only for vertical
error bars)
}
\item{lty}{
Line type for bars
}
\item{ylim}{
Y-axis limits.  Default is to use range of yminus and yplus.  For
horizontal charts, \code{ylim} is really the \code{x}-axis range, excluding
differences.
}
\item{lwd}{
Line width for line segments (not main line)
}
\item{Type}{
used for horizontal bars only.  Is an integer vector with values \code{1}
if corresponding values represent simple estimates, \code{2} if they
represent differences.
}
\item{...}{
other parameters passed to plot function.
}}
\author{
Charles Geyer, University of Chicago.  Modified by Frank Harrell,
Vanderbilt University, to handle missing data, to add the parameters
\code{add} and \code{lty}, and to implement horizontal charts with differences.
}
\examples{
set.seed(1)
x <- 1:10
y <- x + rnorm(10)
delta <- runif(10)
errbar( x, y, y + delta, y - delta )


# Show bootstrap nonparametric CLs for 3 group means and for
# pairwise differences on same graph
group <- sample(c('a','b','d'), 200, TRUE)
y     <- runif(200) + .25*(group=='b') + .5*(group=='d')
cla <- smean.cl.boot(y[group=='a'],B=100,reps=TRUE)  # usually B=1000
a   <- attr(cla,'reps')
clb <- smean.cl.boot(y[group=='b'],B=100,reps=TRUE)
b   <- attr(clb,'reps')
cld <- smean.cl.boot(y[group=='d'],B=100,reps=TRUE)
d   <- attr(cld,'reps')
a.b <- quantile(a-b,c(.025,.975))
a.d <- quantile(a-d,c(.025,.975))
b.d <- quantile(b-d,c(.025,.975))
errbar(c('a','b','d','a - b','a - d','b - d'),
       c(cla[1],clb[1],cld[1],cla[1]-clb[1],cla[1]-cld[1],clb[1]-cld[1]),
       c(cla[3],clb[3],cld[3],a.b[2],a.d[2],b.d[2]),
       c(cla[2],clb[2],cld[2],a.b[1],a.d[1],b.d[1]),
       Type=c(1,1,1,2,2,2))


       
rm(x,y,delta,group,a,b,d,a.b,a.d,b.d,cla,clb,cld)
}
\keyword{hplot}
% Converted by Sd2Rd version 1.21.

\eof
\name{event.chart}
\alias{event.chart}
\alias{event.convert}
\title{
Flexible Event Chart for Time-to-Event Data
}
\description{
Creates an event chart on the current graphics device.  Also, allows user
to plot legend on plot area or on separate page.
Contains features useful for plotting data with time-to-event outcomes,
which arise in a variety of studies
including randomized clinical trials and non-randomized cohort studies.
This function can use as input a matrix or a data frame, although greater
utility and ease of use will be seen with a data frame.
}
\usage{
event.chart(data, subset.r = 1:dim(data)[1], subset.c = 1:dim(data)[2],
        sort.by = NA, sort.ascending =TRUE,
        sort.na.last =TRUE, sort.after.subset =TRUE,
        y.var = NA, y.var.type = 'n',
        y.jitter =FALSE, y.jitter.factor = 1,
        y.renum =FALSE, NA.rm =FALSE, x.reference = NA,
        now = max(data[,subset.c], na.rm =TRUE),
        now.line =FALSE, now.line.lty = 2,
        now.line.lwd = 1, now.line.col = 1, pty='m',
        date.orig = c(1,1,1960), titl = 'Event Chart',


        y.idlabels = NA, y.axis = 'auto',
        y.axis.custom.at = NA, y.axis.custom.labels = NA,
        y.julian =FALSE, y.lim.extend = c(0,0),
        y.lab = ifelse(is.na(y.idlabels), '' , as.character(y.idlabels)),


        x.axis.all =TRUE, x.axis = 'auto',
        x.axis.custom.at = NA, x.axis.custom.labels = NA,
        x.julian =FALSE, x.lim.extend = c(0,0), x.scale = 1,
        x.lab = ifelse(x.julian, 'Follow-up Time', 'Study Date'),


        line.by = NA, line.lty = 1, line.lwd = 1, line.col = 1,
        line.add = NA, line.add.lty = NA,
        line.add.lwd = NA, line.add.col = NA,
        point.pch = 1:length(subset.c),
        point.cex = rep(0.6,length(subset.c)),
        point.col = rep(1,length(subset.c)),


        legend.plot =FALSE, legend.location = 'o', legend.titl = titl,
        legend.titl.cex = 3.0, legend.titl.line = 1.0,
        legend.point.at = list(x = c(5,95), y = c(95,30)),
        legend.point.pch = point.pch,
        legend.point.text = ifelse(rep(is.data.frame(data),
              length(subset.c)), names(data[,subset.c]), subset.c),
        legend.cex = 2.5, legend.bty = 'n',
        legend.line.at = list(x = c(5,95), y = c(20,5)),
        legend.line.text = names(table(as.character(data[,line.by]),
              exclude = c('','NA'))),
        legend.line.lwd = line.lwd, legend.loc.num = 1,


        \dots)


event.convert(data2, event.time = 1, event.code = 2)
}
\arguments{
\item{data}{
a matrix or data frame with rows corresponding to subjects and
columns corresponding to variables.  Note that for a data frame or
matrix containing multiple time-to-event
data (e.g., time to recurrence, time to death, and time to
last follow-up), one column is required for each specific event.
}
\item{data2}{
a matrix or dataframe with at least 2 columns; by default, the first
column contains the event time and the second column contains the k
event codes (e.g. 1=dead, 0=censored)
}
\item{subset.r}{
subset of rows of original matrix or data frame to place in event chart.
Logical arguments may be used here (e.g., treatment.arm == 'a', if
the data frame, data, has been attached to the search directory;
otherwise, \code{data$treatment.arm == "a"}).
}
\item{subset.c}{
subset of columns of original matrix or data frame to place in event chart;
if working with a data frame, a vector of data frame variable names may be
used for subsetting purposes (e.g., c('randdate', 'event1')).
}
\item{sort.by}{
column(s) or data frame variable name(s) with which to sort the chart's output.
The default is NA, thereby resulting in a chart sorted by original row number.
}
\item{sort.ascending}{
logical flag (which takes effect only if the argument sort.by is utilized).
If TRUE (default), sorting is done in ascending order; if F, descending order.
}
\item{sort.na.last}{
logical flag (which takes effect only if the argument sort.by is utilized).
If T (default), NA values are considered as last values in ordering.
}
\item{sort.after.subset}{
logical flag (which takes effect only if the argument sort.by is utilized).
If F, sorting data (via sort.by specified variables
or columns) will be performed prior to row subsetting (via subset.r);
if T (default), row subsetting of original data will be done before sorting.
}
\item{y.var}{
variable name or column number of original matrix or data frame with
which to scale y-axis.
Default is NA, which will result in equally spaced lines on y-axis
(based on original data or sorted data if requested by sort.by).
Otherwise, location of lines on y-axis will be dictated by specified variable
or column.  Examples of specified variables may be date of an event
or a physiological covariate.  Any observation which has
a missing value for the y.var variable will not appear on the graph.
}
\item{y.var.type}{
type of variable specified in y.var (which will only take effect if
argument y.var is utilized). If 'd', specified variable is a date (either
numeric julian date or an S-Plus dates object);  if 'n', specified variable
is numeric (e.g., systolic blood pressure level) although not a julian date.
}
\item{y.jitter}{
logical flag (which takes effect only if the argument y.var is utilized).
Due to potential ties in y.var variable, y.jitter (when T) will jitter
the data to allow discrimination between observations at the possible cost
of producing slightly inaccurate dates or covariate values;  if F (the
default), no jittering will be performed.  The y.jitter algorithm
assumes a uniform distribution of observations across the range of y.var.
The algorithm is as follows:


size.jitter <-
( diff(range(y.var)) /  (2 * (length(y.var) - 1)) ) * y.jitter.factor .


The default of y.jitter.factor is 1.  The entire product is then used as an
argument into runif:  y.var <-
y.var + runif(length(y.var), -size.jitter, size.jitter) .
}
\item{y.jitter.factor}{
an argument used with the y.jitter function to scale the range of added noise.
Default is 1.
}
\item{y.renum}{
logical flag.  If T, subset observations are listed on y-axis from
1 to length(subset.r); if F (default), subset observations are listed
on y-axis in original form.  As an example, if subset.r = 301:340 and
y.renum ==TRUE, y-axis will be shown as 1 through 40.  However, if
y.renum ==FALSE, y-axis will be shown as 301 through 340.  The above examples
assume the following argument, NA.rm, is set to F.
}
\item{NA.rm}{
logical flag.  If T, subset observations which have NA for each variable
specified in subset.c will not have an entry on the y-axis.  Also, if
the following argument, x.reference, is specified, observations with
missing x.reference values will also not have an entry on the y-axis.  If F
(default), user can identify those observations which do have NA for
every variable specified in subset.c (or, if x.reference is specified, also
those observations which are missing only the x.reference value); this can
easily be done by examining the resulting y-axis and
recognizing the observations without any plotting symbols.
}
\item{x.reference}{
column of original matrix or data frame with which to reference the x-axis.
That is, if specified, all columns specified in subset.c will have x.reference subtracted from them.
by x.reference.  An example may be to see the timing of events before and
after treatment or to see time-to-event after entry into study.
The event times will be aligned using the x.reference argument
as the reference point.
}
\item{now}{
the 'now' date which will be used for top of y-axis
when creating the Goldman eventchart (see reference below).
Default is max(data[, subset.c], na.rm =TRUE).
}
\item{now.line}{
logical flag.   A feature utilized by the Goldman Eventchart.
When x.reference is specified as the start of follow-up and
y.var = x.reference, then the Goldman chart can be created.
This argument, if T, will cause the plot region to be square, and will
draw a line with a slope of -1 from the top of the y-axis to the right
end of the x-axis.  Essentially, it denotes end of current follow-up period
for looking at the time-to-event data.  Default is F.
}
\item{now.line.lty}{
line type of now.line.
}
\item{now.line.lwd}{
line width of now.line.
}
\item{now.line.col}{
color of now.line.
}
\item{pty}{
graph option, pty='m' is the default; use pty='s' for the square looking
Goldman's event chart.
}
\item{date.orig}{
date of origin to consider if dates are in julian, SAS, or S-Plus dates
object format;  default is January 1, 1960 (which is the default origin
used by both S-Plus and SAS).  Utilized when either y.julian = FALSE or
x.julian = FALSE.
}
\item{titl}{
title for event chart.  Default is 'Event Chart'.


}
\item{y.idlabels}{
column or data frame variable name used for y-axis labels.  For example,
if c('pt.no') is specified, patient ID (stored in 'pt.no')
will be seen on y-axis labels
instead of sequence specified by subset.r.  This argument takes precedence
over both y.axis='auto' and y.axis='custom' (see below).
NOTE:  Program will issue warning if this argument is
specified and if is.na(y.var) == F;  y.idlabels will not be
used in this situation.  Also, attempting to plot too many patients
on a single event chart will cause undesirable plotting of y.idlabels.
}
\item{y.axis}{
character string specifying whether program will control labelling
of y-axis (with argument 'auto'), or if user will control labelling
(with argument 'custom').  If 'custom' is chosen, user must specify
location and text of labels using y.axis.custom.at and
y.axis.custom.labels arguments, respectively, listed below.
This argument will not be utilized if y.idlabels is specified.
}
\item{y.axis.custom.at}{
user-specified vector of y-axis label locations.
Must be used when y.axis = 'custom'; will not be used otherwise.
}
\item{y.axis.custom.labels}{
user-specified vector of y-axis labels.
Must be used when y.axis = 'custom'; will not be used otherwise.
}
\item{y.julian}{
logical flag (which will only be considered if y.axis == 'auto' and
(!is.na(y.var) & y.var.type == 'd')).  If F (default), will convert julian
numeric dates or S-Plus dates objects into 'mm/dd/yy' format
for the y-axis labels.  If T, dates will be printed in
julian (numeric) format.
}
\item{y.lim.extend}{
two-dimensional vector representing the number of units that the user
wants to increase ylim on bottom and top of y-axis, respectively.
Default = c(0,0).  This argument will not take effect if the Goldman chart
is utilized.
}
\item{y.lab}{
single label to be used for entire y-axis.  Default will be the variable name
or column number of y.idlabels (if non-missing) and blank otherwise.


}
\item{x.axis.all}{
logical flag. If T (default), lower and upper limits of x-axis will be
based on all observations (rows) in matrix or data frame.  If F, lower and
upper limits will be based only on those observations specified by subset.r
(either before or after sorting depending on specification of sort.by and
value of sort.after.subset).
}
\item{x.axis}{
character string specifying whether program will control labelling
of x-axis (with argument 'auto'), or if user will control labelling
(with argument 'custom').  If 'custom' is chosen, user must specify
location and text of labels using x.axis.custom.at and
x.axis.custom.labels arguments, respectively, listed below.
}
\item{x.axis.custom.at}{
user-specified vector of x-axis label locations.
Must be used when x.axis == 'custom'; will not be used otherwise.
}
\item{x.axis.custom.labels}{
user-specified vector of x-axis labels.
Must be used when x.axis == 'custom'; will not be used otherwise.
}
\item{x.julian}{
logical flag (which will only be considered if x.axis == 'auto').
If F (default), will convert julian dates or S-plus dates objects
into 'mm/dd/yy' format for the x-axis labels.  If T, dates will be
printed in julian (numeric) format.  NOTE:  This argument should be set to T if
x.reference is specified.
}
\item{x.lim.extend}{
two-dimensional vector representing the number of time units (usually in days)
that the user wants to increase xlim on left-hand side and right-hand
side of x-axis, respectively.  Default = c(0,0).  This argument will not
take effect if the Goldman chart is utilized.
}
\item{x.scale}{
a factor whose reciprocal is multiplied to original units of the
x-axis.  For example, if the original data frame is in units of days,
x.scale = 365 will result in units of years (notwithstanding leap years).
Default is 1.
}
\item{x.lab}{
single label to be used for entire x-axis.  Default will be 'Study Date'
if x.julian = FALSE and 'Follow-up Time' if x.julian = TRUE.


}
\item{line.by}{
column or data frame variable name for plotting unique lines by unique
values of vector (e.g., specify c('arm') to plot unique lines by
treatment arm).  Can take at most one column or variable name.
Default is NA which produces identical lines for each patient.
}
\item{line.lty}{
vector of line types corresponding to ascending order of line.by values.
If line.by is specified, the vector should be the length of
the number of unique values of line.by.
If line.by is NA, only line.lty[1] will be used.
The default is 1.
}
\item{line.lwd}{
vector of line widths corresponding to ascending order of line.by values.
If line.by is specified, the vector should be the length of
the number of unique values of line.by.
If line.by is NA, only line.lwd[1] will be used.
The default is 1.
}
\item{line.col}{
vector of line colors corresponding to ascending order of line.by values.
If line.by is specified, the vector should be the length of
the number of unique values of line.by.
If line.by is NA, only line.col[1] will be used.
The default is 1.
}
\item{line.add}{
a 2xk matrix with k=number of pairs of additional line segments to add.
For example, if it is of interest to draw additional line segments
connecting events one and two, two and three, and four and five,
(possibly with different colors), an appropriate line.add argument would be
matrix(c('first.event','second.event','second.event','third.event',
'fourth.event','fifth.event'), 2, 3).  One line segment
would be drawn between first.event and second.event,
a second line segment would be drawn between second.event and third.event,
and a third line segment would be drawn between fourth.event and fifth.event.
Different line types, widths and colors can be specified (in arguments
listed just below).


The conventions used for subset.c and line.add must match (i.e., column name
must be used for both or column number must be used for both).


If line.add != NA, length of line.add.lty, line.add.lwd, and line.add.col
must be the same as number of pairs of additional line segments to add.


NOTE:  The drawing of the original default line
may be suppressed (with line.col = 0),
and line.add can be used to do all the line plotting for the event chart.
}
\item{line.add.lty}{
a kx1 vector corresponding to the columns of line.add; specifies the line
types for the k line segments.
}
\item{line.add.lwd}{
a kx1 vector corresponding to the columns of line.add; specifies the line
widths for the k line segments.
}
\item{line.add.col}{
a kx1 vector corresponding to the columns of line.add; specifies the line
colors for the k line segments.
}
\item{point.pch}{
vector of pch values for points representing each event.  If similar
events are listed in multiple columns (e.g., regular visits or
a recurrent event), repeated pch values may be listed in the
vector (e.g., c(2,4,rep(183,3))).
If length(point.pch) < length(subset.c), point.pch will be repeated until
lengths are equal; a warning message will verify this condition.
}
\item{point.cex}{
vector of size of points representing each event.
If length(point.cex) < length(subset.c), point.cex will be repeated until
lengths are equal; a warning message will verify this condition.
}
\item{point.col}{
vector of colors of points representing each event.
If length(point.col) < length(subset.c), point.col will be repeated until
lengths are equal; a warning message will verify this condition.


}
\item{legend.plot}{
logical flag;  if T, a legend will be plotted.  Location of legend will
be based on specification of legend.location along with values of other
arguments listed below.  Default is F (i.e., no legend plotting).
}
\item{legend.location}{
will be used only if legend.plot=T.
If 'o' (default), a one-page legend will precede the output of the chart.
The user will need to hit <enter> in order for the event chart to be displayed.
This feature is possible due to the \bold{dev.ask} option.
If 'i', an internal legend will be placed in the plot region
based on legend.point.at.  If 'l', a legend will be placed in the plot region
using the locator option.  Legend will map points to events (via column
names, by default) and, if line.by is specified, lines to groups (based on
levels of line.by).
}
\item{legend.titl}{
title for the legend; default is title to be used for main plot.
Only used when legend.location = 'o'.
}
\item{legend.titl.cex}{
size of text for legend title.  Only used when legend.location = 'o'.
}
\item{legend.titl.line}{
line location of legend title dictated by mtext function with outer=FALSE option;
default is 1.0.  Only used when legend.location = 'o'.
}
\item{legend.point.at}{
location of upper left and lower right corners of legend area to
be utilized for describing events via points and text.
}
\item{legend.point.pch}{
vector of pch values for points representing each event in the legend.
Default is point.pch.
}
\item{legend.point.text}{
text to be used for describing events;  the default is setup for a data frame,
as it will print the names of the columns specified by subset.c .
}
\item{legend.cex}{
size of text for points and event descriptions.  Default is 2.5 which is setup
for legend.location = 'o'.  A much smaller cex is recommended (possibly 0.75)
for use with legend.location = 'i' or legend.location = 'l'.
}
\item{legend.bty}{
option to put a box around the legend(s); default is to have no box
(legend.bty = 'n').  Option legend.bty = 'o' will produce a legend box.
}
\item{legend.line.at}{
if line.by was specified (with legend.location = 'o' or legend.location = 'i'),
this argument will dictate the location of the upper left and lower right
corners of legend area to be utilized for describing the different
line.by values (e.g., treatment.arm).  The default is setup for
legend.location == 'o'.
}
\item{legend.line.text}{
text to be used for describing line.by values;  the default are the names
of the unique non-missing line.by values as produced from the table function.
}
\item{legend.line.lwd}{
vector of line widths corresponding to line.by values.
}
\item{legend.loc.num}{
number used for locator argument when legend.location = 'l'.  If 1 (default),
user is to locate only the top left corner of the legend box.  If 2, user
is to locate both the top left corner and the lower right corner.  This will
be done twice when line.by is specified (once for points and once for lines).
}
\item{event.time}{
the column number in data that contains the event time
}
\item{event.code}{
the column number in data that contains the event code


}
\item{...}{
additional par arguments for use in main plot.
}}
\section{Side Effects}{
an event chart is created on the current graphics device.
If legend.plot =TRUE and legend.location = 'o',
a one-page legend will precede the event chart.  Please note that par
parameters on completion of function will be reset to par parameters
existing prior to start of function.
}
\details{
if you want to put, say, two eventcharts side-by-side, in a plot
region, you should not set up par(mfrow=c(1,2)) before running the
first plot.  Instead, you should add the argument mfg=c(1,1,1,2)
to the first plot call followed by the argument mfg=c(1,2,1,2)
to the second plot call.


if dates in original data frame are in a specialized form
(e.g., mm/dd/yy) of mode CHARACTER, the user must convert those columns to
become class dates or julian numeric mode (see ?dates for more information).
For example, in a data frame called testdata, with specialized
dates in columns 4 thru 10, the following code could be used:
as.numeric(dates(testdata[,4:10])).  This will convert the columns
to numeric julian dates based on the function's default origin
of January 1, 1960.  If original dates are in class dates or julian form,
no extra work is necessary.


In survival analysis, the data typically come in two
columns: one column containing survival time and the other
containing a censoring indicator or event code.  The
event.convert function converts this type of data into
multiple columns of event times, one column for each event
type, suitable for the event.chart function.
}
\author{
J. Jack Lee and Kenneth R. Hess
\cr
Department of Biostatistics
\cr
University of Texas
\cr
M.D. Anderson Cancer Center
\cr
Houston, TX 77030
\cr
jjlee@mdanderson.org, khess@mdanderson.org


Joel A. Dubin
\cr
Division of Biostatistics
\cr
Department of Epidemiology and Public Health
\cr
Yale University
\cr
joel.dubin@yale.edu
}
\references{
Lee J.J., Hess, K.R., Dubin, J.A. (2000).  Extensions and applications
of event charts.
\emph{The American Statistician,}
\bold{54:1}, 63--70.


Dubin, J.A., Lee, J.J., Hess, K.R. (1997).
The Utility of Event Charts.
\emph{Proceedings of the Biometrics Section, American
Statistical Association.}


Dubin, J.A., Muller H-G, Wang J-L (2001).
Event history graphs for censored survival data.
\emph{Statistics in Medicine,}
\bold{20:} 2951--2964.


Goldman, A.I. (1992).
EVENTCHARTS:  Visualizing Survival and Other Timed-Events Data.
\emph{The American Statistician,}
\bold{46:1}, 13--18.
}

\seealso{\code{\link{event.history}}}

\examples{
# The sample data set is an augmented CDC AIDS dataset (ASCII)
# which is used in the examples in the help file.  This dataset is 
# described in Kalbfleisch and Lawless (JASA, 1989).
# Here, we have included only children 4 years old and younger.
# We have also added a new field, dethdate, which
# represents a fictitious death date for each patient.  There was
# no recording of death date on the original dataset.
#   
# All dates are julian with julian=0 being 
# January 1, 1960, and julian=14000 being 14000 days beyond
# January 1, 1960 (i.e., May 1, 1998).


# Example data: one row per patient.  All of the *date columns are
# julian day counts (see the note on the julian origin above).
cdcaids <- data.frame(
  age = c(4,2,1,1,2,2,2,4,2,1,1,3,2,1,3,2,1,2,4,2,2,1,4,2,4,1,4,2,1,1,3,3,1,3),
  infedate = c(
    7274,7727,7949,8037,7765,8096,8186,7520,8522,8609,8524,8213,8455,8739,
    8034,8646,8886,8549,8068,8682,8612,9007,8461,8888,8096,9192,9107,9001,
    9344,9155,8800,8519,9282,8673),
  diagdate = c(
    8100,8158,8251,8343,8463,8489,8554,8644,8713,8733,8854,8855,8863,8983,
    9035,9037,9132,9164,9186,9221,9224,9252,9274,9404,9405,9433,9434,9470,
    9470,9472,9489,9500,9585,9649),
  diffdate = c(
    826,431,302,306,698,393,368,1124,191,124,330,642,408,244,1001,391,246,
    615,1118,539,612,245,813,516,1309,241,327,469,126,317,689,981,303,976),
  dethdate = c(
    8434,8304,NA,8414,8715,NA,8667,9142,8731,8750,8963,9120,9005,9028,9445,
    9180,9189,9406,9711,9453,9465,9289,9640,9608,10010,9488,9523,9633,9667,
    9547,9755,NA,9686,10084),
  censdate = c(
    NA,NA,8321,NA,NA,8519,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,
    NA,NA,NA,NA,NA,NA,NA,NA,NA,10095,NA,NA)
)


# Attach a descriptive label to every column.
cdcaids <- upData(
  cdcaids,
  labels = c(
    age      = 'Age, y',
    infedate = 'Date of blood transfusion',
    diagdate = 'Date of AIDS diagnosis',
    diffdate = 'Incubation period (days from HIV to AIDS)',
    dethdate = 'Fictitious date of death',
    censdate = 'Fictitious censoring date'
  )
)


# Note that the style options listed with these
# examples are best suited for output to a postscript file (i.e., using
# the postscript function with horizontal=TRUE) as opposed to a graphical
# window (e.g., motif).


# Simple calendar event chart, with the legend drawn inside the plot region.
# postscript('example1.ps', horizontal=TRUE)
event.chart(
  cdcaids,
  subset.c = c('infedate', 'diagdate', 'dethdate', 'censdate'),
  # title and axis labels
  titl  = 'AIDS data calendar event chart 1',
  x.lab = 'observation dates',
  y.lab = 'patients (sorted by AIDS diagnosis date)',
  # plotting symbol and size for each of the four event columns
  point.pch = c(1, 2, 15, 0),
  point.cex = c(1, 1, 0.8, 0.8),
  # boxed internal legend
  legend.plot       = TRUE,
  legend.location   = 'i',
  legend.bty        = 'o',
  legend.cex        = 1.0,
  legend.point.text = c('transfusion', 'AIDS diagnosis', 'death', 'censored'),
  legend.point.at   = list(c(7210, 8100), c(35, 27))
)


# Simple interval event chart, with the legend drawn inside the plot region.
# Times are measured from each patient's transfusion date.
# postscript('example2.ps', horizontal=TRUE)
event.chart(
  cdcaids,
  subset.c = c('infedate', 'diagdate', 'dethdate', 'censdate'),
  # reference event for the x-axis
  x.reference = 'infedate',
  x.julian    = TRUE,
  # title and axis labels
  titl  = 'AIDS data interval event chart 1',
  x.lab = 'time since transfusion (in days)',
  y.lab = 'patients (sorted by AIDS diagnosis date)',
  # plotting symbol and size for each of the four event columns
  point.pch = c(1, 2, 15, 0),
  point.cex = c(1, 1, 0.8, 0.8),
  # boxed internal legend
  legend.plot       = TRUE,
  legend.location   = 'i',
  legend.bty        = 'o',
  legend.cex        = 1.0,
  legend.point.text = c('transfusion', 'AIDS diagnosis', 'death', 'censored'),
  legend.point.at   = list(c(1400, 1950), c(7, -1))
)


# More complicated interval chart: times are referenced to the AIDS
# diagnosis date, patients are sorted by age and then incubation period,
# and the line type varies with age.
# postscript('example3.ps', horizontal=TRUE)
event.chart(
  cdcaids,
  subset.c = c('infedate', 'diagdate', 'dethdate', 'censdate'),
  # reference event for the x-axis and ordering of the patients
  x.reference = 'diagdate',
  x.julian    = TRUE,
  sort.by     = c('age', 'diffdate'),
  # title and axis labels
  titl  = 'AIDS data interval event chart 2 (sorted by age, incubation)',
  x.lab = 'time since diagnosis of AIDS (in days)',
  y.lab = 'patients (sorted by age and incubation length)',
  # plotting symbol and size for each of the four event columns
  point.pch = c(1, 2, 15, 0),
  point.cex = c(1, 1, 0.8, 0.8),
  # one line style per age group
  line.by  = 'age',
  line.lty = c(1, 3, 2, 4),
  line.lwd = rep(1, 4),
  line.col = rep(1, 4),
  # boxed internal legends: one for the points, one for the lines
  legend.plot       = TRUE,
  legend.location   = 'i',
  legend.bty        = 'o',
  legend.cex        = 1.0,
  legend.point.text = c('transfusion', 'AIDS diagnosis', 'death', 'censored'),
  legend.point.at   = list(c(-1350, -800), c(7, -1)),
  legend.line.text  = c('age = 1', '       = 2', '       = 3', '       = 4'),
  legend.line.at    = list(c(-1350, -800), c(16, 8))
)


# Goldman chart: time since transfusion on the x-axis against the
# transfusion date on the y-axis, with the now-line drawn.
# postscript('example4.ps', horizontal=TRUE)
event.chart(
  cdcaids,
  subset.c = c('infedate', 'diagdate', 'dethdate', 'censdate'),
  # x-axis reference event
  x.reference = 'infedate',
  x.julian    = TRUE,
  # y-axis variable and related options
  y.var      = c('infedate'),
  y.var.type = 'd',
  y.jitter   = FALSE,
  now.line   = TRUE,
  # title and axis labels
  titl  = 'AIDS data Goldman event chart 1',
  x.lab = 'time since transfusion (in days)',
  y.lab = 'dates of observation',
  # plotting symbol and size for each of the four event columns
  point.pch = c(1, 2, 15, 0),
  point.cex = c(1, 1, 0.8, 0.8),
  # extra par setting passed on to the main plot
  mgp = c(3.1, 1.6, 0),
  # boxed internal legend
  legend.plot       = TRUE,
  legend.location   = 'i',
  legend.bty        = 'o',
  legend.cex        = 1.0,
  legend.point.text = c('transfusion', 'AIDS diagnosis', 'death', 'censored'),
  legend.point.at   = list(c(1500, 2800), c(9300, 10000))
)


# Coded time-to-event data (a time column plus an indicator column) are
# first converted with event.convert, then drawn as an event chart.
surv.time  <- c(5, 6, 3, 1, 2)
cens.ind   <- c(1, 0, 1, 1, 0)
surv.data  <- cbind(surv.time, cens.ind)
event.data <- event.convert(surv.data)
# a leading column of zeros is added and used as the reference (column 1)
event.chart(
  cbind(rep(0, 5), event.data),
  x.reference = 1,
  x.julian    = TRUE
)
}
\keyword{hplot}
\keyword{survival}
% Converted by Sd2Rd version 1.21.

\eof
\name{event.history}
\alias{event.history}
\title{Produces event.history graph for survival data}
\description{
Produces an event history graph for right-censored survival data,
including time-dependent covariate status, as described in
Dubin, Muller, and Wang (2001).  Effectively,
a Kaplan-Meier curve is produced with supplementary information
regarding individual survival information, censoring information, and
status over time of an individual time-dependent covariate or 
time-dependent covariate function for both uncensored and censored 
individuals.   
}
\usage{
event.history(data, survtime.col, surv.col,
              surv.ind = c(1, 0), subset.rows = NULL,
              covtime.cols = NULL, cov.cols = NULL,
              num.colors = 1, cut.cov = NULL, colors = 1,
              cens.density = 10, mult.end.cens = 1.05,
              cens.mark.right =FALSE, cens.mark = "-",
              cens.mark.ahead = 0.5, cens.mark.cutoff = -1e-08,
              cens.mark.cex = 1,
              x.lab = "time under observation",
              y.lab = "estimated survival probability",
              title = "event history graph", ...)
}
\arguments{
\item{data}{A matrix or data frame with rows corresponding to units
	(often individuals) and columns corresponding to survival time,
event/censoring indicator.  Also, multiple columns may be devoted to
time-dependent covariate level and time change.  }
  \item{survtime.col}{Column (in data) representing minimum of time-to-event or 
right-censoring time for individual.
}
  \item{surv.col}{Column (in data) representing event indicator for an individual.
Though, traditionally, such an indicator will be 1 for an event and
0 for a censored observation, this indicator can be represented 
by any two numbers, made explicit by the surv.ind argument.
}
  \item{surv.ind}{Two-element vector representing, respectively, the 
number for an event, as listed in surv.col, 
followed by the number for a censored
observation.  Default is traditional survival data 
representation, i.e., c(1,0).
}
  \item{subset.rows}{Subset of rows of original matrix or data frame (data) to 
place in event history graph.
Logical arguments may be used here (e.g., treatment.arm == 'a', if
the data frame, data, has been attached to the search directory).
}
  \item{covtime.cols}{Column(s) (in data) representing the time when change of time-dependent 
covariate (or time-dependent covariate function) occurs.  
There should be a unique non-NA entry in the column for each such change 
(along with corresponding cov.cols column entry representing 
the value of the covariate or function at that change time).  
Default is NULL, meaning no time-dependent covariate information 
will be presented in the graph.  
}
  \item{cov.cols}{Column(s) (in data) representing the level of the time-dependent 
covariate (or time-dependent covariate function).  There should be 
a unique non-NA column entry representing each change in the level 
(along with a corresponding covtime.cols column entry representing 
the time of the change).  Default is NULL, meaning
no time-dependent covariate information will be presented in
the graph. 
}
  \item{num.colors}{Colors are utilized for the time-dependent covariate level for an
individual.  This argument provides the number of unique covariate
levels which will be displayed by mapping the number of colors 
(via num.colors) to the number of desired covariate levels.  
This will divide the covariate span into roughly equally-sized 
intervals, via the S-Plus cut function.
Default is one color, meaning no time-dependent information
will be presented in the graph.  Note that this argument will
be ignored/superseded if a non-NULL argument is provided for the
cut.cov parameter.
}
  \item{cut.cov}{This argument allows the user to explicitly state how to 
define the intervals for the time-dependent covariate, such that
different colors will be allocated to the user-defined covariate levels.
For example, for plotting five colors, six ordered points within the 
span of the data's covariate levels should be provided.
Default is NULL, meaning that the num.colors argument value
will dictate the number of breakpoints, with the covariate span
defined into roughly equally-sized intervals via the S-Plus cut
function.  However, if is.null(cut.cov) ==FALSE, 
then this argument supersedes any entry for the num.colors argument.
}
  \item{colors}{This is a vector argument defining the actual colors used 
for the time-dependent covariate levels in the plot, with the
index of this vector corresponding to the ordered levels
of the covariate.  The number of colors (i.e., the length
of the colors vector) should correspond to the 
value provided to the num.colors argument or the number 
of ordered points - 1 as defined in the cut.cov argument
(with cut.cov superseding num.colors if is.null(cut.cov) ==FALSE).  
The function, as currently written, allows for as many as 
twenty distinct colors.  This argument effectively feeds
into the col argument for the S-Plus polygon function.  
Default is colors=1.  See the col argument for both the 
S-Plus par function and polygon function for more information.
}
  \item{cens.density}{This will provide the shading density at the end of the 
individual bars for those who are censored.  For more information
on shading density, see the density argument in the S-Plus
polygon function.  Default is cens.density=10.
}
  \item{mult.end.cens}{This is a multiplier that extends the length of 
the longest surviving individual bar (or bars, if a tie exists) 
if right-censored, presuming that no event times eventually follow this
final censored time.  Default extends the length 5 percent beyond 
the length of the observed right-censored survival time.
}
  \item{cens.mark.right}{A logical argument that states whether an explicit mark 
should be placed to the right of the individual right-censored 
survival bars.  This argument is most useful for
large sample sizes, where it may be hard to detect the special 
shading via cens.density, particularly for the short-term survivors.
}
  \item{cens.mark}{Character argument which describes the censored mark that should be
used if cens.mark.right = T.  Default is '-'.  
}
  \item{cens.mark.ahead}{A numeric argument, which specifies the absolute distance
to be placed between the individual right-censored
survival bars and the mark as defined in the above cens.mark
argument.  Default is .5 (that is, half a day, if
survival time is measured in days), but may very well need
adjusting depending on the maximum survival time
observed in the dataset.
}
  \item{cens.mark.cutoff}{A negative number very close to 0 
(by default cens.mark.cutoff = -1e-8) to ensure that 
the censoring marks get plotted correctly.  See event.history
code in order to see its usage.  This argument typically will not
need adjustment.
}
  \item{cens.mark.cex}{Numeric argument defining the size of the mark defined in 
the cens.mark argument above.  See more information 
by viewing the cex argument for the S-Plus par function.
Default is cens.mark.cex=1.0.
}
  \item{x.lab}{Single label to be used for entire x-axis.  
Default is 'time under observation'. 
}
  \item{y.lab}{Single label to be used for entire y-axis.  
Default is 'estimated survival probability'. 
}
  \item{title}{Title for the event history graph.  
Default is 'event history graph'.
}
  \item{\dots}{This allows arguments to the plot function call within 
the event.history function.  
So, for example, the axes representations can be manipulated
with appropriate arguments, or particular areas of the event.history 
graph can be "zoomed".  See the details section for more 
comments about zooming.  
}
}
\details{
In order to focus on a particular area of the event history graph,
zooming can be performed.  This is best done by 
specifying appropriate xlim and ylim 
arguments at the end of the event.history function call, 
taking advantage of the ... argument link to the plot function.
An example of zooming can be seen
in Plate 4 of the paper referenced below.

Please read the reference below to understand how the
individual covariate and survival information is provided in the plot,
how ties are handled, how right-censoring is handled, etc.
}
\references{Dubin, J.A., Muller, H.-G., and Wang, J.-L. (2001).
Event history graphs for censored survival data.
\emph{Statistics in Medicine}, \bold{20}, 2951-2964.
}
\author{Joel Dubin\cr
  joel.dubin@yale.edu}

\note{The authors have found better control of the use of color by 
producing the graphs via the postscript plotting device
in S-Plus.  In fact, the provided examples utilize 
the postscript function.
However, your past experiences may be different, 
and you may prefer to control color directly (to the graphsheet
in Windows environment, for example).  The event.history
function will work with either approach.
}

\section{WARNING}{This function has been tested thoroughly, but only within 
a restricted version and environment, 
i.e., only within S-Plus 2000, Version 3, and within S-Plus 6.0,
version 2, both on a Windows 2000 machine.  
Hence, we cannot currently vouch
for the function's effectiveness 
in other versions of S-Plus (e.g., S-Plus 3.4) 
nor in other operating environments (e.g., Windows 95, Linux or Unix).
The function has also been verified to work on R under Linux.
}

\seealso{\code{\link{plot}},\code{\link{polygon}}, \code{\link{event.chart}}}

\examples{
# Code to produce event history graphs for SIM paper
#
# before generating plots, some pre-processing needs to be performed,
#  in order to get dataset in proper form for event.history function;
#  need to create one line per subject and sort by time under observation, 
#  with those experiencing event coming before those tied with censoring time;
if(.R.) {  # get access to heart data frame
  require('survival')
  data(heart)
}

# creation of event.history version of heart dataset (call heart.one):
# one row per subject; for a subject with two records the second
# record is the one that is kept

subj.ids  <- unique(heart$id)   # computed once, not on every pass of the loop
heart.one <- matrix(nrow=length(subj.ids), ncol=ncol(heart))
for(i in seq_along(subj.ids))
 {
  # match on the actual id value, so ids need not be exactly 1, 2, ..., n
  subj.rows <- heart[heart$id == subj.ids[i], ]
  n.rows    <- nrow(subj.rows)
  # subjects with any other number of records are left as a row of NAs
  if(n.rows == 1 || n.rows == 2)
   heart.one[i,] <- as.numeric(unlist(subj.rows[n.rows, ]))
 }

# censored events are temporarily recoded from 0 to 2 so that, when sorting
# on columns 2 and 3 below, an event (1) comes before a tied censoring time
heart.one[,3][heart.one[,3] == 0] <- 2
if(is.factor(heart$transplant))
 heart.one[,7] <- heart.one[,7] - 1
 ## getting back to correct transplantation coding
heart.one <- as.data.frame(heart.one[order(heart.one[,2], heart.one[,3]),])
names(heart.one) <- names(heart)
# back to usual censoring indicator:
heart.one[,3][heart.one[,3] == 2] <- 0
# note: transplant says 0 (for no transplants) or 1 (for one transplant)
#        and event = 1 is death, while event = 0 is censored

# plot single Kaplan-Meier curve from heart data, first creating survival object
# (survfit expects a formula; '~ 1' requests one overall curve, and a bare
#  Surv object is rejected by current releases of the survival package)
heart.surv <- survfit(Surv(heart.one$stop, heart.one$event) ~ 1,
                      conf.int = FALSE)

# figure 3: traditional Kaplan-Meier curve
# postscript('ehgfig3.ps', horiz=TRUE)
# omi <- par(omi=c(0,1.25,0.5,1.25))
 plot(heart.surv, ylab='estimated survival probability',
      xlab='observation time (in days)')
 title('Figure 3: Kaplan-Meier curve for Stanford data', cex=0.8)
# dev.off()

## Figure 4: event history graph for the Stanford heart data

# postscript('ehgfig4.ps', horiz=TRUE, colors = seq(0, 1, len=20))
# par(omi=c(0,1.25,0.5,1.25))
event.history(
  heart.one,
  # survival time and event indicator
  survtime.col = heart.one[,2],
  surv.col     = heart.one[,3],
  # time-dependent covariate: a column of zeros paired with column 1 for
  # the change times, and a column of zeros paired with column 7 for the levels
  covtime.cols = cbind(rep(0, nrow(heart.one)), heart.one[,1]),
  cov.cols     = cbind(rep(0, nrow(heart.one)), heart.one[,7]),
  # two covariate levels, hence two colors
  num.colors = 2,
  colors     = c(6, 10),
  # labelling
  x.lab = 'time under observation (in days)',
  title = 'Figure 4: Event history graph for\nStanford data',
  # explicit mark to the right of each censored bar
  cens.mark.right = TRUE,
  cens.mark       = '-',
  cens.mark.ahead = 30.0,
  cens.mark.cex   = 0.85
)
# dev.off()



# now, draw age-stratified event history graph for Stanford heart data; 
#  use as Figure 5

# two plots, stratified by age status
# (the panel titles indicate that column 4 is age measured relative to 48,
#  so the split at 0 is a split at age 48 -- confirm against the heart data)
# postscript('c:\\temp\\ehgfig5.ps', horiz=TRUE, colors = seq(0, 1, len=20))
# par(omi=c(0,1.25,0.5,1.25))
 old.par <- par(mfrow=c(1,2))   # remember the previous layout for later

 event.history(data=heart.one, subset.rows = (heart.one[,4] < 0),
               survtime.col=heart.one[,2], surv.col=heart.one[,3],
               covtime.cols = cbind(rep(0, dim(heart.one)[1]), heart.one[,1]),
               cov.cols = cbind(rep(0, dim(heart.one)[1]), heart.one[,7]),
               num.colors=2, colors=c(6,10),
               x.lab = 'time under observation\n(in days)',
               title = 'Figure 5a:\nStanford data\n(age < 48)',
               cens.mark.right =TRUE, cens.mark = '-',
               cens.mark.ahead = 40.0, cens.mark.cex = 0.85,
               xlim=c(0,1900))

 event.history(data=heart.one, subset.rows = (heart.one[,4] >= 0),
               survtime.col=heart.one[,2], surv.col=heart.one[,3],
               covtime.cols = cbind(rep(0, dim(heart.one)[1]), heart.one[,1]),
               cov.cols = cbind(rep(0, dim(heart.one)[1]), heart.one[,7]),
               num.colors=2, colors=c(6,10),
               x.lab = 'time under observation\n(in days)',
               title = 'Figure 5b:\nStanford data\n(age >= 48)',
               cens.mark.right =TRUE, cens.mark = '-',
               cens.mark.ahead = 40.0, cens.mark.cex = 0.85,
               xlim=c(0,1900))

 par(old.par)   # restore the layout so later plots are not left two-up
# dev.off()
# par(omi=omi)

# we will not show liver cirrhosis data manipulation, as it was 
#  a bit detailed; however, here is the 
#  event.history code to produce Figure 7 / Plate 1

# Figure 7 / Plate 1 : prothrombin ehg with color
# (not run: the cirrhos2.eh data frame is not created in these examples)
\dontrun{
second.arg <- 1                   ### second.arg is for shading
third.arg  <- c(rep(1,18),0,1)    ### third.arg is for intensity

# postscript('c:\\temp\\ehgfig7.ps', horiz=TRUE, 
# colors = cbind(seq(0, 1, len = 20), second.arg, third.arg)) 
# par(omi=c(0,1.25,0.5,1.25), col=19)
event.history(
  cirrhos2.eh,
  subset.rows  = NULL,
  survtime.col = cirrhos2.eh$time,
  surv.col     = cirrhos2.eh$event,
  # even-numbered columns 4, 6, ..., 36 hold the covariate change times and
  # the column following each one holds the covariate level
  covtime.cols = as.matrix(cirrhos2.eh[, (2:18)*2]),
  cov.cols     = as.matrix(cirrhos2.eh[, (2:18)*2 + 1]),
  # breakpoints at the quintiles of all covariate levels, with the two
  # outermost breakpoints widened by 1
  cut.cov = as.numeric(
    quantile(as.matrix(cirrhos2.eh[, (2:18)*2 + 1]),
             c(0,.2,.4,.6,.8,1), na.rm=TRUE) + c(-1,0,0,0,0,1)),
  colors = c(20,4,8,11,14),
  x.lab  = 'time under observation (in days)',
  title  = 'Figure 7: Event history graph for liver cirrhosis data (color)',
  cens.mark.right = TRUE,
  cens.mark       = '-',
  cens.mark.ahead = 100.0,
  cens.mark.cex   = 0.85
)
# dev.off()
}
}
\keyword{survival}

\eof
\name{find.matches}
\alias{find.matches}
\alias{summary.find.matches}
\alias{print.find.matches}
\alias{matchCases}
\title{
Find Close Matches
}
\description{
Compares each row in \code{x} against all the rows in \code{y}, finding rows in
\code{y} with all columns within a tolerance of the values of a given row of
\code{x}.  The default tolerance
\code{tol} is zero, i.e., an exact match is required on all columns.
For qualifying matches, a distance measure is computed.  This is
the sum of squares of differences between \code{x} and \code{y} after scaling
the columns.  The default scaling values are \code{tol}, and for columns
with \code{tol=0} the scale values are set to 1.0 (since they are ignored
anyway).  Matches (up to \code{maxmatch} of them) are stored and listed in order of 
increasing distance.
\cr
The \code{summary} method prints a frequency distribution of the
number of matches per observation in \code{x}, the median of the minimum
distances for all matches per \code{x}, as a function of the number of matches,
and the frequency of selection of duplicate observations as those having
the smallest distance.  The \code{print} method prints the entire \code{matches}
and \code{distance} components of the result from \code{find.matches}.
\cr
\code{matchCases} finds all controls that match cases on a single variable
\code{x} within a tolerance of \code{tol}.  This is intended for prospective
cohort studies that use matching for confounder adjustment (even
though regression models usually work better).
}
\usage{
find.matches(x, y, tol=rep(0, ncol(y)), scale=tol, maxmatch=10)
\method{summary}{find.matches}(object, \dots)
\method{print}{find.matches}(x, digits, \dots)

matchCases(xcase,    ycase,    idcase=names(ycase),
           xcontrol, ycontrol, idcontrol=names(ycontrol),
           tol=NULL,
           maxobs=max(length(ycase),length(ycontrol))*10,
           maxmatch=20, which=c('closest','random'))
}
\arguments{
\item{x}{
a numeric matrix or the result of \code{find.matches}
}
\item{y}{
a numeric matrix with same number of columns as \code{x}
}
\item{xcase}{
}
\item{xcontrol}{
vectors, not necessarily of the same length, specifying a numeric
variable used to match cases and controls
}
\item{ycase}{
}
\item{ycontrol}{
vectors or matrices, not necessarily having the same number of rows,
specifying a variable to carry along from cases and matching
controls.  If you instead want to carry along rows from a data frame,
let \code{ycase} and \code{ycontrol} be non-overlapping integer subscripts of
the donor data frame.
}
\item{tol}{
a vector of tolerances with number of elements the same as the number
of columns of \code{y}, for \code{find.matches}.  For \code{matchCases},
\code{tol} is a scalar tolerance.
}
\item{scale}{
a vector of scaling constants with number of elements the same as the
number of columns of \code{y}.
}
\item{maxmatch}{
maximum number of matches to allow.  For \code{matchCases},
maximum number of controls to match with a case (default is 20).  If more than
\code{maxmatch} matching controls are available, a random sample without
replacement of \code{maxmatch} controls is used (if \code{which="random"}).
}
\item{object}{an object created by \code{find.matches}}
\item{digits}{
number of digits to use in printing distances
}
\item{idcase}{
see \code{idcontrol}
}
\item{idcontrol}{
vectors the same length as \code{xcase} and \code{xcontrol} respectively,
specifying the id of cases and controls.  Defaults are integers
specifying original element positions within each of cases and
controls.
}
\item{maxobs}{
maximum number of cases and all matching controls combined (maximum
dimension of data frame resulting from \code{matchCases}).  Default is
ten times the maximum of the number of cases and number of controls.
\code{maxobs} is used to allocate space for the resulting data frame.
}
\item{which}{
set to \code{"closest"} (the default) to match cases with up to \code{maxmatch}
controls that most closely match on \code{x}.  Set \code{which="random"} to use
randomly chosen controls.  In either case, only those controls within
\code{tol} on \code{x} are allowed to be used.
}
\item{\dots}{unused}
}
\value{
\code{find.matches} returns a list of class \code{find.matches} with elements
\code{matches} and \code{distance}. 
Both elements are matrices with the number of rows equal to the number
of rows in \code{x}, and with \code{k} columns, where \code{k} is the maximum number of
matches (\code{<= maxmatch}) that occurred.  The elements of \code{matches}
are row identifiers of \code{y} that match, with zeros if fewer than
\code{maxmatch} matches are found (blanks if \code{y} had row names).
\code{matchCases} returns a data frame with variables \code{idcase} (id of case
currently being matched), \code{type} (factor variable with levels \code{"case"}
and \code{"control"}), \code{id} (id of case if case row, or id of matching
case), and \code{y}.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\references{
Ming K, Rosenbaum PR (2001): A note on optimal matching with variable
controls using the assignment algorithm.  J Comp Graph Stat
10:455--463.

Cepeda MS, Boston R, Farrar JT, Strom BL (2003): Optimal matching with a
variable number of controls vs. a fixed number of controls for a cohort
study: trade-offs.  J Clin Epidemiology 56:230--237.
Note: These papers were not used for the functions here but
probably should have been.

}
\seealso{
\code{\link{scale}}, \code{\link{apply}}
}
\examples{
y <- rbind(c(.1, .2),c(.11, .22), c(.3, .4), c(.31, .41), c(.32, 5))
x <- rbind(c(.09,.21), c(.29,.39))
y
x
w <- find.matches(x, y, maxmatch=5, tol=c(.05,.05))


set.seed(111)       # so can replicate results
x <- matrix(runif(500), ncol=2)
y <- matrix(runif(2000), ncol=2)
w <- find.matches(x, y, maxmatch=5, tol=c(.02,.03))
w$matches[1:5,]
w$distance[1:5,]
# Find first x with 3 or more y-matches
num.match <- apply(w$matches, 1, function(x)sum(x > 0))
j <- ((1:length(num.match))[num.match > 2])[1]
x[j,]
y[w$matches[j,],]


summary(w)


# For many applications would do something like this:
# attach(df1)
# x <- cbind(age, sex) # Just do as.matrix(df1) if df1 has no factor objects
# attach(df2)
# y <- cbind(age, sex)
# mat <- find.matches(x, y, tol=c(5,0)) # exact match on sex, 5y on age


# Demonstrate matchCases
xcase     <- c(1,3,5,12)
xcontrol  <- 1:6
idcase    <- c('A','B','C','D')
idcontrol <- c('a','b','c','d','e','f')
ycase     <- c(11,33,55,122)
ycontrol  <- c(11,22,33,44,55,66)
matchCases(xcase, ycase, idcase,
           xcontrol, ycontrol, idcontrol, tol=1)


# If y is a binary response variable, the following code
# will produce a Mantel-Haenszel summary odds ratio that 
# utilizes the matching.
# Standard variance formula will not work here because
# a control will match more than one case
# WARNING: The M-H procedure exemplified here is suspect 
# because of the small strata and widely varying number
# of controls per case.


x    <- c(1, 2, 3, 3, 3, 6, 7, 12,  1, 1:7)
y    <- c(0, 0, 0, 1, 0, 1, 1,  1,  1, 0, 0, 0, 0, 1, 1, 1)
case <- c(rep(TRUE, 8), rep(FALSE, 8))
id   <- 1:length(x)


m <- matchCases(x[case],  y[case],  id[case],
                x[!case], y[!case], id[!case], tol=1)
iscase <- m$type=='case'
# Note: the first tapply only ensures that event indicators are
# sorted by case id.  The second actually does something.
event.case    <- tapply(m$y[iscase],  m$idcase[iscase],  sum)
event.control <- tapply(m$y[!iscase], m$idcase[!iscase], sum)
n.control     <- tapply(!iscase,      m$idcase,          sum)
n             <- tapply(m$y,          m$idcase,          length)
or <- sum(event.case * (n.control - event.control) / n) /
      sum(event.control * (1 - event.case) / n)
or


# Bootstrap this estimator by sampling with replacement from
# subjects.  Assumes id is unique when combine cases+controls
# (id was constructed this way above).  The following algorithm
# puts all sampled controls back with the cases to whom they were
# originally matched.


ids <- unique(m$id)
idgroups <- split(1:nrow(m), m$id)
B   <- 50   # in practice use many more
ors <- numeric(B)
# Function to order w by ids, leaving unassigned elements zero
align <- function(ids, w) {
  z <- structure(rep(0, length(ids)), names=ids)
  z[names(w)] <- w
  z
}
for(i in 1:B) {
  j <- sample(ids, replace=TRUE)
  obs <- unlist(idgroups[j])
  u <- m[obs,]
  iscase <- u$type=='case'
  n.case <- align(ids, tapply(u$type, u$idcase, 
                              function(v)sum(v=='case')))
  n.control <- align(ids, tapply(u$type, u$idcase,
                                 function(v)sum(v=='control')))
  event.case <- align(ids, tapply(u$y[iscase],  u$idcase[iscase],  sum))
  event.control <- align(ids, tapply(u$y[!iscase], u$idcase[!iscase], sum))
  n <- n.case + n.control
  # Remove sets having 0 cases or 0 controls in resample
  s             <- n.case > 0 & n.control > 0
  denom <- sum(event.control[s] * (n.case[s] - event.case[s]) / n[s])
  or <- if(denom==0) NA else 
   sum(event.case[s] * (n.control[s] - event.control[s]) / n[s]) / denom
  ors[i] <- or
}
describe(ors)
}
\keyword{math}
\keyword{multivariate}
\keyword{htest}
\concept{bootstrap}
\concept{matching}
\concept{epidemiology}
\concept{case-control}

\eof
\name{first.word}
\alias{first.word}
\title{First Word in a String or Expression}
\description{
\code{first.word} finds the first word in an expression.  A word is defined by
unlisting the elements of the expression found by the S parser and then
accepting any elements whose first character is either a letter or period.
The principal intended use is for the automatic generation of temporary
file names where it is important to exclude special characters from
the file name. For Microsoft Windows, periods in names are deleted and
only up to the first 8 characters of the word are returned.
}
\usage{
first.word(x, i=1, expr=substitute(x))
}
\arguments{
\item{x}{
any scalar character string
}
\item{i}{
word number, default value = 1.  Used when the second or \code{i}th word is
wanted.  Currently only the \code{i=1} case is implemented.
}
\item{expr}{
any S object of mode \code{expression}.
}
}
\value{
a character string
}
\author{
Frank E. Harrell, Jr.,
\cr
Department of Biostatistics,
\cr
Vanderbilt University,
\cr
\code{f.harrell@vanderbilt.edu}


Richard M. Heiberger,
\cr
Department of Statistics,
\cr
Temple University, Philadelphia, PA.
\cr
\code{rmh@astro.ocis.temple.edu}
}
\examples{
first.word(expr=expression(y ~ x + log(w)))
}
\keyword{character}
\keyword{manip}

\eof
\name{format.df}
\alias{format.df}
\title{
Format a Data Frame or Matrix for LaTeX or HTML
}
\description{
\code{format.df} does appropriate rounding and decimal alignment, and outputs
a character matrix containing the formatted data.  If \code{x} is a
data.frame, then do each component separately.
If \code{x} is a matrix, but not a data.frame, make it a data.frame
with individual components for the columns.
If a component \code{x$x} is a matrix, then do all columns the same.
}

\usage{
format.df(x,
          digits, dec=NULL, rdec=NULL, cdec=NULL,
          numeric.dollar=cdot,
          na.blank=FALSE, na.dot=FALSE, blank.dot=FALSE,
          col.just=NULL, cdot=FALSE, dcolumn=FALSE, matrix.sep=' ', 
          scientific=c(-4,4), \dots)
}
\arguments{
\item{x}{
a matrix (usually numeric) or data frame
}
\item{digits}{
causes all values in the table to be formatted to \code{digits} significant
digits.  \code{dec} is usually preferred.
}
\item{dec}{
If \code{dec} is a scalar, all elements of the matrix will be rounded
to \code{dec} 
decimal places to the right of the decimal. \code{dec} can also be a matrix 
whose elements correspond to \code{x}, for customized rounding of each element.
A matrix \code{dec} must have number of columns equal to number of columns
of input \code{x}.
A scalar \code{dec} is expanded to a vector \code{cdec} with number of
items equal to number of columns of input \code{x}.
}
\item{rdec}{
a vector specifying the number of decimal places to the right for each row 
(\code{cdec} is more commonly used than \code{rdec}).
A vector \code{rdec} must have number of items equal to number of rows of input \code{x}.
\code{rdec} is expanded to matrix \code{dec}.
}
\item{cdec}{
a vector specifying the number of decimal places for each column.
The vector must have number of items equal to number of columns or components
of input x.
}
\item{cdot}{
Set to \code{TRUE} to use centered dots rather than ordinary periods in numbers.
The output uses a syntax appropriate for \code{latex}.
}
\item{na.blank}{
Set to \code{TRUE} to use blanks rather than \code{NA} for missing values.
This usually looks better in \code{latex}.
}
\item{dcolumn}{
Set to \code{TRUE} to use David Carlisle's \code{dcolumn} style for
decimal alignment in \code{latex}.
Default is \code{FALSE}. You will probably want to
use \code{dcolumn} if you use \code{rdec}, as a column may then contain varying
number of places to the right of the decimal. \code{dcolumn} can line up
all such numbers on the decimal point, with integer values right
justified at the decimal point location of numbers that actually
contain decimal places.  When you use \code{dcolumn=TRUE}, 
\code{numeric.dollar} is set by default to \code{FALSE}.  When you use \code{dcolumn=TRUE}, the
\code{"style"} element is set to \code{"dcolumn"} as the \code{latex} \code{\\usepackage}
must reference \code{[dcolumn]}.
The three files \code{dcolumn.sty}, \code{newarray.sty}, and
\code{array.sty} will 
need to be in a directory in your \code{$TEXINPUTS} path.
When you use \code{dcolumn=TRUE}, \code{numeric.dollar} should be set to \code{FALSE}.
}
\item{numeric.dollar}{
logical, default \code{!dcolumn}.  Set to \code{TRUE} to place dollar
signs around numeric values when \code{dcolumn=FALSE}.  This 
assures that \code{latex} will use minus signs rather than hyphens to indicate
negative numbers.  Set to \code{FALSE} when \code{dcolumn=TRUE}, as
\code{dcolumn.sty} automatically uses minus signs.
}
\item{na.dot}{
Set to \code{TRUE} to use periods rather than \code{NA} for missing
numeric values. 
This works with the \code{sas} convention that periods indicate missing values.
}
\item{blank.dot}{
Set to \code{TRUE} to use periods rather than blanks for missing character values.
This works with the \code{sas} convention that periods indicate missing values.
}
\item{col.just}{
  Input vector \code{col.just} must have number of columns equal to
  number of columns of the output matrix.  When \code{NULL}, the
  default, the \code{col.just} attribute of the result is set to
  \code{"l"} for character columns and to \code{"r"} for numeric
  columns.  The user can override the default by an argument vector
  whose length is equal to the number of columns of the result matrix.
  When \code{format.df} is called by \code{latex.default}, the
  \code{col.just} is used as the \code{cols} argument to the
  \code{tabular} environment and the letters \code{"l"}, \code{"r"},
  and \code{"c"} are valid values.  When \code{format.df} is called by
  \code{sas}, the \code{col.just} is used to determine whether a
  \code{$} is needed on the \code{input} line of the \code{sysin} file,
  and the letters \code{"l"} and \code{"r"} are valid values.  }
\item{matrix.sep}{
When \code{x} is a data frame containing a matrix, so that new column names
are constructed from the name of the matrix object and the names of
the individual columns of the matrix, \code{matrix.sep} specifies the
character to use to separate object names from individual column
names.
}
\item{scientific}{
specifies ranges of exponents (or a logical vector) specifying values
not to convert to scientific notation.  See \code{format.default} for details.
}
\item{...}{
other arguments are accepted and ignored.  For \code{latexVerbatim} these
arguments are passed to the \code{print} function.
}
}
\value{
a character matrix with character images of properly rounded \code{x}.
Matrix components of input \code{x} are now just sets of columns of
character matrix.
\code{attr(,col.just)} repeats the input \code{col.just} when provided,
otherwise, it includes the recommended justification for columns of output.
See the discussion of the argument \code{col.just}.
The default justification is \code{"l"} for characters and factors,
\code{"r"} for numeric.
When \code{dcolumn==TRUE}, numerics will have \code{"."} as the justification character.
}

\author{
Frank E. Harrell, Jr.,
\cr
Department of Biostatistics,
\cr
Vanderbilt University,
\cr
\code{f.harrell@vanderbilt.edu}


Richard M. Heiberger,
\cr
Department of Statistics,
\cr
Temple University, Philadelphia, PA.
\cr
\code{rmh@astro.ocis.temple.edu}


}
\seealso{
\code{\link{latex}}
}
\examples{
x <- data.frame(a=1:2, b=3:4)
x$m <- matrix(5:8,nrow=2)
names(x)
dim(x)
x
format.df(x)
dim(format.df(x))
}
\keyword{utilities}
\keyword{interface}
\keyword{methods}
\keyword{file}
\keyword{character}
\keyword{manip}

\eof
\name{gbayes}
\alias{gbayes}
\alias{plot.gbayes}
\alias{gbayes2}
\alias{gbayesMixPredNoData}
\alias{gbayesMixPost}
\alias{gbayesMixPowerNP}
\alias{gbayes1PowerNP}
\title{
Gaussian Bayesian Posterior and Predictive Distributions
}
\description{
\code{gbayes} derives the (Gaussian) posterior and optionally the predictive
distribution when both the prior and the likelihood are Gaussian, and
when the statistic of interest comes from a 2-sample problem.
This function is especially useful in obtaining the expected power of
a statistical test, averaging over the distribution of the population
effect parameter (e.g., log hazard ratio) that is obtained using
pilot data.  \code{gbayes} is also useful for summarizing studies for
which the statistic of interest is approximately Gaussian with
known variance.  An example is given for comparing two proportions
using the angular transformation, for which the variance is
independent of unknown parameters except for very extreme probabilities.
A \code{plot} method is also given.  This plots the prior, posterior, and
predictive distributions on a single graph using a nice default for
the x-axis limits and using the \code{labcurve} function for automatic
labeling of the curves.


\code{gbayes2} uses the method of Spiegelhalter and Freedman (1986) to compute the
probability of correctly concluding that a new treatment is superior
to a control.  By this we mean that a 1-\code{alpha} normal
theory-based confidence interval for the new minus old treatment
effect lies wholly to the right of \code{delta.w}, where \code{delta.w} is the
minimally worthwhile treatment effect (which can be zero to be
consistent with ordinary null hypothesis testing, a method not always
making sense).  This kind of power function is averaged over a prior
distribution for the unknown treatment effect.  This procedure is
applicable to the situation where a prior distribution is not to be
used in constructing the test statistic or confidence interval, but is
only used for specifying the distribution of \code{delta}, the parameter of
interest.


Even though \code{gbayes2}
assumes that the test statistic has a normal distribution with known
variance (which is strongly a function of the sample size in the two
treatment groups), the prior distribution function can be completely
general.  Instead of using a step-function for the prior distribution
as Spiegelhalter and Freedman used in their appendix, \code{gbayes2} uses
the built-in \code{integrate} function for numerical integration.
\code{gbayes2} also allows the variance of the test statistic to be general
as long as it is evaluated by the user.  The conditional power given the
parameter of interest \code{delta} is \code{1 - pnorm((delta.w - delta)/sd + z)}, where z
is the normal critical value corresponding to 1 - \code{alpha}/2.

\code{gbayesMixPredNoData} derives the predictive distribution of a
statistic that is Gaussian given \code{delta} when no data have yet been
observed and when the prior is a mixture of two Gaussians.

\code{gbayesMixPost} derives the posterior density or cdf of \code{delta} given
the statistic \code{x}, when the prior for \code{delta} is a mixture of two
Gaussians and when \code{x} is Gaussian given \code{delta}.

\code{gbayesMixPowerNP} computes the power for a test for \code{delta} > \code{delta.w}
for the case where (1) a Gaussian prior or mixture of two Gaussian priors
is used as the prior distribution, (2) this prior is used in forming
the statistical test or credible interval, (3) no prior is used for
the distribution of \code{delta} for computing power but instead a fixed
single \code{delta} is given (as in traditional frequentist hypothesis
tests), and (4) the test statistic has a Gaussian likelihood with
known variance (and mean equal to the specified \code{delta}).
\code{gbayesMixPowerNP} is handy where you want to use an earlier study in
testing for treatment effects in a new study, but you want to mix with
this prior a non-informative prior.  The mixing probability \code{mix} can
be thought of as the "applicability" of the previous study.  As with
\code{gbayes2}, power here means the probability that the new study will
yield a left credible interval that is to the right of \code{delta.w}.
\code{gbayes1PowerNP} is a special case of \code{gbayesMixPowerNP} when the
prior is a single Gaussian.
}
\usage{
gbayes(mean.prior, var.prior, m1, m2, stat, var.stat, 
       n1, n2, cut.prior, cut.prob.prior=0.025)

\method{plot}{gbayes}(x, xlim, ylim, name.stat='z', \dots)

gbayes2(sd, prior, delta.w=0, alpha=0.05, upper=Inf, prior.aux)

gbayesMixPredNoData(mix=NA, d0=NA, v0=NA, d1=NA, v1=NA,
                    what=c('density','cdf'))

gbayesMixPost(x=NA, v=NA, mix=1, d0=NA, v0=NA, d1=NA, v1=NA,
              what=c('density','cdf'))

gbayesMixPowerNP(pcdf, delta, v, delta.w=0, mix, interval,
                 nsim=0, alpha=0.05)

gbayes1PowerNP(d0, v0, delta, v, delta.w=0, alpha=0.05)
}
\arguments{
\item{mean.prior}{
mean of the prior distribution
}
\item{cut.prior}{see \code{var.prior}}
\item{cut.prob.prior}{see \code{var.prior}}
\item{var.prior}{
variance of the prior.  Use a large number such as 10000 to effectively
use a flat (noninformative) prior.  Sometimes it is useful to compute
the variance so that the prior probability that \code{stat} is greater than
some impressive value \code{u} is only \code{alpha}.  The correct
\code{var.prior} to use is then \code{((u-mean.prior)/qnorm(1-alpha))^2}.
You can specify \code{cut.prior=u} and \code{cut.prob.prior=alpha} (whose default is 0.025)
in place of \code{var.prior} to have \code{gbayes} compute the prior variance in this
manner. 
}
\item{m1}{
sample size in group 1
}
\item{m2}{
sample size in group 2
}
\item{stat}{
statistic comparing groups 1 and 2, e.g., log hazard ratio, difference
in means, difference in angular transformations of proportions
}
\item{var.stat}{
variance of \code{stat}, assumed to be known.  \code{var.stat} should either
be a constant (allowed if \code{n1} is not specified), or a function of
two arguments which specify the sample sizes in groups 1 and 2. 
Calculations will be approximate when the variance is estimated from the data.
}
\item{x}{
an object returned by \code{gbayes} or the value of the statistic which
is an estimator of delta, the parameter of interest
}
\item{sd}{
the standard deviation of the treatment effect
}
\item{prior}{
a function of possibly a vector of unknown treatment effects,
returning the prior density at those values
}
\item{pcdf}{
a function computing the posterior CDF of the treatment effect
\code{delta}, such as a function created by \code{gbayesMixPost} with
\code{what="cdf"}.
}
\item{delta}{
a true unknown single treatment effect to detect
}
\item{v}{
the variance of the statistic \code{x}, e.g., \code{s^2 * (1/n1 + 1/n2)}.
Neither \code{x} nor \code{v} need to be defined to
\code{gbayesMixPost}, as they can be defined at run time to the function
created by \code{gbayesMixPost}.
}
\item{n1}{
number of future observations in group 1, for obtaining a predictive
distribution
}
\item{n2}{
number of future observations in group 2
}
\item{xlim}{
vector of 2 x-axis limits.  Default is the mean of the posterior plus or
minus 6 standard deviations of the posterior.
}
\item{ylim}{
vector of 2 y-axis limits.  Default is the range over combined prior and 
posterior densities.
}
\item{name.stat}{
label for x-axis.  Default is \code{"z"}.
}
\item{...}{
optional arguments passed to \code{labcurve} from \code{plot.gbayes}
}
\item{delta.w}{
the minimum worthwhile treatment difference to detect.  The default is
zero for a plain uninteresting null hypothesis.
}
\item{alpha}{
type I error, or more accurately one minus the confidence level for a
two-sided confidence limit for the treatment effect
}
\item{upper}{
upper limit of integration over the prior distribution multiplied by
the normal likelihood for the treatment effect statistic.  Default is
infinity.
}
\item{prior.aux}{
argument to pass to \code{prior} from \code{integrate} through \code{gbayes2}.
Inside of \code{power} the argument must be named \code{prior.aux} if it
exists.  You can pass multiple parameters by passing \code{prior.aux} as a
list and pulling off elements of the list inside \code{prior}.  This setup
was used because of difficulties in passing \code{\dots} arguments through
\code{integrate} for some situations.
}
\item{mix}{
mixing probability or weight for the Gaussian prior having mean \code{d0}
and variance \code{v0}.  \code{mix} must be between 0 and 1, inclusive.
}
\item{d0}{
mean of the first Gaussian distribution (only Gaussian for
\code{gbayes1PowerNP} and is a required argument)
}
\item{v0}{
variance of the first Gaussian (only Gaussian for
\code{gbayes1PowerNP} and is a required argument)
}
\item{d1}{
mean of the second Gaussian (if \code{mix} < 1)
}
\item{v1}{
variance of the second Gaussian (if \code{mix} < 1).  Any of these last 5
arguments can be omitted to \code{gbayesMixPredNoData} as they can be
provided at run time to the function created by \code{gbayesMixPredNoData}.
}
\item{what}{
specifies whether the predictive density or the CDF is to be
computed.  Default is \code{"density"}.
}
\item{interval}{
a 2-vector containing the lower and upper limit for possible values of
the test statistic \code{x} that would result in a left credible interval
exceeding \code{delta.w} with probability 1-\code{alpha}/2
}
\item{nsim}{
defaults to zero, causing \code{gbayesMixPowerNP} to solve numerically for the
critical value of \code{x}, then to compute the power accordingly.  Specify
a nonzero number such as 20000 for \code{nsim} to instead have the function
estimate power by simulation.  In this case 0.95 confidence limits on
the estimated power are also computed.  This approach is sometimes
necessary if \code{uniroot} can't solve the equation for the critical value.
}}
\value{
\code{gbayes} returns a list of class \code{"gbayes"} containing the following
named elements: \code{mean.prior},\code{var.prior},\code{mean.post}, \code{var.post}, and
if \code{n1} is specified, \code{mean.pred} and \code{var.pred}.  Note that
\code{mean.pred} is  identical to \code{mean.post}.  \code{gbayes2} returns a single
number which is the probability of correctly rejecting the null
hypothesis in favor of the new treatment.  \code{gbayesMixPredNoData}
returns a function that can be used to evaluate the predictive density
or cumulative distribution.  \code{gbayesMixPost} returns a function that
can be used to evaluate the posterior density or cdf.  \code{gbayesMixPowerNP}
returns a vector containing two values if \code{nsim} = 0.  The first value is the
critical value for the test statistic that will make the left credible
interval > \code{delta.w}, and the second value is the power.  If \code{nsim} > 0,
it returns the power estimate and confidence limits for it.
The examples show how to use these functions.  
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University School of Medicine
\cr
f.harrell@vanderbilt.edu
}
\references{
Spiegelhalter DJ, Freedman LS, Parmar MKB (1994): Bayesian approaches to
randomized trials.  JRSS A 157:357--416.  Results for \code{gbayes} are derived from
Equations 1, 2, 3, and 6.


Spiegelhalter DJ, Freedman LS (1986): A predictive approach to
selecting the size of a clinical trial, based on subjective clinical
opinion.  Stat in Med 5:1--13.


Joseph, Lawrence and Belisle, Patrick (1997): Bayesian sample size
determination for normal means and differences between normal means.
The Statistician 46:209--226.
}
\examples{
# Compare 2 proportions using the var stabilizing transformation
# arcsin(sqrt((x+3/8)/(n+3/4))) (Anscombe), which has variance 
# 1/[4(n+.5)]


m1 <- 100;     m2 <- 150
deaths1 <- 10; deaths2 <- 30


f <- function(events,n) asin(sqrt((events+3/8)/(n+3/4)))
stat <- f(deaths1,m1) - f(deaths2,m2)
var.stat <- function(m1, m2) 1/4/(m1+.5) + 1/4/(m2+.5)
cat("Test statistic:",format(stat),"  s.d.:",
    format(sqrt(var.stat(m1,m2))), "\n")
#Use unbiased prior with variance 1000 (almost flat)
b <- gbayes(0, 1000, m1, m2, stat, var.stat, 2*m1, 2*m2)
print(b)
plot(b)
#To get posterior Prob[parameter > w] use 
# 1-pnorm(w, b$mean.post, sqrt(b$var.post))


#If g(effect, n1, n2) is the power function to
#detect an effect of 'effect' with sample sizes for groups 1 and 2
#of n1,n2, estimate the expected power by getting 1000 random
#draws from the posterior distribution, computing power for
#each value of the population effect, and averaging the 1000 powers
#This code assumes that g will accept vector-valued 'effect'
#For the 2-sample proportion problem just addressed, 'effect'
#could be taken approximately as the change in the arcsin of
#the square root of the probability of the event


g <- function(effect, n1, n2, alpha=.05) {
  sd <- sqrt(var.stat(n1,n2))
  z <- qnorm(1 - alpha/2)
  effect <- abs(effect)
  1 - pnorm(z - effect/sd) + pnorm(-z - effect/sd)
}


effects <- rnorm(1000, b$mean.post, sqrt(b$var.post))
powers <- g(effects, 500, 500)
hist(powers, nclass=35, xlab='Power')
describe(powers)




# gbayes2 examples
# First consider a study with a binary response where the
# sample size is n1=500 in the new treatment arm and n2=300
# in the control arm.  The parameter of interest is the 
# treated:control log odds ratio, which has variance
# 1/[n1 p1 (1-p1)] + 1/[n2 p2 (1-p2)].  This is not
# really constant so we average the variance over plausible
# values of the probabilities of response p1 and p2.  We
# think that these are between .4 and .6 and we take a 
# further short cut


v <- function(n1, n2, p1, p2) 1/(n1*p1*(1-p1)) + 1/(n2*p2*(1-p2))
n1 <- 500; n2 <- 300
ps <- seq(.4, .6, length=100)
vguess <- quantile(v(n1, n2, ps, ps), .75)
vguess
#        75\% 
# 0.02183459


# The minimally interesting treatment effect is an odds ratio
# of 1.1.  The prior distribution on the log odds ratio is
# a 50:50 mixture of a vague Gaussian (mean 0, sd 100) and
# an informative prior from a previous study (mean 1, sd 1)


prior <- function(delta) 
  0.5*dnorm(delta, 0, 100)+0.5*dnorm(delta, 1, 1)
deltas <- seq(-5, 5, length=150)
plot(deltas, prior(deltas), type='l')


# Now compute the power, averaged over this prior
gbayes2(sqrt(vguess), prior, log(1.1))
# [1] 0.6133338


# See how much power is lost by ignoring the previous
# study completely


gbayes2(sqrt(vguess), function(delta)dnorm(delta, 0, 100), log(1.1))
# [1] 0.4984588


# What happens to the power if we really don't believe the treatment
# is very effective?  Let's use a prior distribution for the log
# odds ratio that is uniform between log(1.2) and log(1.3).
# Also check the power against a true null hypothesis


prior2 <- function(delta) dunif(delta, log(1.2), log(1.3))
gbayes2(sqrt(vguess), prior2, log(1.1))
# [1] 0.1385113


gbayes2(sqrt(vguess), prior2, 0)
# [1] 0.3264065


# Compare this with the power of a two-sample binomial test to
# detect an odds ratio of 1.25
bpower(.5, odds.ratio=1.25, n1=500, n2=300)
#     Power 
# 0.3307486


# For the original prior, consider a new study with equal
# sample sizes n in the two arms.  Solve for n to get a
# power of 0.9.  For the variance of the log odds ratio
# assume a common p in the center of a range of suspected
# probabilities of response, 0.3.  For this example we
# use a zero null value and the uniform prior above


v   <- function(n) 2/(n*.3*.7)
pow <- function(n) gbayes2(sqrt(v(n)), prior2)
uniroot(function(n) pow(n)-0.9, c(50,10000))$root
# [1] 2119.675
# Check this value
pow(2119.675)
# [1] 0.9


# Get the posterior density when there is a mixture of two priors,
# with mixing probability 0.5.  The first prior is almost
# non-informative (normal with mean 0 and variance 10000) and the
# second has mean 2 and variance 0.3.  The test statistic has a value
# of 3 with variance 4.
f <- gbayesMixPost(3, 4, mix=0.5, d0=0, v0=10000, d1=2, v1=0.3)


args(f)


# Plot this density
delta <- seq(-2, 6, length=150)
plot(delta, f(delta), type='l')


# Add to the plot the posterior density that used only
# the almost non-informative prior
lines(delta, f(delta, mix=1), lty=2)


# The same but for an observed statistic of zero
lines(delta, f(delta, mix=1, x=0), lty=3)


# Derive the CDF instead of the density
g <- gbayesMixPost(3, 4, mix=0.5, d0=0, v0=10000, d1=2, v1=0.3,
                   what='cdf')
# Had mix=0 or 1, gbayes1PowerNP could have been used instead
# of gbayesMixPowerNP below


# Compute the power to detect an effect of delta=1 if the variance
# of the test statistic is 0.2
gbayesMixPowerNP(g, 1, 0.2, interval=c(-10,12))


# Do the same thing by simulation
gbayesMixPowerNP(g, 1, 0.2, interval=c(-10,12), nsim=20000)


# Compute by what factor the sample size needs to be larger
# (the variance needs to be smaller) so that the power is 0.9
ratios <- seq(1, 4, length=50)
pow <- single(50)
for(i in 1:50) 
  pow[i] <- gbayesMixPowerNP(g, 1, 0.2/ratios[i], interval=c(-10,12))[2]


# Solve for ratio using reverse linear interpolation
approx(pow, ratios, xout=0.9)$y


# Check this by computing power
gbayesMixPowerNP(g, 1, 0.2/2.1, interval=c(-10,12))
# So the study will have to be 2.1 times as large as earlier thought
}
\keyword{htest}
\concept{study design}
\concept{power}


\eof
\name{getHdata}
\alias{getHdata}
\title{Download and Install Datasets for Hmisc, Design, and Statistical
  Modeling} 
\description{
This function downloads and makes ready to use datasets from the main
web site for the Hmisc and Design libraries.  For R, the datasets were
stored in compressed \code{save} format and \code{getHdata} makes them
available by running \code{load()} after download.  For S-Plus, the
datasets were stored in \code{data.dump} format and are made available
by running \code{data.restore()} after import.  The dataset is run through the
\code{cleanup.import} function to reduce multiple inheritance problems
for SV4 (S-Plus 5 or later).  Calling \code{getHdata} with no
\code{file} argument provides a character vector of names of available
datasets that are currently on the web site.  For R, R's default browser
can optionally be launched to view \code{html} files that were already
prepared using the Hmisc command \code{html(contents( ))} or to view
\code{.txt} or \code{.html} data description files when available.
}
\usage{
getHdata(file, what = c("data", "contents", "description", "all"),
         where="http://biostat.mc.vanderbilt.edu/twiki/pub/Main/DataSets")
}
\arguments{
  \item{file}{an unquoted name of a dataset on the web site,
	e.g. \code{prostate}.  Omit \code{file} to obtain a list of
	available datasets.}
  \item{what}{specify \code{what="contents"} to browse the contents
	(metadata) for the dataset rather than fetching the data
	themselves.  Specify \code{what="description"} to browse a data
	description file if available.  Specify \code{what="all"} to
	retrieve the data and see the metadata and description.}
  \item{where}{URL containing the data and metadata files}
}
\details{
For S-Plus, Hmisc defines a function \code{download.file} that is used
by \code{getHdata}.  This is a stripped-down version of the \R
\code{download.file} function that uses the system \code{wget}
executable for fetching files from the Internet.  For Unix and Linux
systems, \code{wget} will usually be pre-installed.  For Windows
S-Plus systems, get \code{wget} from
\url{ftp://sunsite.dk/projects/wget/windows}.  Once you unzip the file
from there, move \code{wget.exe} to the same Windows directory that
contains \code{ftp.exe}.
}
\value{
  \code{getHdata()} without a \code{file} argument returns a character
  vector of dataset base names.  When a dataset is downloaded, the data
  frame is placed in search position one and is not returned as value of
  \code{getHdata}.
}
\author{Frank Harrell}
\seealso{\code{\link{download.file}}, \code{\link{cleanup.import}},
  \code{\link{data.restore}}, \code{\link{load}}}

\examples{
\dontrun{
getHdata()          # download list of available datasets
getHdata(prostate)  # downloads, load( ) or data.restore( )
                    # runs cleanup.import for S-Plus 6
getHdata(valung, "contents")   # open browser (options(browser="whatever"))
                    # after downloading valung.html
                    # (result of html(contents()))
getHdata(support, "all")  # download and open one browser window
datadensity(support)
attach(support)     # make individual variables available
getHdata(plasma,  "all")  # download and open two browser windows
                          # (description file is available for plasma)
}
}
\keyword{interface}
\keyword{data}

\eof
\name{hdquantile}
\alias{hdquantile}
\title{Harrell-Davis Distribution-Free Quantile Estimator}
\description{
Computes the Harrell-Davis (1982) quantile estimator and jackknife
standard errors of quantiles.  The quantile estimator is a weighted
linear combination of order statistics in which the order statistics
used in traditional nonparametric quantile estimators are given the
greatest weight.  In small samples the H-D estimator is more efficient
than traditional ones, and the two methods are asymptotically
equivalent.  The H-D estimator is the limit of a bootstrap average as
the number of bootstrap resamples becomes infinitely large.
}
\usage{
hdquantile(x, probs = seq(0, 1, 0.25),
           se = FALSE, na.rm = FALSE, names = TRUE, weights=FALSE)
}
\arguments{
  \item{x}{a numeric vector}
  \item{probs}{vector of quantiles to compute}
  \item{se}{set to \code{TRUE} to also compute standard errors}
  \item{na.rm}{set to \code{TRUE} to remove \code{NA}s from \code{x}
	before computing quantiles}
  \item{names}{set to \code{FALSE} to prevent names attributes from
	being added to quantiles and standard errors}
  \item{weights}{set to \code{TRUE} to return a \code{"weights"}
	attribute with the matrix of weights used in the H-D estimator
	corresponding to order statistics, with columns corresponding to
	quantiles.}
}
\details{
A Fortran routine is used to compute the jackknife leave-out-one
quantile estimates.  Standard errors are not computed for quantiles 0 or
1 (\code{NA}s are returned).
}
\value{
  A vector of quantiles.  If \code{se=TRUE} this vector will have an
  attribute \code{se} added to it, containing the standard errors.  If
  \code{weights=TRUE}, also has a \code{"weights"} attribute which is a matrix.
}
\references{
  Harrell FE, Davis CE (1982): A new distribution-free quantile
  estimator.  Biometrika 69:635-640.

  Hutson AD, Ernst MD (2000): The exact bootstrap mean and variance of
  an L-estimator.  J Roy Statist Soc B 62:89-94.
}
\author{Frank Harrell}
\seealso{\code{\link{quantile}}}
\examples{
set.seed(1)
x <- runif(100)
hdquantile(x, (1:3)/4, se=TRUE)

\dontrun{
# Compare jackknife standard errors with those from the bootstrap
library(boot)
boot(x, function(x,i) hdquantile(x[i], probs=(1:3)/4), R=400)
}
}
\keyword{univar}

\eof
\name{hist.data.frame}
\alias{hist.data.frame}
\title{Histograms for Variables in a Data Frame}
\description{
This function tries to compute the maximum number of histograms that
will fit on one page, then it draws a matrix of histograms.  If there
are more qualifying variables than will fit on a page, the function
waits for a mouse click before drawing the next page.
}
\usage{
\method{hist}{data.frame}(x, n.unique = 3, nclass = "compute",
                na.big = FALSE, rugs = FALSE, mtitl = FALSE, ...)
# For S-Plus you must use hist.data.frame( ) as hist is not generic there
}
\arguments{
  \item{x}{a data frame}
  \item{n.unique}{minimum number of unique values a variable must have
	before a histogram is drawn}
  \item{nclass}{number of bins.  Default is
	max(2,trunc(min(n/10,25*log(n,10))/2)), where n is the number of
	non-missing values for a variable.}
  \item{na.big}{set to \code{TRUE} to draw the number of missing values
	on the top of the histogram in addition to in a subtitle.  In the
	subtitle, n is the number of non-missing values and m is the number
	of missing values}
  \item{rugs}{set to \code{TRUE} to add rug plots at the top of each histogram}
  \item{mtitl}{set to a character string to set aside extra outside top
	margin and to use the string for an overall title}
  \item{\dots}{arguments passed to \code{scat1d}}
}
\value{the number of pages drawn}
\author{Frank E Harrell Jr}
\seealso{\code{\link{hist}}, \code{\link{scat1d}}}
\examples{
d <- data.frame(a=runif(200), b=rnorm(200),
                w=factor(sample(c('green','red','blue'), 200, TRUE)))
hist.data.frame(d)   # in R, just say hist(d)
}
\keyword{hplot}
\keyword{dplot}
\keyword{distribution}

\eof
\name{histbackback}
\alias{histbackback}
\title{
Back to Back Histograms
}
\description{
Takes two vectors or a list with \code{x} and \code{y} components, and produces 
back to back histograms of the two datasets.
}
\usage{
histbackback(x, y, brks=NULL, xlab=NULL, axes=TRUE, probability=FALSE,
             xlim=NULL, ylab='', \dots)
}
\arguments{
\item{x,y}{
either two vectors or a list given as \code{x} with two components.  If the
components have names, they will be used to label the axis
(modification FEH).
}
\item{brks}{
vector of the desired breakpoints for the histograms.
}
\item{xlab}{
a vector of two character strings naming the two datasets.
}
\item{axes}{
logical flag stating whether or not to label the axes.
}
\item{probability}{
logical flag: if \code{TRUE}, then the x-axis corresponds to the units for a
density.  If \code{FALSE}, then the units are counts.
}
\item{xlim}{
x-axis limits.  First value must be negative, as the left histogram is
placed at negative x-values.  Second value must be positive, for the
right histogram.  To make the limits symmetric, use e.g. \code{xlim=c(-20,20)}.
}
\item{ylab}{
label for y-axis.  Default is no label.
}
\item{...}{
additional graphics parameters may be given.
}}
\value{
a list is returned invisibly with the following components:

\item{left}{
the counts for the dataset plotted on the left.
}
\item{right}{
the counts for the dataset plotted on the right.
}
\item{breaks}{
the breakpoints used.
}}
\section{Side Effects}{
a plot is produced on the current graphics device.
}
\author{
Pat Burns
\cr
Salomon Smith Barney
\cr
London
\cr
pburns@dorado.sbi.com
}
\seealso{
\code{\link{hist}}, \code{\link{histogram}}
}
\examples{
options(digits=3)
set.seed(1)
histbackback(rnorm(20), rnorm(30))


fool <- list(x=rnorm(40), y=rnorm(40))
histbackback(fool)
age <- rnorm(1000,50,10)
sex <- sample(c('female','male'),1000,TRUE)
histbackback(split(age, sex))
agef <- age[sex=='female']; agem <- age[sex=='male']
histbackback(list(Female=agef,Male=agem), probability=TRUE, xlim=c(-.06,.06))
}
\keyword{dplot}
\keyword{hplot}
\keyword{distribution}
% Converted by Sd2Rd version 1.21.

\eof
\name{hoeffd}
\alias{hoeffd}
\alias{print.hoeffd}
\title{
Matrix of Hoeffding's D Statistics
}
\description{
Computes a matrix of Hoeffding's (1948) \code{D} statistics for all possible
pairs of columns of a matrix.  \code{D}
is a measure of the distance
between \code{F(x,y)} and \code{G(x)H(y)}, where \code{F(x,y)} is the joint CDF of \code{X} and \code{Y},
and \code{G} and \code{H} are marginal CDFs. Missing values are deleted in pairs rather than deleting all rows
of \code{x} having any missing variables.
The \code{D} statistic is robust against a wide
variety of alternatives to independence, such as non-monotonic relationships.
The larger the value of \code{D}, the more dependent are \code{X} and \code{Y} (for many types
of dependencies).  \code{D} used here is 30 times Hoeffding's original \code{D}, and
ranges from -0.5 to 1.0 if there are no ties in the data.
\code{print.hoeffd} prints the information derived by \code{hoeffd}.  The higher
the value of \code{D}, the more dependent are \code{x} and \code{y}.
}
\synopsis{hoeffd(x, y)}
\usage{
hoeffd(x)
hoeffd(x, y)
\method{print}{hoeffd}(x, \dots)
}
\arguments{
\item{x}{
a numeric matrix with at least 5 rows and at least 2 columns (if
\code{y} is absent), or an object created by \code{hoeffd}
}
\item{y}{
a numeric vector or matrix which will be concatenated to \code{x}
}
\item{\dots}{ignored}
}
\value{
a list with elements \code{D}, the
matrix of D statistics, \code{n} the
matrix of number of observations used in analyzing each pair of variables,
and \code{P}, the asymptotic P-values.
Pairs with fewer than 5 non-missing values have the D statistic set to NA.
The diagonals of \code{n} are the number of non-NAs for the single variable
corresponding to that row and column.
}
\details{
Uses midranks in case of ties, as described by Hollander and Wolfe.
P-values are approximated by linear interpolation on the table
in Hollander and Wolfe, which uses the asymptotically equivalent
Blum-Kiefer-Rosenblatt statistic.  For \code{P<.0001} or \code{>0.5}, \code{P} values are
computed using a well-fitting linear regression function in \code{log P} vs.
the test statistic.
Ranks (but not bivariate ranks) are computed using efficient
algorithms (see reference 3).
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\references{
Hoeffding W. (1948): A non-parametric test of independence.  Ann Math Stat
19:546--57.


Hollander M. and Wolfe D.A. (1973).  Nonparametric Statistical Methods,
pp. 228--235, 423. New York: Wiley.


Press WH, Flannery BP, Teukolsky SA, Vetterling, WT (1988): Numerical
Recipes in C.  Cambridge: Cambridge University Press.
}
\seealso{
\code{\link{rcorr}}, \code{\link{varclus}}
}
\examples{
x <- c(-2, -1, 0, 1, 2)
y <- c(4,   1, 0, 1, 4)
z <- c(1,   2, 3, 4, NA)
q <- c(1,   2, 3, 4, 5)
hoeffd(cbind(x,y,z,q))


# Hoeffding's test can detect even one-to-many dependency
set.seed(1)
x <- seq(-10,10,length=200)
y <- x*sign(runif(200,-1,1))
plot(x,y)
hoeffd(x,y)
}
\keyword{nonparametric}
\keyword{htest}
% Converted by Sd2Rd version 1.21.

\eof
\name{html}
\alias{html}
\alias{html.latex}
\alias{html.data.frame}
\alias{html.default}
\alias{show.html}
\alias{print.html}
\title{
Convert an S object to HTML
}
\description{
\code{html} is a generic function, for which only two methods are currently
implemented, \code{html.latex} and a rudimentary
\code{html.data.frame}.  The former uses the \code{HeVeA} LaTeX to HTML 
translator by Maranget to create an HTML file from a LaTeX file like
the one produced by \code{latex}.  The resulting HTML file may be
displayed using a \code{show} or a \code{print} method.  The browser
specified in \code{options(browser=)} for R (\code{help.browser} for
S-Plus) is launched to display the HTML file.   \code{html.default} just
runs \code{html.data.frame}.
}
\usage{
html(object, \dots)
\method{html}{latex}(object, ...)
\method{html}{data.frame}(object,
  file=paste(first.word(deparse(substitute(object))),'html',sep='.'),
     append=FALSE, link=NULL, linkCol=1, linkType=c('href','name'), \dots)
\method{html}{default}(object,
     file=paste(first.word(deparse(substitute(object))),'html',sep='.'),
     append=FALSE, link=NULL, linkCol=1, linkType=c('href','name'), \dots)
\method{print}{html}(x, ...)
\method{show}{html}(object)
}
\arguments{
\item{object}{a data frame or an object created by \code{latex}.  For
  \code{show} is an object created by \code{html}.  For the generic
  \code{html} is any object for which an \code{html} method exists.}
\item{file}{
name of the file to create.  The default file
name is \code{object.html} where \code{object} is the first word in
the name of the argument for \code{object}.
}
\item{append}{set to \code{TRUE} to append to an existing file}
\item{link}{character vector specifying hyperlink names to attach to
  selected elements of the matrix or data frame.  No hyperlinks are used
  if \code{link} is omitted or for elements of \code{link} that are
  \code{""}.}
\item{linkCol}{column number of \code{object} to which hyperlinks are
  attached.  Defaults to first column.}
\item{linkType}{defaults to \code{"href"}}
\item{...}{arguments passed to \code{format.df}}
\item{x}{an object created by \code{html}}
}
\section{Side Effects}{
\code{print} or \code{show} launch a browser
}
\author{
Frank E. Harrell, Jr.
\cr
Department of Biostatistics,
\cr
Vanderbilt University,
\cr
\email{f.harrell@vanderbilt.edu}
}
\references{
Maranget, Luc.  HeVeA: a LaTeX to HTML translator.
URL: \url{http://para.inria.fr/~maranget/hevea/}
}
\seealso{
\code{\link{latex}}
}
\examples{
\dontrun{
x <- matrix(1:6, nrow=2, dimnames=list(c('a','b'),c('c','d','e')))
w <- latex(x)
h <- html(w) # run HeVeA to convert .tex to .html
h <- html(x) # convert x directly to html
options(browser='konqueror')  # use help.browser for S-Plus
h            # launch html browser by running print.html
w <- html(x, link=c('','B'))   # hyperlink first row first col to B
}
}
\keyword{utilities}
\keyword{interface}
\keyword{methods}
\keyword{file}
\keyword{character}
\keyword{manip}

\eof
\name{impute}
\alias{impute}
\alias{impute.default}
\alias{print.impute}
\alias{summary.impute}
\alias{[.impute}
\alias{is.imputed}
\title{
Generic Functions and Methods for Imputation
}
\description{
These functions do simple and \code{transcan} 
imputation and print, summarize, and subscript
variables that have NAs filled-in with imputed values.  The simple
imputation method involves filling in NAs with constants,
with a specified single-valued function of the non-NAs, or from
a sample (with replacement) from the non-NA values (this is useful
in multiple imputation).
More complex imputations can be done
with the \code{transcan} function, which also works with the generic methods
shown here, i.e., \code{impute} can take a \code{transcan} object and use  the
imputed values created by \code{transcan} (with \code{imputed=TRUE})  to fill-in NAs.
The \code{print} method places * after variable values that were imputed.
The \code{summary} method summarizes all imputed values and then uses
the next \code{summary} method available for the variable.
The subscript method preserves attributes of the variable and subsets
the list of imputed values corresponding with how the variable was
subsetted.  The \code{is.imputed} function is for checking if observations
are imputed.
}
\usage{
impute(x, ...)

\method{impute}{default}(x, fun=median, ...)

\method{print}{impute}(x, ...)

\method{summary}{impute}(object, ...)

is.imputed(x)
}
\arguments{
\item{x}{
a vector or an object created by \code{transcan}, or a vector needing
basic unconditional imputation.  If there are no \code{NA}s and \code{x}
is a vector, it is returned unchanged.
}
\item{fun}{
the name of a function to use in computing the (single) 
imputed value from the non-NAs.  The default is \code{median}.
If instead of specifying a function as \code{fun}, a single value or vector
(numeric, or character if \code{x} is a factor) is specified,
those values are used for insertion.  \code{fun} can also be the character
string \code{"random"} to draw random values for imputation, with the random
values not forced to be the same if there are multiple NAs.
For a vector of constants, the vector must be of length one
(indicating the same value replaces all NAs) or must be as long as
the number of NAs, in which case the values correspond to consecutive NAs
to replace.  For a factor \code{x}, constants for imputation may include
character values not in the current levels of \code{x}.  In that
case new levels are added.
If \code{x} is of class \code{"factor"}, \code{fun} is ignored and the
most frequent category is used for imputation.
}
\item{object}{an object of class \code{"impute"}}
\item{...}{ignored}
}
\value{
a vector with class \code{"impute"} placed in front of existing classes.
For \code{is.imputed}, a vector of logical values is returned (all
\code{TRUE} if \code{object} is not of class \code{impute}).
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
\email{f.harrell@vanderbilt.edu}
}
\seealso{
\code{\link{transcan}}, \code{\link{impute.transcan}}, \code{\link{describe}}, \code{\link{na.include}}, \code{\link{sample}}
}
\examples{
age <- c(1,2,NA,4)
age.i <- impute(age)
# Could have used impute(age,2.5), impute(age,mean), impute(age,"random")
age.i
summary(age.i)
is.imputed(age.i)
}
\keyword{methods}
\keyword{math}
\keyword{htest}
\keyword{models}

\eof
\name{interaction}
\alias{interaction}
\title{
Compute the Interaction of Several Factors
}
\description{
This is a replacement for the builtin function \code{interaction} that adds
the parameters \code{sep} and \code{left}.
}
\usage{
interaction(\dots, drop=FALSE, sep=".", left=FALSE)
}
\arguments{
\item{...}{
the arguments to \code{interaction} can be either a data frame containing
all the factors to be used, or
all the individual factors.
It will not understand a combination of factors and designs as arguments;
you have to pick one form or the other.
}
\item{drop}{
if \code{TRUE} the levels of the new factor not represented in the data are dropped.
}
\item{sep}{
the separator in creating the level descriptors.
}
\item{left}{
set to \code{TRUE} to left-justify factor levels when constructing label strings
}}
\value{
a new factor, whose levels are all possible combinations of the factors
supplied as arguments. If \code{drop=TRUE}, only the levels represented in the
new factor are retained.  
}
\details{
This is a slight modification of the S-supplied function.  The \code{sep} argument
has been added, a sort step has been added to properly sort the levels of
the created variable, and the \code{left} parameter is added.
}
\examples{
Temp <- c(160, 180, 160, 180, 160, 180, 160, 180)
Conc <- c(20, 20, 40, 40, 20, 20, 40, 40)
interaction(Temp, Conc)
}
\keyword{design}
% Converted by Sd2Rd version 1.21.

\eof
\name{labcurve}
\alias{labcurve}
\alias{putKey}
\alias{putKeyEmpty}
\alias{largest.empty}
\alias{drawPlot}
\alias{plot.drawPlot}
\alias{bezier}
\title{
Label Curves, Make Keys, and Interactively Draw Points and Curves
}
\description{
\code{labcurve} optionally draws a set of curves then labels the curves.
A variety of methods for drawing labels are implemented, ranging
from positioning using the mouse to automatic labeling to automatic
placement of key symbols with manual placement of key legends to
automatic placement of legends.  For automatic positioning of labels
or keys, a curve
is labeled at a point that is maximally separated from all of the
other curves.  Gaps occurring when curves do not start or end at the
same x-coordinates are given preference for positioning labels. If 
labels are offset from the curves (the default behaviour), if the
closest curve to curve i is above curve i, curve i is labeled below
its line.  If the closest curve is below curve i, curve i is labeled
above its line.  These directions are reversed if the resulting labels
would appear outside the plot region.

Both ordinary lines and step functions are handled, and there is an
option to draw the labels at the same angle as the curve within a
local window.

Unless the mouse is used to position labels or plotting symbols are placed
along the curves to distinguish them, curves are examined at 100
(by default) equally spaced points over the range of x-coordinates in the current
plot area.  Linear interpolation is used to get y-coordinates to line
up (step function or constant interpolation is used for step
functions).  There is an option to instead examine all curves at the
set of unique x-coordinates found by unioning the x-coordinates of all
the curves.  This option is especially useful when plotting step
functions.  By setting \code{adj="auto"} you can have \code{labcurve} try to
optimally left- or right-justify labels depending on the slope of the
curves at the points at which labels would be centered (plus a vertical
offset).  This is especially useful when labels must be placed on steep
curve sections.

You can use the \code{on top} method to write (short) curve names directly
on the curves (centered on the y-coordinate).  This is especially
useful when there are many curves whose full labels would run into
each other.  You can plot letters or numbers on the curves, for
example (using the \code{keys} option), and have \code{labcurve} use the \code{key} function to
provide long labels for these short ones (see the end of the example).
There is another option for connecting labels to curves using arrows.
When \code{keys} is a vector of integers, it is taken to represent plotting
symbols (\code{pch}s), and these symbols are plotted at equally-spaced
x-coordinates on each curve (by default, using 5 points per curve).
The points are offset in the x-direction between curves so as to minimize the chance of collisions.

To add a legend defining line types, colors, or line widths with no symbols, 
specify \code{keys="lines"},
e.g., \code{labcurve(curves, keys="lines", lty=1:2)}.

\code{putKey} provides a different way to use \code{key()} by allowing
the user to specify vectors for labels, line types, plotting characters,
etc.  Elements that do not apply (e.g., \code{pch} for lines
(\code{type="l"})) may be \code{NA}.  When a series of points is
represented by both a symbol and a line, the corresponding elements of
both \code{pch} and \code{lty}, \code{col.}, or \code{lwd} will be
non-missing.

\code{putKeyEmpty}, given vectors of all the x-y coordinates that have been
plotted, uses \code{largest.empty} to find the largest empty rectangle large
enough to hold the key, and draws the key using \code{putKey}.

\code{drawPlot} is a simple mouse-driven function for drawing series of
lines, step functions, polynomials, Bezier curves, and points, and
automatically labeling the point groups using \code{labcurve} or
\code{putKeyEmpty}.  When \code{drawPlot} is invoked it creates
temporary functions \code{Points}, \code{Curve}, and \code{Abline} in
the session frame (frame zero).  The user calls these functions inside
the call to \code{drawPlot} to define groups of points in the order they
are defined with the mouse.  \code{Abline} is used to call \code{abline}
and does not actually create a group of points. For some curve types, the
curve generated to represent the corresponding series of points is drawn
after all points are entered for that series, and this curve may be
different than the simple curve obtained by connecting points at the
mouse clicks.  For example, to draw a general smooth Bezier curve the
user need only click on a few points, and she must overshoot the final
curve coordinates to define the curve.  The originally entered points
are not erased once the curve is drawn.  The same goes for step
functions and polynomials.  If you \code{plot()} the object returned by
\code{drawPlot}, however, only final curves will be shown.  The last
examples show how to use \code{drawPlot}.

The \code{largest.empty} function finds the largest rectangle that is large
enough to hold a rectangle of a given height and width, such that the
rectangle does not contain any of a given set of points.  This is
used by \code{labcurve} and \code{putKeyEmpty} to position keys at the most
empty part of an existing plot.
}
\usage{
labcurve(curves, labels=names(curves),
         method=NULL, keys=NULL, keyloc=c("auto","none"),
         type="l", step.type=c("left", "right"), 
         xmethod=if(any(type=="s")) "unique" else "grid", 
         offset=NULL, xlim=NULL,
         tilt=FALSE, window=NULL, npts=100, cex=NULL, 
         adj="auto", angle.adj.auto=30,
         lty=pr$lty, lwd=pr$lwd, col.=pr$col, transparent=TRUE,
         arrow.factor=1, point.inc=NULL, opts=NULL, key.opts=NULL,
         empty.method=c('area','maxdim'), numbins=25, 
         pl=!missing(add), add=FALSE, 
         ylim=NULL, xlab="", ylab="",
         whichLabel=1:length(curves),
         grid=FALSE, xrestrict=NULL, \dots)

putKey(z, labels, type, pch, lty, lwd,
       cex=par('cex'), col=rep(par('col'),nc),
       transparent=TRUE, plot=TRUE, key.opts=NULL, grid=FALSE)

putKeyEmpty(x, y, labels, type=NULL,
            pch=NULL, lty=NULL, lwd=NULL,
            cex=par('cex'), col=rep(par('col'),nc),
            transparent=TRUE, plot=TRUE, key.opts=NULL,
            empty.method=c('area','maxdim'), 
            numbins=25, 
            xlim=pr$usr[1:2], ylim=pr$usr[3:4], grid=FALSE)

drawPlot(\dots, xlim=c(0,1), ylim=c(0,1), xlab='', ylab='',
         ticks=c('none','x','y','xy'),
         key=FALSE, opts=NULL)

# Points(label=' ', type=c('p','r'),
#        n, pch=pch.to.use[1], cex=par('cex'),
#        rug = c('none','x','y','xy'), ymean)

# Curve(label=' ',
#       type=c('bezier','polygon','linear','pol','step','gauss'),
#       n=NULL, lty=1, lwd=par('lwd'), degree=2,
#      evaluation=100, ask=FALSE)

# Abline(\dots)

\method{plot}{drawPlot}(x, file, xlab, ylab, ticks,
     key=x$key, keyloc=x$keyloc, \dots)

largest.empty(x, y, width, height, 
              numbins=25, method=c('area','maxdim'),
              xlim=pr$usr[1:2], ylim=pr$usr[3:4],
              pl=FALSE, grid=FALSE)
}
\arguments{
\item{curves}{
a list of lists, each of which have at least two components: a vector of \code{x}
values and a vector of corresponding \code{y} values.  \code{curves} is
mandatory except when \code{method="mouse"} or \code{"locator"}, in which 
case \code{labels} is mandatory.  Each list in \code{curves} may optionally have
any of the parameters \code{type}, \code{lty}, \code{lwd}, or \code{col} for that curve,
as defined below (see one of the last examples).
}
\item{z}{
a two-element list specifying the coordinate of the center of the key,
e.g. \code{locator(1)} to use the mouse for positioning
}
\item{labels}{
For \code{labcurve}, a vector of character strings used to label curves 
(which may contain newline characters to stack labels vertically).  The
default labels are taken from the names of the \code{curves} list.
Setting \code{labels=FALSE} will suppress drawing any labels (for
\code{labcurve} only). 
For \code{putKey} and \code{putKeyEmpty} is a vector of character strings
specifying group labels
}
\item{x}{
}
\item{y}{
for \code{putKeyEmpty} and \code{largest.empty}, \code{x} and \code{y} are same-length
vectors specifying points that have been plotted.  \code{x} can also be
an object created by \code{drawPlot}.
}
\item{\dots}{
For \code{drawPlot} is a series of invocations of \code{Points} and
\code{Curve} (see example).  Any number of point groups can be defined
in this way.  For \code{Abline} these may be any arguments to
\code{abline}. 
For \code{labcurve}, other parameters to pass to \code{text}.  For
\code{plot.drawPlot} other parameters to pass to \code{setps}.
}
\item{width}{
}
\item{height}{
for \code{largest.empty}, specifies the minimum allowable width in \code{x} units and
the minimum allowable height in \code{y} units
}
\item{method}{
\code{"offset"} (the default) offsets labels at largest gaps between
curves, and draws labels beside curves.  
\code{"on top"} draws labels on top of the curves (especially
good when using keys).  
\code{"arrow"} draws arrows connecting labels to the curves.
\code{"mouse"} or \code{"locator"} positions labels according to mouse clicks.
If \code{keys} is specified and is an integer vector or is \code{"lines"}, 
\code{method} defaults to \code{"on top"}.  If \code{keys} is character,
\code{method} defaults to \code{"offset"}.  Set \code{method="none"} to
suppress all curve labeling and key drawing, which is useful when
\code{pl=TRUE} and you only need \code{labcurve} to draw the curves and the
rest of the basic graph.

For \code{largest.empty} specifies the method determining the best rectangle
among all those that qualify with respect to \code{width} and \code{height}.
Use \code{method="area"} (the default) to find the one having the largest area,
or \code{method="maxdim"} to use the last rectangle searched that had both
the largest width and largest height over all previous rectangles.
}
\item{keys}{
This causes keys (symbols or short text) to be drawn on or beside
curves, and if \code{keyloc} is not equal to \code{"none"}, a legend to be
automatically drawn.  The legend links keys with full curve labels
and optionally with colors and line types.
Set \code{keys} to a vector of character strings, or a
vector of integers specifying plotting character (\code{pch} values -
see \code{points}).  For the latter case, the default behavior is to
plot the symbols periodically, at equally spaced x-coordinates.
}
\item{keyloc}{
When \code{keys} is specified, \code{keyloc} specifies how the legend
is to be positioned for drawing using the \code{key} function in
\code{trellis}.  The default is \code{"auto"}, for which the
\code{largest.empty} function is used to find the most empty part of the
plot.  If no empty rectangle large enough to hold the key is found, no
key will be drawn. Specify \code{keyloc="none"} to suppress drawing a
legend, or set \code{keyloc} to a 2-element list containing the x and y
coordinates for the center of the legend.  For example, use
\code{keyloc=locator(1)} to click the mouse at the center.
\code{keyloc} specifies the coordinates of the center of the
key to be drawn with \code{plot.drawPlot} when \code{key=TRUE}.
}
\item{type}{
for \code{labcurve}, a scalar or vector of character strings specifying the
method that the points in the curves were connected. \code{"l"} means
ordinary connections between points and \code{"s"} means step functions.
For \code{putKey} and \code{putKeyEmpty} is a vector of plotting types, \code{"l"}
for regular line, \code{"p"} for point, \code{"b"} for both point and line, and
\code{"n"} for none.  For \code{Points} is either \code{"p"} (the default) for
regular points, or \code{"r"} for rugplot (one-dimensional scatter diagram
to be drawn using the \code{scat1d} function).  For \code{Curve}, \code{type} is
\code{"bezier"} (the default) for drawing a smooth Bezier curve (which can
represent a non-1-to-1 function such as a circle), \code{"polygon"} for
ordinary line segments, \code{"linear"} for a straight line defined by two
endpoints, \code{"pol"} for a \code{degree}-degree polynomial to be fitted to
the mouse-clicked points, \code{"step"} for a left-step-function, \code{"gauss"}
to plot a Gaussian density fitted to 3 clicked points, or a function
to draw a user-specified function, evaluated at \code{evaluation} points
spanning the whole x-axis.  For the density the user must click in the
left tail, at the highest value (at the mean), and in the right tail,
with the two tail values being approximately equidistant from the
mean.  The density is scaled to fit in the highest value regardless of
its area.
}
\item{step.type}{
type of step functions used (default is \code{"left"})
}
\item{xmethod}{
method for generating the unique set of x-coordinates to examine (see above).  Default is \code{"grid"} for \code{type="l"} or \code{"unique"} for 
\code{type="s"}.
}
\item{offset}{
distance in y-units between the center of the label and the line being
labeled.  Default is 0.75 times the height of an "m" that would be
drawn in a label.  For R grid/lattice you must specify offset using
the \code{grid} \code{unit} function, e.g., \code{offset=unit(2,"native")} or
\code{offset=unit(.25,"cm")} (\code{"native"} means data units)
}
\item{xlim}{
limits for searching for label positions, and is also used to set up
plots when \code{pl=TRUE} and \code{add=FALSE}.  Default is total x-axis
range for current plot (\code{par("usr")[1:2]}).  For
\code{largest.empty}, \code{xlim} limits the search for largest
rectangles, but it has the same default as above. For
\code{pl=TRUE,add=FALSE} you may want to extend \code{xlim} somewhat to
allow large keys to fit, when using \code{keyloc="auto"}.  For
\code{drawPlot} default is \code{c(0,1)}.
}
\item{tilt}{
set to \code{TRUE} to tilt labels to follow the curves, for \code{method="offset"}
when \code{keys} is not given.
}
\item{window}{
width of a window, in x-units, to use in determining the local slope
for tilting labels.  Default is 0.5 times number of characters in the
label times the x-width of an "m" in the current character size and font.
}
\item{npts}{
number of points to use if \code{xmethod="grid"}
}
\item{cex}{
character size to pass to \code{text} and \code{key}.  Default is current
\code{par("cex")}.  For \code{putKey}, \code{putKeyEmpty}, and \code{Points} is the size of the
plotting symbol.
}
\item{adj}{
Default is \code{"auto"} which has \code{labcurve} figure justification
automatically when \code{method="offset"}.  This will cause centering to be used when the local angle
of the curve is less than \code{angle.adj.auto} in absolute value, left
justification if the angle is larger and either the label is under a
curve of positive slope or over a curve of negative slope, and right
justification otherwise.  For step functions, left justification is used
when the label is above the curve and right justification otherwise.
Set \code{adj=.5} to center labels at computed coordinates.  Set to 0 for
left-justification, 1 for right.  Set \code{adj} to a vector to vary adjustments
over the curves.
}
\item{angle.adj.auto}{
see \code{adj}.  Does not apply to step functions.
}
\item{lty}{
vector of line types which were used to draw the curves.
This is only used when keys are drawn. If all of the
line types, line widths, and line colors are the same, 
lines are not drawn in the key.
}
\item{lwd}{
vector of line widths which were used to draw the curves.
This is only used when keys are drawn.  See \code{lty} also.
}
\item{col.}{
}
\item{col}{
vector of integer color numbers for use in curve labels, symbols,
lines, and legends.  Default is \code{par("col")} for all curves.
See \code{lty} also.
}
\item{transparent}{
Default is \code{TRUE} to make \code{key} draw transparent legends, i.e., to
suppress drawing a solid rectangle background for the legend.
Set to \code{FALSE} otherwise.
}
\item{arrow.factor}{
factor by which to multiply default arrow lengths
}
\item{point.inc}{
When \code{keys} is a vector of integers, \code{point.inc} specifies the x-increment
between the point symbols that are overlaid periodically on the curves.  
By default, \code{point.inc} is equal
to the range for the x-axis divided by 5.
}
\item{opts}{
an optional list which can be used to specify any of the options
to \code{labcurve}, with the usual element name abbreviations allowed.
This is useful when \code{labcurve} is being called from another
function.  Example: \code{opts=list(method="arrow", cex=.8, np=200)}.
For \code{drawPlot} a list of \code{labcurve} options to pass as
\code{labcurve(\dots, opts=)}.
}
\item{key.opts}{
a list of extra arguments you wish to pass to \code{key()}, e.g.,
\code{key.opts=list(background=1, between=3)}.  
The argument names must be spelled out in full.
}
\item{empty.method}{
}
\item{numbins}{
These two arguments are passed to the \code{largest.empty} function's
\code{method} and \code{numbins} arguments (see below).
For \code{largest.empty} specifies the number of bins in which to
discretize both the \code{x} and \code{y} directions for searching for
rectangles.  Default is 25.
}
\item{pl}{
set to \code{TRUE} (or specify \code{add}) to cause the curves in \code{curves} to be
drawn, under the control of \code{type},\code{lty},\code{lwd},\code{col} parameters defined
either in the \code{curves} lists or in the separate arguments given to
\code{labcurve} or through \code{opts}.
For \code{largest.empty}, set \code{pl=TRUE} to show the rectangle the function 
found by drawing it with a solid color.
}
\item{add}{
By default, when curves are actually drawn by \code{labcurve} a new plot is
started.  To add to an existing plot, set \code{add=TRUE}.
}
\item{ylim}{
When a plot has already been started, \code{ylim} defaults to \code{par("usr")[3:4]}.
When \code{pl=TRUE}, \code{ylim} and \code{xlim} are determined from the ranges of the data.
Specify \code{ylim} yourself to take control of the plot construction.  
In some cases it is advisable to
make \code{ylim} larger than usual to allow for automatically-positioned keys.
For \code{largest.empty}, \code{ylim} specifies the limits on the y-axis to limit
the search for rectangles.  
Here \code{ylim} defaults to the same as above, i.e., the range
of the y-axis of an open plot from \code{par}.  For \code{drawPlot} the default
is \code{c(0,1)}.
}
\item{xlab}{
}
\item{ylab}{
x-axis and y-axis labels when \code{pl=TRUE} and \code{add=FALSE} or for
\code{drawPlot}.
Defaults to \code{""} unless the first curve has names for its first two
elements, in which case the names of these elements are taken as
\code{xlab} and \code{ylab}.
}
\item{whichLabel}{
  integer vector corresponding to \code{curves} specifying which curves
  are to be labelled or have a legend
  }
\item{grid}{
set to \code{TRUE} if the R \code{grid} package was used to draw the
current plot.  This prevents \code{labcurve} from using
\code{par("usr")} etc.  If using R \code{grid} you can pass coordinates
and lengths having arbitrary units, as documented in the \code{unit}
function.  This is especially useful for \code{offset}.
}
\item{xrestrict}{
  When having \code{labcurve} label curves where they are most
  separated, you can restrict the search for this separation point to a
  range of the x-axis, specified as a 2-vector \code{xrestrict}.  This
  is useful when one part of the curve is very steep.  Even though
  steep regions may have maximum separation, the labels will collide
  when curves are steep.
  }
\item{pch}{
vector of plotting characters for \code{putKey} and \code{putKeyEmpty}.  Can be
any value including \code{NA} when only a line is used to identify the
group.  Is a single plotting character for \code{Points}, with the default
being the next unused value from among 1, 2, 3, 4, 16, 17, 5, 6, 15,
18, 19.
}
\item{file}{
a file name suffix.  If specified, \code{plot.drawPlot} will send its
output to a postscript file "file.ps" using the \code{setps} function to
get nice defaults for inclusion in reports.
}
\item{plot}{
set to \code{FALSE} to keep \code{putKey} or \code{putKeyEmpty} from actually drawing the
key.  Instead, the size of the key will be returned by \code{putKey}, or the
coordinates of the key by \code{putKeyEmpty}.
}
\item{ticks}{
tells \code{drawPlot} for which axes to draw tick marks and tick labels.
Default is \code{"none"}.
}
\item{key}{
for \code{drawPlot} and \code{plot.drawPlot}.  Default is \code{FALSE} so that \code{labcurve}
is used to label points or curves.  Set to \code{TRUE} to use
\code{putKeyEmpty}.}
}
\value{
\code{labcurve} returns an invisible list with components \code{x, y, offset, adj, cex, col}, and if \code{tilt=TRUE}, 
\code{angle}. \code{offset} is the amount to add to \code{y} to draw a label.
\code{offset} is negative if the label is drawn below the line.
\code{adj} is a vector containing the values 0, .5, 1.


\code{largest.empty} returns a list with elements \code{x} and \code{y} specifying the
coordinates of the center of the rectangle which was found.
}
\details{
  The internal functions \code{Points}, \code{Curve}, \code{Abline} have
  unique arguments as follows.
  \describe{
	\item{\code{label}:}{for \code{Points} and \code{Curve} is a single
	  character string to label that group of points}
	\item{\code{n}:}{number of points to accept from the mouse.  Default
	  is to input points until a right mouse click.}
	\item{\code{rug}:}{for \code{Points}.  Default is \code{"none"} to
	  not show the  marginal x or y distributions as rug plots, for the
	  points entered. Other possibilities are used to execute
	  \code{scat1d} to show the marginal distribution of x, y, or both
	  as rug plots.} 
	\item{\code{ymean}:}{for \code{Points}, subtracts a constant from
	  each y-coordinate entered to make the overall mean \code{ymean}}
	\item{\code{degree}:}{degree of polynomial to fit to points by
	  \code{Curve}} 
	\item{\code{evaluation}:}{number of points at which to evaluate
	  Bezier curves, polynomials, and other functions in \code{Curve}}
	\item{\code{ask}:}{set \code{ask=TRUE} to give the user the
	  opportunity to try again at specifying points for Bezier curves,
	  step functions, and polynomials}
  }
  
The \code{labcurve} function used some code from the function \code{plot.multicurve} written
by Rod Tjoelker of The Boeing Company (\code{tjoelker@espresso.rt.cs.boeing.com}).

If there is only one curve, a label is placed at the middle x-value,
and no fancy features such as \code{angle} or positive/negative offsets are
used.

\code{key} is called once (with the argument \code{plot=FALSE}) to find the key
dimensions.  Then an empty rectangle with at least these dimensions is
searched for using \code{largest.empty}.  Then \code{key} is called again to draw
the key there, using the argument \code{corner=c(.5,.5)} so that the center
of the rectangle can be specified to \code{key}.

If you want to plot the data, an easier way to use \code{labcurve} is
through \code{xYplot} as shown in some of its examples.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{approx}}, \code{\link{text}}, \code{\link{legend}}, \code{\link{setps}}, \code{\link{scat1d}}, \code{\link{xYplot}}, \code{\link{abline}}
}
\examples{
n <- 2:8
m <-  length(n)
type <- c('l','l','l','l','s','l','l')
# s=step function l=ordinary line (polygon)
curves <- vector('list', m)


plot(0,1,xlim=c(0,1),ylim=c(-2.5,4),type='n')


set.seed(39)


for(i in 1:m) {
  x <- sort(runif(n[i]))
  y <- rnorm(n[i])
  lines(x, y, lty=i, type=type[i], col=i)
  curves[[i]] <- list(x=x,y=y)
}


labels <- paste('Label for',letters[1:m])
labcurve(curves, labels, tilt=TRUE, type=type, col=1:m)


# Put only single letters on curves at points of 
# maximum space, and use key() to define the letters,
# with automatic positioning of the key in the most empty
# part of the plot
# Have labcurve do the plotting, leaving extra space for key


names(curves) <- labels
labcurve(curves, keys=letters[1:m], type=type, col=1:m,
         pl=TRUE, ylim=c(-2.5,4))


# Put plotting symbols at equally-spaced points,
# with a key for the symbols, ignoring line types


labcurve(curves, keys=1:m, lty=1, type=type, col=1:m,
         pl=TRUE, ylim=c(-2.5,4))




# Plot and label two curves, with line parameters specified with data
set.seed(191)
ages.f <- sort(rnorm(50,20,7))
ages.m <- sort(rnorm(40,19,7))
height.f <- pmin(ages.f,21)*.2+60
height.m <- pmin(ages.m,21)*.16+63


labcurve(list(Female=list(ages.f,height.f,col=2),
              Male  =list(ages.m,height.m,col=3,lty='dashed')),
         xlab='Age', ylab='Height', pl=TRUE)
# add ,keys=c('f','m') to label curves with single letters
# For S-Plus use lty=2


# Plot power for testing two proportions vs. n for various odds ratios, 
# using 0.1 as the probability of the event in the control group.  
# A separate curve is plotted for each odds ratio, and the curves are
# labeled at points of maximum separation


n  <- seq(10, 1000, by=10)
OR <- seq(.2,.9,by=.1)
pow <- lapply(OR, function(or,n)list(x=n,y=bpower(p1=.1,odds.ratio=or,n=n)),
              n=n)
names(pow) <- format(OR)
labcurve(pow, pl=TRUE, xlab='n', ylab='Power')


# Plot some random data and find the largest empty rectangle
# that is at least .1 wide and .1 tall


x <- runif(50)
y <- runif(50)
plot(x, y)
z <- largest.empty(x, y, .1, .1)
z
points(z,pch=3)  # mark center of rectangle, or
#key(z$x, z$y, \dots stuff for legend)




# Use the mouse to draw a series of points using one symbol, and
# two smooth curves or straight lines (if two points are clicked), 
# none of these being labeled


# d <- drawPlot(Points(), Curve(), Curve())
# plot(d, file='/tmp/z')  # send result to /tmp/z.ps


\dontrun{
# Use the mouse to draw a Gaussian density, two series of points
# using 2 symbols, one Bezier curve, a step function, and raw data
# along the x-axis as a 1-d scatter plot (rug plot).  Draw a key.
# The density function is fit to 3 mouse clicks
# Abline draws a dotted horizontal reference line
d <- drawPlot(Curve('Normal',type='gauss'),
              Points('female'), Points('male'), 
              Curve('smooth',ask=TRUE,lty=2), Curve('step',type='s',lty=3), 
              Points(type='r'), Abline(h=.5, lty=2),
              xlab='X', ylab='y', xlim=c(0,100), key=TRUE)
plot(d, ylab='Y')
plot(d, key=FALSE)  # label groups using labcurve
}
}
\keyword{hplot}
\keyword{aplot}
\keyword{dplot}
\keyword{iplot}
% Converted by Sd2Rd version 1.21.





\eof
\name{label}
\alias{label}
\alias{label<-}
\alias{labelPlotmath}
\alias{[.labelled}
\alias{print.labelled}
\alias{Label}
\alias{Label.data.frame}
\alias{llist}
\alias{plotmathTranslate}
\alias{as.data.frame.labelled}
\alias{data.frame.labelled}
\alias{reLabelled}
\title{
Label Attribute of an Object
}
\description{
\code{label(x)} retrieves the \code{label} attribute of \code{x}.
\code{label(x) <- "a label"} stores the label attribute, and also puts
the class \code{labelled} as the first class of \code{x} (for S-Plus 5
and later this class is not used and methods for handling this class are
not defined so the \code{"label"} and \code{"units"} attributes are lost
upon subsetting).  The reason for having this class is so that the
subscripting method for \code{labelled}, \code{[.labelled}, can preserve
the \code{label} attribute in R and S-Plus 2000.  Also, the \code{print}
method for \code{labelled} objects prefaces the print with the object's
\code{label} (and \code{units} if there).  If the variable is also given
a \code{"units"} attribute using the \code{units} function, subsetting
the variable (using \code{[.labelled}) will also retain the
\code{"units"} attribute.

\code{label} can optionally append a \code{"units"} attribute to the
string, and it can optionally return a string or expression (for \R's
\code{plotmath} facility) suitable for plotting.  \code{labelPlotmath}
is a function that provides the same capability when the input arguments are
the \code{'label'} and \code{'units'} rather than a vector having those
attributes.  When \code{plotmath} mode is used to construct labels, the
\code{'label'} or \code{'units'} may contain math expressions but they
are typed verbatim if they contain percent signs, blanks, or underscores.

\code{Label} (actually \code{Label.data.frame}) is a function which generates
S-Plus source code that makes the labels in all the variables in a data
frame easy to edit. 

\code{llist} is like \code{list} except that it preserves the names or
labels of the component variables in the variables' \code{label}
attribute.  This can be useful when looping over variables or using
\code{sapply} or \code{lapply}. By using \code{llist} instead of
\code{list} one can annotate the output with the current variable's name
or label.  \code{llist} also defines a \code{names} attribute for the
list and pulls the \code{names} from the arguments' expressions for
non-named arguments.

\code{plotmathTranslate} is a simple function that translates certain
character strings to character strings that can be used as part of \R
\code{plotmath} expressions.  If the input string has a space or percent
inside, the string is surrounded by a call to \code{plotmath}'s
\code{paste} function.

\code{as.data.frame.labelled} is a utility function that is called by
\code{[.data.frame}.  It is just a copy of \code{as.data.frame.vector}.
\code{data.frame.labelled} is another utility function, that adds a
class \code{"labelled"} to every variable in a data frame that has a
\code{"label"} attribute but not a \code{"labelled"} class.

\code{reLabelled} is used to add a \code{'labelled'} class back to
variables in a data frame that have a 'label' attribute but no 'labelled'
oldClass.  Useful for changing \code{cleanup.import()}'d S-Plus 6 data
frames back to general form for \R and S-Plus 2000.
}
\usage{
label(x, units=FALSE, plot=FALSE, default=NULL, grid=FALSE)

label(x) <- value

labelPlotmath(label, units=NULL, plotmath=.R., grid=FALSE)

\method{print}{labelled}(x, \dots)   ## or x - calls print.labelled

Label(object, \dots)

\method{Label}{data.frame}(object, file='', append=FALSE, \dots)

llist(\dots, labels=TRUE)

plotmathTranslate(x)

data.frame.labelled(object)

reLabelled(object)
}
\arguments{
\item{x}{
any object (for \code{plotmathTranslate} is a character string)
}
\item{units}{
  set to \code{TRUE} to append the \code{'units'} attribute (if present)
  to the returned label.  The \code{'units'} are surrounded
  by brackets.  For \code{labelPlotmath} is a character string
  containing the units of measurement.
}
\item{plot}{
  set to \code{TRUE} to return a label suitable for \R's \code{plotmath}
  facility (returns an expression instead of a character string) if R is
  in effect.  If \code{units} is also \code{TRUE}, and if both
  \code{'label'} and \code{'units'} attributes are present, the
  \code{'units'} will appear after the label but in smaller type and
  will not be surrounded by brackets.
}
\item{default}{
  if \code{x} does not have a \code{'label'} attribute and
  \code{default} (a character string) is specified, the label will be
  taken as \code{default}
  }
\item{grid}{
  Currently \R's \code{lattice} and \code{grid} functions do not support
  \code{plotmath} expressions for \code{xlab} and \code{ylab}
  arguments.  When using \code{lattice} functions in \R, set the
  argument \code{grid} to \code{TRUE} so that \code{labelPlotmath} can
  return an ordinary character string instead of an expression.
  }
\item{label}{a character string containing a variable's label}
\item{plotmath}{
  set to \code{TRUE} to have \code{labelPlotmath} return an expression
  for plotting using \R's \code{plotmath} facility.  If \R is not in
  effect, an ordinary character string is returned.
  }
\item{value}{
the label of the object, or "".
}
\item{object}{
a data frame
}
\item{\dots}{
a list of variables or expressions to be formed into a \code{list}.
Ignored for \code{print.labelled}.
}
\item{file}{
the name of a file to which to write S-Plus source code.  Default is
\code{""}, meaning standard output.
}
\item{append}{
set to \code{TRUE} to append code generated by \code{Label} to file \code{file}
}
\item{labels}{
set to \code{FALSE} to make \code{llist} ignore the variables' \code{label} attribute and
use the variables' names.
}}
\value{
\code{label} returns the label attribute of x, if any; otherwise, "".  
\code{label} is used
most often for the individual variables in data frames.  The function
\code{sas.get} copies labels over from SAS if they exist.
}
\seealso{
\code{\link{sas.get}}, \code{\link{describe}}
}
\examples{
age <- c(21,65,43)
y   <- 1:3
label(age) <- "Age in Years"
plot(age, y, xlab=label(age))


x1 <- 1:10
x2 <- 10:1
label(x2) <- 'Label for x2'
units(x2) <- 'mmHg'
x2
x2[1:5]
dframe <- data.frame(x1, x2)
Label(dframe)


##In these examples of llist, note that labels are printed after
##variable names, because of print.labelled
a <- 1:3
b <- 4:6
label(b) <- 'B Label'
llist(a,b)
llist(a,b,d=0)
llist(a,b,0)


w <- llist(a, b>5, d=101:103)
sapply(w, function(x){
  hist(as.numeric(x), xlab=label(x))
  # locator(1)   ## wait for mouse click
})

# Or: for(u in w) {hist(u); title(label(u))}
}
\keyword{interface}
\keyword{misc}
\keyword{utilities}
% Converted by Sd2Rd version 1.21.

\eof
\name{latex}
\alias{latex}
\alias{latex.default}
\alias{latex.function}
\alias{latex.list}
\alias{latexTranslate}
\alias{latexSN}
\alias{latexVerbatim}
\alias{dvi}
\alias{print.dvi}
\alias{dvi.latex}
\alias{dvips}
\alias{dvips.latex}
\alias{dvips.dvi}
\alias{dvigv}
\alias{dvigv.latex}
\alias{dvigv.dvi}
\alias{print.latex}
\alias{show.latex}
\alias{show.dvi}
\title{
Convert an S object to LaTeX, and Related Utilities
}
\description{
\code{latex} converts its argument to a \code{.tex} file appropriate
for inclusion in a LaTeX2e document.  \code{latex} is a generic
function that calls one of \code{latex.default},
\code{latex.function}, \code{latex.list}. 

\code{latex.default}
does appropriate rounding and decimal alignment and produces a
file containing a LaTeX tabular environment to print the matrix or data.frame
\code{x} as a table.

\code{latex.function} prepares an S function for printing by issuing \code{sed}
commands that are similar to those in the
\code{S.to.latex} procedure in the \code{s.to.latex} package (Chambers and Hastie, 1993).

\code{latex.list} calls \code{latex} recursively for each element in the argument.

\code{latexTranslate} translates particular items in character
strings to LaTeX format, e.g., makes \code{a^2 = a$^2$} for superscript within
variable labels.  Math mode is inserted as needed.
\code{latexTranslate} assumes that input text always has matches,
e.g. \code{[) [] (] ()}, and that surrounding  by \code{$$} is OK.

\code{latexSN} converts a vector of floating point numbers to character
strings using LaTeX exponents.  Dollar signs to enter math mode are not
added.

\code{latexVerbatim} on an object executes the object's \code{print} method,
capturing the output for a file inside a LaTeX verbatim environment.

\code{dvi} uses the system \code{latex} command to compile LaTeX code produced
by \code{latex}, including any needed styles.  \code{dvi}
will put a documentclass\{report\} and end\{document\} wrapper
around a file produced by \code{latex}.  By default, the \code{geometry} LaTeX package is
used to omit all margins and to set the paper size to a default of
5.5in wide by 7in tall.  The result of \code{dvi} is a .dvi file.  To both
format and screen display a non-default size, use for example
\code{print(dvi(latex(x), width=3, height=4),width=3,height=4)}.  Note that
you can use something like \code{xdvi -geometry 460x650 -margins 2.25in
file} without changing LaTeX defaults to emulate this.

\code{dvips} will use the system \code{dvips} command to print the .dvi file to
the default system printer, or create a postscript file if \code{file}
is specified.

\code{dvigv} uses the system \code{latex} command to convert the input object
to a .dvi file, and uses the system \code{dvips} command to convert it to
postscript.  Then the postscript file is displayed using Ghostview
(assumed to be the system command \code{gv}).

There are \code{show} methods for displaying typeset LaTeX
on the screen using the system \code{xdvi}
command.   If you \code{show} a LaTeX file created by
\code{latex} without running it through \code{dvi} using
\code{show.dvi(object)}, the 
\code{show} method will run it through \code{dvi} automatically.
These \code{show} 
methods are not S Version 4 methods so you have to use full names such
as \code{show.dvi} and \code{show.latex}.  Use the \code{print} methods for
more automatic display of typesetting, e.g. typing \code{latex(x)} will
invoke xdvi to view the typeset document.
}
\usage{
latex(object, title=first.word(deparse(substitute(object))), \dots)

\method{latex}{default}(object,
    title=first.word(deparse(substitute(object))),
    file=paste(title, ".tex", sep=""),
    append=FALSE, 
    label=title,
    rowlabel=title, rowlabel.just="l",
    cgroup=NULL, n.cgroup=NULL, rgroup=NULL, n.rgroup=NULL,
    rowname, cgroup.just=rep("c",length(n.cgroup)),
    colheads=dimnames(cx)[[2]],
    extracolheads=NULL, extracolsize='scriptsize',
    dcolumn=FALSE, numeric.dollar=!dcolumn,
    cdot=FALSE, longtable=FALSE, draft.longtable=TRUE,
    ctable=FALSE, booktabs=FALSE,
    table.env=TRUE, here=FALSE, lines.page=40,
    caption=NULL, caption.lot=NULL, caption.loc=c('top','bottom'),
    double.slash=FALSE,
    vbar=FALSE, collabel.just=rep("c",nc), na.blank=TRUE,
    insert.bottom=NULL, first.hline.double=!(booktabs|ctable),
    where='!tbp', size=NULL,
    center=c('center','centering','none'),
    landscape=FALSE,
    multicol=TRUE,
    \dots) # x is a matrix or data.frame

\method{latex}{function}(
	object,
	title=first.word(deparse(substitute(object))),
	file=paste(title, ".tex", sep=""),
	append=FALSE,
	assignment=TRUE,  type=c('example','verbatim'), \dots)

\method{latex}{list}(
           object,
           title=first.word(deparse(substitute(object))),
           file=paste(title, ".tex", sep=""),
           append=FALSE,
           label,
           caption,
           caption.lot,
           caption.loc=c('top','bottom'),
           \dots)

\method{print}{latex}(x, ...)

latexTranslate(object, inn=NULL, out=NULL, pb=FALSE, \dots)

latexSN(x)

latexVerbatim(x, title=first.word(deparse(substitute(x))),
    file=paste(title, ".tex", sep=""),
    append=FALSE, size=NULL, hspace=NULL,
    width=.Options$width, length=.Options$length, \dots)

dvi(object, \dots)
\method{dvi}{latex}(object, prlog=FALSE, nomargins=TRUE, width=5.5, height=7, \dots)
\method{print}{dvi}(x, \dots)
dvips(object, \dots)
\method{dvips}{latex}(object, \dots)
\method{dvips}{dvi}(object, file, \dots)
\method{show}{latex}(object)  # or show.dvi(object) or just object
dvigv(object, \dots)
\method{dvigv}{latex}(object, \dots)       # or gvdvi(dvi(object))
\method{dvigv}{dvi}(object, \ldots)
}
\arguments{
\item{object}{
For \code{latex}, any S object.  For \code{dvi} or \code{dvigv}, an object
created by \code{latex}.  For \code{latexTranslate} is a vector of
character strings to translate.
}
\item{x}{
any object to be \code{print}ed verbatim for \code{latexVerbatim}.  For
\code{latexSN} \code{x} is a numeric vector.
}
\item{title}{
name of file to create without the \code{.tex} extension.
}
\item{file}{
name of the file to create.  The default file name is \code{x.tex} where
\code{x} is the first word in the name of the argument for \code{x}.
Set \code{file=""} to have the generated LaTeX code just printed to
standard output.  This is especially useful when running under Sweave in
R using its \code{results=tex} tag, to save having to manage many
small external files.  When \code{file=""}, \code{latex} keeps track of
LaTeX styles that are called for by creating or modifying an object
\code{latexStyles} (in \code{.GlobalTemp} in R or in frame 0 in
S-Plus).  \code{latexStyles} is a vector containing the base names of
all the unique LaTeX styles called for so far in the current session.
See the end of the examples section for a way to use this object to good
effect.  For \code{dvips}, \code{file} is the name of an output
postscript file.
}
\item{append}{
defaults to \code{FALSE}. Set to \code{TRUE} to append output to an existing file.
}
\item{label}{
a text string representing a symbolic label for the table for referencing
in the LaTeX \code{\\label} and \code{\\ref} commands.
\code{label} is only used if \code{caption} is given.
}
\item{rowlabel}{
If \code{x} has row dimnames, \code{rowlabel} is a character string containing the
column heading for the row dimnames. The default is the name of the
argument for \code{x}.
}
\item{rowlabel.just}{
If \code{x} has row dimnames, specifies the justification for printing them.
Possible values are \code{"l"}, \code{"r"}, \code{"c"}. The heading (\code{rowlabel}) itself
is left justified if \code{rowlabel.just="l"}, otherwise it is centered.
}
\item{cgroup}{
a vector of character strings defining major column headings. The default is
to have none.
}
\item{n.cgroup}{
a vector containing the number of columns for which each element in
cgroup is a heading.  For example, specify \code{cgroup=c("Major 1","Major 2")},
\code{n.cgroup=c(3,3)} if \code{"Major 1"} is to span columns 1-3 and \code{"Major 2"} is
to span columns 4-6.  \code{rowlabel} does not count in the column numbers.
You can omit \code{n.cgroup} if all groups have the same number of columns.
}
\item{rgroup}{
a vector of character strings containing headings for row groups.
\code{n.rgroup} must be present when \code{rgroup} is given. The first \code{n.rgroup[1]}
rows are sectioned off and \code{rgroup[1]} is used as a bold heading for
them. The usual row dimnames (which must be present if \code{rgroup} is) are 
indented. The next \code{n.rgroup[2]} rows are treated likewise, etc.
}
\item{n.rgroup}{
integer vector giving the number of rows in each grouping. If \code{rgroup}
is not specified, \code{n.rgroup} is just used to divide off blocks of
rows by horizontal lines. If \code{rgroup} is given but \code{n.rgroup} is omitted,
\code{n.rgroup} will default so that each row group contains the same number
of rows.
}
\item{na.blank}{
Set to \code{TRUE} to use blanks rather than \code{NA} for missing values.
This usually looks better in \code{latex}.
}
\item{insert.bottom}{
  an optional character string to typeset at the bottom of the table.
  For \code{"ctable"} style tables, this is placed in an unmarked footnote.
}
\item{first.hline.double}{
set to \code{FALSE} to use single horizontal rules for styles other than
\code{"booktabs"} or \code{"ctable"}
}
\item{rowname}{
rownames for \code{tabular} environment.  Default is rownames of matrix or
data.frame.
}
\item{cgroup.just}{
justification for labels for column groups.  Defaults to \code{"c"}.
}
\item{colheads}{a character vector of column headings if you don't want
  to use \code{dimnames(object)[[2]]}.  Specify \code{colheads=NULL} to
  suppress column headings.}
\item{extracolheads}{
  an optional vector of extra column headings that will appear under the
  main headings (e.g., sample sizes).  This character vector does not
  need to include an empty space for any \code{rowname} in effect, as
  this will be added automatically.  You can also form subheadings by
  splitting character strings defining the column headings using the
  usual backslash \code{n} newline character.}
\item{extracolsize}{
  size for \code{extracolheads} or for any second lines in column names;
  default is \code{"scriptsize"} 
  }
\item{dcolumn}{see \code{\link{format.df}}}
\item{numeric.dollar}{
logical, default \code{!dcolumn}.  Set to \code{TRUE} to place dollar
signs around numeric values when \code{dcolumn=FALSE}.  This 
assures that \code{latex} will use minus signs rather than hyphens to indicate
negative numbers.  Set to \code{FALSE} when \code{dcolumn=TRUE}, as
\code{dcolumn.sty} automatically uses minus signs.
}
\item{cdot}{see \code{\link{format.df}}}
\item{longtable}{
Set to \code{TRUE} to use David Carlisle's LaTeX \code{longtable} style, allowing
long tables to be split over multiple pages with headers repeated on
each page.
The \code{"style"} element is set to \code{"longtable"}. The \code{latex} \code{\\usepackage}
must reference \code{[longtable]}.
The file \code{longtable.sty} will
need to be in a directory in your \code{$TEXINPUTS} path.
}
\item{draft.longtable}{
I forgot what this does.
}
\item{ctable}{
  set to \code{TRUE} to use Wybo Dekker's \code{ctable} style from
  \code{CTAN}.  Even though for historical reasons it is not the
  default, it is generally the preferred method.  Thicker but not
  doubled \code{hline}s are used to start a table when \code{ctable} is
  in effect.
  }
\item{booktabs}{
set \code{booktabs=TRUE} to use the \code{booktabs} style of horizontal
rules for better tables.  In this case, double \code{hline}s are not
used to start a table.
}
\item{table.env}{
Set \code{table.env=FALSE} to suppress enclosing the table in a LaTeX
\code{table} environment.  \code{table.env} only applies when
\code{longtable=FALSE}.  You may not specify a \code{caption} if
\code{table.env=FALSE}.
}
\item{here}{
Set to \code{TRUE} if you are using \code{table.env=TRUE} with \code{longtable=FALSE} and you
have installed David Carlisle's \code{here.sty} LaTeX style. This will cause
the LaTeX \code{table} environment to be set up with option \code{H} to guarantee
that the table will appear exactly where you think it will in the text.
The \code{"style"} element is set to \code{"here"}. The \code{latex} \code{\\usepackage}
must reference \code{[here]}.  The file \code{here.sty} will
need to be in a directory in your \code{$TEXINPUTS} path.  \code{here} is
largely obsolete with LaTeX2e.
}
\item{lines.page}{
Applies if \code{longtable=TRUE}. No more than \code{lines.page} lines in the body
of a table will be placed on a single page. Page breaks will only
occur at \code{rgroup} boundaries.
}
\item{caption}{
a text string to use as a caption to print at the top of the first
page of the table. Default is no caption.
}
\item{caption.lot}{
a text string representing a short caption to be used in the "List of Tables".
By default, LaTeX will use \code{caption}.  If you get inexplicable \code{latex} errors,
you may need to supply \code{caption.lot} to make the errors go away.
}
\item{caption.loc}{set to \code{"bottom"} to position a caption below
the table instead of the default of \code{"top"}.}
\item{double.slash}{
set to \code{TRUE} to output \code{\\} as \code{\\\\} in LaTeX commands. Useful when you
are reading the output file back into an S vector for later output.
}
\item{vbar}{
logical. When \code{vbar==TRUE}, columns in the tabular environment are separated with
vertical bar characters.  When \code{vbar==FALSE}, columns are separated with white
space.  The default, \code{vbar==FALSE}, produces tables consistent with the style sheet
for the Journal of the American Statistical Association.
}
\item{collabel.just}{
justification for column labels.
}
\item{assignment}{
logical.  When \code{TRUE}, the default, the name of the function and the assignment
arrow are printed to the file.
}
\item{where}{
specifies placement of floats if a table environment is used.  Default
is \code{"!tbp"}.  To allow tables to appear in the middle of a page of
text you might specify \code{where="!htbp"} to \code{latex.default}.
}
\item{size}{
size of table text if a size change is needed (default is no change).
For example you might specify \code{size="small"} to use LaTeX font size
"small".
}
\item{center}{
default is \code{"center"} to enclose the table in a \code{center}
environment.  Use \code{center="centering"} to instead use a LaTeX
\code{centering} directive, or \code{center="none"} to use no
centering.  This option was implemented by Markus Jantti
\email{markus.jantti@iki.fi} of Abo Akademi University.
}
\item{landscape}{
 set to \code{TRUE} to enclose the table in a \code{landscape}
 environment.  When \code{ctable} is \code{TRUE}, will use the
 \code{rotate} argument to \code{ctable}.
}
\item{type}{
The default uses the S \code{Example} environment for \code{latex.function},
assuming you have installed \code{S.sty} in a location that the system
latex command automatically accesses.  Set \code{type="verbatim"} to
instead use the LaTeX \code{verbatim} environment.
}
\item{\dots}{
other arguments are accepted and ignored except that \code{latex}
passes arguments to \code{format.df}.  For \code{latexVerbatim} these
arguments are passed to the \code{print} function.  Ignored for
\code{latexTranslate}.
}
\item{inn, out}{
specify additional input and translated strings over the usual
defaults
}
\item{pb}{
If \code{pb=TRUE}, \code{latexTranslate} also translates \code{[()]} to math mode using
\code{\\left, \\right}.
}
\item{hspace}{
horizontal space, e.g., extra left margin for verbatim text.  Default
is none.  Use e.g. \code{hspace="10ex"} to add 10 extra spaces to the left
of the text.
}
\item{length}{for S-Plus only; is the length of the output page for
  printing and capturing verbatim text}
\item{width, height}{
the \code{options( )} to have in effect only while \code{print} is
executed.  Defaults are current \code{options}.  For \code{dvi} these specify
the paper width and height in inches if \code{nomargins=TRUE}, with
defaults of 5.5 and 7, respectively.
}
\item{prlog}{
set to \code{TRUE} to have \code{dvi} print, to the S-Plus session, the LaTeX .log
file.
}
\item{multicol}{
set to \code{FALSE} to not use \code{\\multicolumn} in header
of table
}

\item{nomargins}{
set to \code{FALSE} to use default LaTeX margins when making the .dvi file
}
}
\value{
\code{latex} and \code{dvi} return a
list of class \code{latex} or \code{dvi} containing character string
elements \code{file} and \code{style}.  \code{file} contains the name of the
generated file, and \code{style} is a vector (possibly empty) of styles to
be included using the LaTeX2e \code{\\usepackage} command.

\code{latexTranslate} returns a vector of character strings
}

\section{Side Effects}{
creates various system files and runs various Linux/UNIX system
commands which are assumed to be in the system path.
}
\details{
  If running under Windows and using MikTeX, \code{latex} and \code{yap}
  must be in your system path, and \code{yap} is used to browse
  \code{.dvi} files created by \code{latex}.  You should install the
  \code{geometry} and \code{ctable} styles in MikTeX to make optimum use
  of \code{latex()}.

  If running S-Plus and your directory for temporary files is not
  \code{/tmp} (Unix/Linux) or \code{\\windows\\temp} (Windows), add your
  own \code{tempdir} function such as \code{
	tempdir <- function() "/yourmaindirectory/yoursubdirectory"}
  }
\author{
Frank E. Harrell, Jr.,
\cr
Department of Biostatistics,
\cr
Vanderbilt University,
\cr
\code{f.harrell@vanderbilt.edu}


Richard M. Heiberger,
\cr
Department of Statistics,
\cr
Temple University, Philadelphia, PA.
\cr
\code{rmh@astro.ocis.temple.edu}


}
\seealso{
\code{\link{html}}, \code{\link{format.df}}
}
\examples{
\dontrun{
x <- matrix(1:6, nrow=2, dimnames=list(c('a','b'),c('c','d','e\nLine 2')))
latex(x)   # creates x.tex in working directory
w <- latex(x, file='/tmp/my.tex')
d <- dvi(w)  # compile LaTeX document, make .dvi
             # latex assumed to be in path
d            # or show(d) : run xdvi (assumed in path) to display
w            # or show(w) : run dvi then xdvi
dvips(d)     # run dvips to print document
dvips(w)     # run dvi then dvips
latex(x, file="")   # just write out LaTeX code to screen

# After running latex( ) multiple times with different special styles in
# effect, make a file that will call for the needed LaTeX packages when
# latex is run (especially when using Sweave with R)
if(exists('latexStyles'))
  cat(paste('\\usepackage{',latexStyles,'}',sep=''),
      file='stylesused.tex', sep='\n')
# Then in the latex job have something like:
# \documentclass{article}
# \input{stylesused}
# \begin{document}
# ...
}
}
\keyword{utilities}
\keyword{interface}
\keyword{methods}
\keyword{file}
\keyword{character}
\keyword{manip}

\eof
\name{ldBands}
\alias{ldBands}
\alias{summary.ldBands}
\alias{print.ldBands}
\alias{plot.ldBands}
\alias{print.summary.ldBands}
\title{Group Sequential Boundaries using the Lan-DeMets Approach}
\description{
This function computes and plots group sequential stopping boundaries
from the Lan-DeMets method with a variety of \eqn{\alpha}-spending
functions using the \code{ld98} program from the Department of
Biostatistics, University of Wisconsin written by DM Reboussin, DL
DeMets, KM Kim, and KKG Lan.  Such stopping boundaries are
useful for early termination of clinical trials for safety problems or
for efficacy.  Simple \code{plot} and \code{print} methods are
implemented.  Simple sample size and minimally detectable effect sizes
given sample sizes may be obtained with a \code{summary} method if
\code{power} was specified to \code{ldBands}.  Alternatively,
\code{summary} computes, for each look, the difference in means that
must be achieved to cross a boundary if \code{n} and \code{sd} are
specified, or the minimum difference in proportions and the odds ratios
that must be achieved to cross a boundary if \code{n} and \code{pbar}
are specified.
}
\usage{
ldBands(n = length(times), times = NULL, alpha = 0.05, sided = 2,
        alphaLower=alpha/2, alphaUpper=alpha/2,
        information = NULL,
        spending=c('OBrien-Fleming','Pocock','alpha*t^phi',
                   'Hwang-Shih-DeCani'),
        phi=1,
        spending2=c('OBrien-Fleming','Pocock','alpha*t^phi',
                    'Hwang-Shih-DeCani'),
        phi2=phi,
        truncate = Inf, power = NULL, pr = TRUE)
\method{print}{ldBands}(x, \dots)
\method{plot}{ldBands}(x, xlab='Time', ylab='Z', actual=NULL,
        type='b', labels=NULL, \dots)

\method{summary}{ldBands}(object, stdiff=NULL, n=NULL,
        p1=NULL, p2=NULL,  hr=NULL, events=NULL,
        pbar=NULL, sd=NULL, \dots)
\method{print}{summary.ldBands}(x, \dots)
}
\arguments{
  \item{n}{number of interim analyses.  If \code{times} is given, is
	automatically taken as the length of \code{times}.
  For \code{summary.ldBands}, \code{n} is the sample size, to obtain
  detectable standardized difference.}
  \item{times}{times at which interim analyses are done}
  \item{alpha}{overall \eqn{\alpha} level for the multiple tests.
	Default is \code{0.05}.  If \code{sided=3} is the \eqn{\alpha}-level
  for the lower bounds, otherwise is the total \eqn{\alpha}.}
  \item{sided}{set to \code{1} to use a one-sided test, \code{3} for
	asymmetric two-sided bounds}
  \item{alphaLower}{\eqn{\alpha}-level for lower bound if
	\code{sided=3}.  Defaults to \eqn{\alpha/2}.  When \code{sided=3}
	\code{alpha} is recalculated from \code{alphaLower+alphaUpper}.}
  \item{alphaUpper}{\eqn{\alpha}-level for upper bound if
	\code{sided=3}.  Defaults to \eqn{\alpha/2}.}
  \item{information}{a vector of information times if different from
	\code{times}.  Used for computing covariances of test statistics.}
  \item{spending}{an \eqn{\alpha} spending function.  Default is
	O'Brien-Fleming function.  If \code{sided=3} is the spending
	function for the lower bounds.}
  \item{phi}{parameter for the third and fourth types of spending
	functions (exponent of time for the third, default is \code{1})}
  \item{spending2}{spending function for the upper bounds if
	\code{sided=3}.  Defaults to same spending function for lower
	bounds.}
  \item{phi2}{parameter for third and fourth spending functions if
	\code{sided=3}.  Default is \code{phi}.}
  \item{truncate}{a value at which to truncate \eqn{Z} critical values
	so that early looks will have some power without really affecting
	the overall \eqn{\alpha} level.  Default is no truncation.}
  \item{power}{A power for which to compute a drift parameter; useful in
  sample size calculations}
  \item{pr}{set to \code{FALSE} to suppress the actual output of the
	\code{ld98} program}
  \item{x}{an object created by \code{ldBands} or \code{summary.ldBands}}
  \item{xlab}{\eqn{x}-axis label}
  \item{ylab}{\eqn{y}-axis label}
  \item{actual}{an optional list with two components: times and \eqn{z}
	values to add as points to the first plot}
  \item{type}{default is \code{"b"} causing both points and lines to be
	drawn for boundaries.  Use \code{type="l"} for example to suppress
	drawing circles at points.}
  \item{labels}{an optional character vector to be used to label time
    points corresponding to those generated by \code{ldBands}}
  \item{object}{an object created by \code{ldBands}}
  \item{stdiff}{standardized difference to detect}
  \item{p1}{probability of event in group 1}
  \item{p2}{probability of event in group 2, to compare with \code{p1}
	in order to obtain sample size for two-sample binomial}
  \item{hr}{hazard ratio to detect, to obtain needed number of events at
	end of study for either treatment arm using the logrank test}
  \item{events}{number of events per treatment arm at end of study, to
	obtain detectable hazard ratio}
  \item{pbar}{mean of actual probabilities of events in two treatment
	arms}
  \item{sd}{standard deviation of an observation}
  \item{\dots}{unused}
}
\details{
This function assumes that you have stored the \code{ld98} executable
in a subdirectory that is in your system path.  Obtain the \code{ld98}
program from the URL given in the reference below.

The \code{plot} method makes two plots if \code{power} is specified, the
second containing exit and cumulative exit probabilities as a function
of time.  If \code{par('mfrow')} is not already set to something besides
\code{c(1,1)}, \code{par(mfrow=c(2,1))} is set temporarily.
}
\value{
  A list of class \code{"ldBands"} with the following components.  When
  \code{summary} is used and \code{n} and either \code{pbar} or
  \code{sd} are given, \code{summary} returns the same object returned
  by \code{ldBands} but with possible components
  \code{diff.lower,diff.upper} (needed difference in means or proportions) and
  \code{or.lower,or.upper} (needed odds ratios).
  \item{data}{a data frame containing the main calculations}
  \item{power}{power specified, if any}
  \item{drift}{drift calculated, if \code{power} specified}
  \item{type}{\code{"boundaries"} if \code{power} not specified,
	\code{"power"} otherwise}
  \item{n}{number of interim analyses, for \code{ldBands}.  For
	\code{summary} is the number of observations per arm, and it must be
	a vector with length equal to the number of looks if \code{pbar} or
	\code{sd} are given.}
  \item{alpha}{input value of \code{alpha}}
  \item{sided}{\code{1-3}}
  \item{alphaLower}{lower \eqn{\alpha}}
  \item{alphaUpper}{upper \eqn{\alpha}}
  \item{spending}{name of \eqn{\alpha} spending function used}
  \item{phi}{parameter for spending function}
  \item{spending2}{name of spending function for upper boundary.
	Defaults to \code{spending}}
  \item{phi2}{parameter for second spending function.  Defaults to
	\code{phi} but is ignored if \code{spending2} is the first or second
	type of spending function.}
  \item{truncate}{value at which \eqn{Z} statistics truncated (default
	is \code{Inf})}
  }
  \references{
	Reboussin DM, DeMets DL, Kim K-M, Lan KKG (1996): Programs
  for computing group sequential boundaries using the Lan-DeMets
  method.  \url{http://www.medsch.wisc.edu/landemets}

  Reboussin DM, DeMets DL, Kim K, Lan KKG (2000): Computations for group
  sequential boundaries using the Lan-DeMets spending function method.
  Controlled Clinical Trials 21:190-207.
  }
\author{Frank E Harrell Jr}
\seealso{\code{\link{gbayes}}}

\examples{
\dontrun{
# Get boundaries for O'Brien-Fleming spending function, 5 looks, alpha=.05
b <- ldBands(5, pr=FALSE)
plot(b)
# Same but times are irregular, and information times are different than
# test times.  Use Pocock spending function.
b <- ldBands(times=      c(.4, .6, .8, .9, .95),
             information=c(.42,.65,.83,.89,.94), spending='Pocock')

# Get power calculations
u <- ldBands(times=c(.4, .6, .8, .9, .95),  power=.9)
u$drift                  # standardized difference * sqrt(n per arm)
                         # needed to provide power=.9
summary(u, n=50)         # obtain detectable standardized difference
summary(u, p1=.4, p2=.5) # get sample size per arm, two-sample binomial
summary(u, hr=1.5)       # get number of events per arm needed
                         # to detect a hazard ratio of 1.5

# Asymmetric boundaries with different spending functions, truncate
b <- ldBands(5, sided=3, spending='alpha*t^phi', phi=1, phi2=1.5,
             alphaLower=.01, alphaUpper=.04, truncate=4)
b
plot(b)
# Compute differences in proportions and odds ratios needed to cross
# the boundaries, given a mean probability in two treatment arms of 0.1
# and given a vector of sample sizes per arm corresponding to the looks
s <- summary(b, n=seq(200,1000,by=200), pbar=.1)
s
d <- s$data
plot(d$n, d$or.lower, xlab='N Per Arm',
     ylab='Critical Odds Ratio', type='b',
     ylim=range(d$or.lower, d$or.upper), log='y')
lines(d$n, d$or.upper, type='b')
abline(h=1, lty=2)
}
}
\keyword{distribution}
\keyword{htest}
\keyword{design}

\eof
\name{list.tree}
\alias{list.tree}
\title{
Pretty-print the Structure of a Data Object
}
\description{
This is a function to pretty-print the structure of any data object
(usually a list).  It is similar to the R function \code{str}.
}
\usage{
list.tree(struct, depth=-1, numbers=FALSE, maxlen=22, maxcomp=12, 
          attr.print=TRUE, front="", fill=". ", name.of, size=TRUE)
}
\arguments{
\item{struct}{
The object to be displayed
}
\item{depth}{
Maximum depth of recursion (of lists within lists \dots) to be printed; negative
value means no limit on depth.
}
\item{numbers}{
If TRUE, use numbers in leader  instead  of  dots  to
represent position in structure.
}
\item{maxlen}{
Approximate maximum length (in characters) allowed on each line to give the
first few values of a vector.  maxlen=0 suppresses printing any values.
}
\item{maxcomp}{
Maximum number of components of any list that will be described.
}
\item{attr.print}{
Logical flag, determining whether a description of attributes will be printed.
}
\item{front}{
Front material of a line, for internal use.
}
\item{fill}{
Fill character used for each level of indentation.
}
\item{name.of}{
Name of object, for internal use (deparsed version  of  struct  by  default). 
}
\item{size}{
Logical flag, should the size of the object in bytes be printed?
}}
\details{
A description of the structure of \code{struct} will be printed in outline
form, with indentation
for each level of recursion, showing the internal storage mode, length,
class(es) if any, attributes, and first few elements of each data vector.
By default each level of list recursion is indicated by a "." and 
attributes by "A".
}
\seealso{
\code{\link{str}}
}
\examples{
X <- list(a=ordered(c(1:30,30:1)),b=c("Rick","John","Allan"),
          c=diag(300),e=cbind(p=1008:1019,q=4))
list.tree(X)
# In R you can say str(X)
}
\author{
Alan Zaslavsky, zaslavsk@hcp.med.harvard.edu
}
\keyword{documentation}
% Converted by Sd2Rd version 1.21.



\eof
\name{mgp.axis}
\alias{mgp.axis}
\alias{mgp.axis.labels}
\title{Draw Axes With Side-Specific mgp Parameters}
\description{
\code{mgp.axis} is a version of \code{axis} that uses the appropriate
side-specific \code{mgp} parameter (see \code{\link{par}}) to account
for different space requirements for axis labels vertical vs. horizontal
tick marks.  \code{mgp.axis} also fixes a bug in \code{axis(2,\dots)}
that causes it to assume \code{las=1}.

\code{mgp.axis.labels} is used so that different spacing between tick
marks and axis tick mark labels may be specified for x- and y-axes.  Use
\code{mgp.axis.labels('default')} to set defaults. Users can set values
manually using \code{mgp.axis.labels(x,y)} where \code{x} and \code{y}
are 2nd value of \code{par('mgp')} to use.  Use
\code{mgp.axis.labels(type=w)} to retrieve values, where \code{w='x'},
\code{'y'}, \code{'x and y'}, \code{'xy'}, to get 3 \code{mgp} values
(first 3 types) or 2 \code{mgp.axis.labels}.
}
\usage{
mgp.axis(side, at = NULL, \dots,
         mgp = mgp.axis.labels(type = if (side == 1 | side == 3) "x"
                               else "y"),
         axistitle = NULL)

mgp.axis.labels(value,type=c('xy','x','y','x and y'))
}

\arguments{
  \item{side, at}{see \code{\link{axis}}}
  \item{\dots}{arguments passed through to \code{\link{axis}}}
  \item{mgp}{see \code{\link{par}}}
  \item{axistitle}{if specified will cause \code{axistitle} to be drawn
	on the appropriate axis as a title}
  \item{value}{vector of values to which to set system option
	\code{mgp.axis.labels}}
  \item{type}{see above}
}
\section{Side Effects}{\code{mgp.axis.labels} stores the value in the
  system option \code{mgp.axis.labels}}
\value{
  \code{mgp.axis.labels} returns the value of \code{mgp} (only the
  second element of \code{mgp} if \code{type="xy"} or a list with
  elements \code{x} and \code{y} if \code{type="x and y"}, each list
  element being a 3-vector) for the 
  appropriate axis if \code{value} is not specified, otherwise it
  returns nothing but the system option \code{mgp.axis.labels} is set.

  \code{mgp.axis} returns nothing.
}
\author{Frank Harrell}
\seealso{\code{\link{par}}}
\examples{
\dontrun{
mgp.axis.labels(type='x')  # get default value for x-axis
mgp.axis.labels(type='y')  # get value for y-axis
mgp.axis.labels(type='xy') # get 2nd element of both mgps
mgp.axis.labels(type='x and y')  # get a list with 2 elements
mgp.axis.labels(c(3,.5,0), type='x')  # set
options('mgp.axis.labels')            # retrieve

plot(..., axes=FALSE)
mgp.axis(1, axistitle="X Label")
mgp.axis(2, axistitle="Y Label")

}}
\keyword{iplot}
\keyword{dplot}
\keyword{environment}

\eof
\name{minor.tick}
\alias{minor.tick}
\title{
Minor Tick Marks
}
\description{
Adds minor tick marks to an existing plot.  All minor tick marks that
will fit on the axes will be drawn.
}
\usage{
minor.tick(nx=2, ny=2, tick.ratio=0.5)
}
\arguments{
\item{nx}{
number of intervals in which to divide the area between major tick marks on
the X-axis.  Set to 1 to suppress minor tick marks.
}
\item{ny}{
same as \code{nx} but for the Y-axis
}
\item{tick.ratio}{
ratio of lengths of minor tick marks to major tick marks.  The length
of major tick marks is retrieved from \code{par("tck")}.
}}
\section{Side Effects}{
plots
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{axis}}
}
\examples{
plot(runif(20),runif(20))
minor.tick()
}
\keyword{aplot}
\keyword{hplot}
% Converted by Sd2Rd version 1.21.

\eof
\name{mtitle}
\alias{mtitle}
\title{
Margin Titles
}
\description{
Writes overall titles and subtitles after a multiple image plot is drawn.
If \code{par()$oma==c(0,0,0,0)}, \code{title} is used instead of \code{mtext}, to draw
titles or subtitles that are inside the plotting region for a single plot.
}
\usage{
mtitle(main, ll, lc,  
       lr=if(.R.) format(Sys.time(),'\%d\%b\%y') else
        if(under.unix)unix("date '+\%d\%h\%y'") else date(), 
       cex.m=1.75, cex.l=.5, \dots)
}
\arguments{
\item{main}{
main title to be centered over entire figure, default is none
}
\item{ll}{
subtitle for lower left of figure, default is none
}
\item{lc}{
subtitle for lower center of figure, default is none
}
\item{lr}{
subtitle for lower right of figure, default is today's date in format
23Jan91 for UNIX or R (Thu May 30 09:08:13 1996 format for Windows). 
Set to \code{""} to suppress lower right title.
}
\item{cex.m}{
character size for main, default is 1.75
}
\item{cex.l}{
character size for subtitles
}
\item{\dots}{
other arguments passed to \code{mtext}
}}
\value{
nothing
}
\author{
Frank Harrell
\cr
Department of Biostatistics, Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\section{Side Effects}{
plots
}
\seealso{
\code{\link{par}}, \code{\link{mtext}}, \code{\link{title}}, \code{\link{unix}}, \code{\link{pstamp}}
}
\examples{
#Set up for 1 plot on figure, give a main title,
#use date for lr
plot(runif(20),runif(20))
mtitle("Main Title")


#Set up for 2 x 2 matrix of plots with a lower left subtitle and overall title
par(mfrow=c(2,2), oma=c(3,0,3,0))
plot(runif(20),runif(20))
plot(rnorm(20),rnorm(20))
plot(exp(rnorm(20)),exp(rnorm(20)))
mtitle("Main Title",ll="n=20")
}
\keyword{hplot}
% Converted by Sd2Rd version 1.21.

\eof
\name{na.delete}
\alias{na.delete}
\title{
Row-wise Deletion na.action
}
\description{
Does row-wise deletion as \code{na.omit} does, but adds frequency of missing values
for each predictor
to the \code{"na.action"} attribute of the returned model frame.
Optionally stores further details if \code{options(na.detail.response=TRUE)}. 
}
\usage{
na.delete(frame)
}
\arguments{
\item{frame}{
a model frame
}}
\value{
a model frame with rows deleted and the \code{"na.action"} attribute added.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{na.omit}}, \code{\link{na.keep}}, \code{\link{na.detail.response}}, \code{\link{model.frame.default}},
\code{\link{naresid}}, \code{\link{naprint}}
}
\examples{
# options(na.action="na.delete")
# ols(y ~ x)
}
\keyword{models}
% Converted by Sd2Rd version 1.21.

\eof
\name{na.detail.response}
\alias{na.detail.response}
\title{
Detailed Response Variable Information
}
\description{
This function is called by certain \code{na.action} functions if
\code{options(na.detail.response=TRUE)} is set.  By default, this function
returns a matrix of counts of non-NAs and the mean of the response variable
computed separately by whether or not each predictor is NA.  The default
action uses the last column of a \code{Surv} object, in effect computing the
proportion of events.  Other summary functions may be specified by
using \code{options(na.fun.response="name of function")}.
}
\usage{
na.detail.response(mf)
}
\arguments{
\item{mf}{
a model frame
}}
\value{
a matrix, with rows representing the different statistics that are
computed for the response, and columns representing the different
subsets for each predictor (NA and non-NA value subsets).
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{na.omit}}, \code{\link{na.delete}}, \code{\link{model.frame.default}}, 
\code{\link{naresid}}, \code{\link{naprint}}, \code{\link{describe}}
}
\examples{
# sex
# [1] m f f m f f m m m m m m m m f f f m f m
# age
# [1] NA 41 23 30 44 22 NA 32 37 34 38 36 36 50 40 43 34 22 42 30
# y
# [1] 0 1 0 0 1 0 1 0 0 1 1 1 0 0 1 1 0 1 0 0
# options(na.detail.response=TRUE, na.action="na.delete", digits=3)
# lrm(y ~ age*sex)
#
# Logistic Regression Model
# 
# lrm(formula = y ~ age * sex)
#
#
# Frequencies of Responses
#   0 1 
#  10 8
#
# Frequencies of Missing Values Due to Each Variable
#  y age sex 
#  0   2   0
#
#
# Statistics on Response by Missing/Non-Missing Status of Predictors
#
#     age=NA age!=NA sex!=NA Any NA  No NA 
#   N    2.0  18.000   20.00    2.0 18.000
# Mean    0.5   0.444    0.45    0.5  0.444
#
# \dots\dots
# options(na.action="na.keep")
# describe(y ~ age*sex)
# Statistics on Response by Missing/Non-Missing Status of Predictors
#
#      age=NA age!=NA sex!=NA Any NA  No NA 
#    N    2.0  18.000   20.00    2.0 18.000
# Mean    0.5   0.444    0.45    0.5  0.444
#
# \dots
# options(na.fun.response="table")  #built-in function table()
# describe(y ~ age*sex)
#
# Statistics on Response by Missing/Non-Missing Status of Predictors
#
#   age=NA age!=NA sex!=NA Any NA No NA 
# 0      1      10      11      1    10
# 1      1       8       9      1     8
#
# \dots
}
\keyword{models}
\keyword{regression}
% Converted by Sd2Rd version 1.21.

\eof
\name{na.keep}
\alias{na.keep}
\title{
Do-nothing na.action
}
\description{
Does not delete rows containing NAs, but does add details concerning
the distribution of the response variable if \code{options(na.detail.response=TRUE)}.
This \code{na.action} is primarily for use with \code{describe.formula}. 
}
\usage{
na.keep(mf)
}
\arguments{
\item{mf}{
a model frame
}}
\value{
the same model frame with the \code{"na.action"} attribute


}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{na.omit}}, \code{\link{na.delete}}, \code{\link{model.frame.default}}, \code{\link{na.detail.response}},
\code{\link{naresid}}, \code{\link{naprint}}, \code{\link{describe}}
}
\examples{
options(na.action="na.keep", na.detail.response=TRUE)
x1 <- runif(20)
x2 <- runif(20)
x2[1:4] <- NA
y <- rnorm(20)
describe(y ~ x1*x2)
}
\keyword{models}
% Converted by Sd2Rd version 1.21.

\eof
\name{panel.bpplot}
\alias{panel.bpplot}
\alias{bpplt}
\title{
Box-Percentile Panel Function for Trellis
}
\description{
For all their good points, box plots have a high ink/information ratio
in that they mainly display 3 quartiles.  Many practitioners have
found that the "outer values" are difficult to explain to
non-statisticians and many feel that the notion of "outliers" is too
dependent on (false) expectations that data distributions should be Gaussian.

\code{panel.bpplot} is a \code{panel} function for use with \code{trellis}, especially for
\code{bwplot}.  It draws box plots (without the whiskers) with any number
of user-specified "corners" (corresponding to different quantiles),
but it also draws box-percentile plots similar to those drawn by
Jeffrey Banfield's (umsfjban@bill.oscs.montana.edu) \code{bpplot} function.
To quote from Banfield, "box-percentile plots supply more
information about the univariate distributions.  At any height the
width of the irregular 'box' is proportional to the percentile of that
height, up to the 50th percentile, and above the 50th percentile the
width is proportional to 100 minus the percentile.  Thus, the width at
any given height is proportional to the percent of observations that
are more extreme in that direction.  As in boxplots, the median, 25th
and 75th percentiles are marked with line segments across the box."

\code{panel.bpplot} is a generalization of \code{bpplot} and
\code{panel.bwplot} in 
that it works with \code{trellis} (making the plots horizontal so that
category labels are more visible), it allows the user to specify the
quantiles to connect and those for which to draw reference lines, 
and it displays means (by default using dots).

\code{bpplt} draws horizontal box-percentile plots much like those drawn
by \code{panel.bpplot} but taking as the starting point a matrix
containing quantiles summarizing the data.  \code{bpplt} is primarily
intended to be used internally by \code{plot.summary.formula.reverse}
but when used with no arguments has a general purpose: to draw an
annotated example box-percentile plot with the default quantiles used
and with the mean drawn with a solid dot.  This schematic plot is
rendered nicely in postscript with an image height of 3.5 inches.
}
\usage{
panel.bpplot(x, y, box.ratio=1, means=TRUE, qref=c(.5,.25,.75),
             probs=c(.05,.125,.25,.375), nout=0,
             datadensity=FALSE, scat1d.opts=NULL,
             font=box.dot$font, pch=box.dot$pch, 
             cex =box.dot$cex,  col=box.dot$col, \dots)

# E.g. bwplot(formula, panel=panel.bpplot, panel.bpplot.parameters)

bpplt(stats, xlim, xlab='', box.ratio = 1, means=TRUE,
      qref=c(.5,.25,.75), qomit=c(.025,.975),
      pch=16, cex.labels=par('cex'), cex.points=if(prototype)1 else 0.5,
      grid=FALSE)
}
\arguments{
\item{x}{
continuous variable whose distribution is to be examined
}
\item{y}{
grouping variable
}
\item{box.ratio}{
see \code{panel.bwplot}
}
\item{means}{
set to \code{FALSE} to suppress drawing a character at the mean value
}
\item{qref}{
vector of quantiles for which to draw reference lines.  These do not
need to be included in \code{probs}.
}
\item{probs}{
vector of quantiles to display in the box plot.  These should all be
less than 0.5; the mirror-image quantiles are added automatically.  By
default, \code{probs} is set to \code{c(.05,.125,.25,.375)} so that intervals
contain 0.9, 0.75, 0.5, and 0.25 of the data.
To draw all 99 percentiles, i.e., to draw a box-percentile plot,
set \code{probs=seq(.01,.49,by=.01)}.
To make a more traditional box plot, use \code{probs=.25}.
}
\item{nout}{
tells the function to use \code{scat1d} to draw tick marks showing the
\code{nout} smallest and \code{nout} largest values if \code{nout >= 1}, or to
show all values less than the \code{nout} quantile or greater than the
\code{1-nout} quantile if \code{0 < nout <= 0.5}.  If \code{nout} is a whole number,
only the first \code{n/2} observations are shown on either side of the
median, where \code{n} is the total number of observations. 
}
\item{datadensity}{
set to \code{TRUE} to invoke \code{scat1d} to draw a data density (one-dimensional
scatter diagram or rug plot) inside each box plot.
}
\item{scat1d.opts}{
a list containing named arguments (without abbreviations) to pass to
\code{scat1d} when \code{datadensity=TRUE} or \code{nout > 0}
}
\item{font}{}
\item{pch}{}
\item{cex}{}
\item{col}{see \code{\link{panel.bwplot}}}
\item{\dots}{arguments passed to \code{points}}
\item{stats}{}
\item{xlim}{}
\item{xlab}{}
\item{qomit}{}
\item{cex.labels}{}
\item{cex.points}{}
\item{grid}{undocumented arguments to \code{bpplt}}
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University School of Medicine
\cr
f.harrell@vanderbilt.edu
}
\references{
Esty, W. W. and Banfield, J. D. (1992)
"The Box-Percentile Plot,"
Technical Report (May 15, 1992),
Department of Mathematical Sciences,
Montana State University.
}
\seealso{
\code{\link{bpplot}}, \code{\link{panel.bwplot}}, \code{\link{scat1d}}, \code{\link{quantile}}, \code{\link{ecdf}}
}
\examples{
set.seed(13)
x <- rnorm(1000)
g <- sample(1:6, 1000, replace=TRUE)
x[g==1][1:20] <- rnorm(20)+3   # contaminate 20 x's for group 1


# default trellis box plot
if(.R.) library(lattice)
bwplot(g ~ x)


# box-percentile plot with data density (rug plot)
bwplot(g ~ x, panel=panel.bpplot, probs=seq(.01,.49,by=.01), datadensity=TRUE)
# add ,scat1d.opts=list(tfrac=1) to make all tick marks the same size
# when a group has > 125 observations


# small dot for means, show only .05,.125,.25,.375,.625,.75,.875,.95 quantiles
bwplot(g ~ x, panel=panel.bpplot, cex=.3)


# suppress means and reference lines for lower and upper quartiles
bwplot(g ~ x, panel=panel.bpplot, probs=c(.025,.1,.25), means=FALSE, qref=FALSE)


# continuous plot up until quartiles ("Tootsie Roll plot")
bwplot(g ~ x, panel=panel.bpplot, probs=seq(.01,.25,by=.01))


# start at quartiles then make it continuous ("coffin plot")
bwplot(g ~ x, panel=panel.bpplot, probs=seq(.25,.49,by=.01))


# same as previous but add a spike to give 0.95 interval
bwplot(g ~ x, panel=panel.bpplot, probs=c(.025,seq(.25,.49,by=.01)))


# decile plot with reference lines at outer quintiles and median
bwplot(g ~ x, panel=panel.bpplot, probs=c(.1,.2,.3,.4), qref=c(.5,.2,.8))


# default plot with tick marks showing all observations outside the outer
# box (.05 and .95 quantiles), with very small ticks
bwplot(g ~ x, panel=panel.bpplot, nout=.05, scat1d.opts=list(frac=.01))


# show 5 smallest and 5 largest observations
bwplot(g ~ x, panel=panel.bpplot, nout=5)


# Use a scat1d option (preserve=TRUE) to ensure that the right peak extends 
# to the same position as the extreme scat1d
bwplot(~x , panel=panel.bpplot, probs=seq(.00,.5,by=.001), 
       datadensity=TRUE, scat1d.opt=list(preserve=TRUE))

# Draw a prototype showing how to interpret the plots
bpplt()

# make a local copy of bwplot that always uses panel.bpplot (S-Plus only)
# bwplot$panel <- panel.bpplot
# bwplot(g ~ x, nout=.05)
}
\keyword{nonparametric}
\keyword{hplot}
\keyword{distribution}
\concept{trellis}
\concept{lattice}

\eof
\name{pc1}
\alias{pc1}
\title{First Principal Component}
\description{
Given a numeric matrix which may or may not contain \code{NA}s,
\code{pc1} standardizes the columns to have mean 0 and variance 1 and
computes the first principal component using \code{\link{prcomp}}.  The
proportion of variance explained by this component is printed, and so
are the coefficients of the original (not scaled) variables.  These
coefficients may be applied to the raw data to obtain the first PC.
}
\usage{
pc1(x, hi)
}
\arguments{
  \item{x}{numeric matrix}
  \item{hi}{if specified, the first PC is scaled so that its maximum
	value is \code{hi} and its minimum value is zero}
}
\value{
  The vector of observations with the first PC.  An attribute
  \code{"coef"} is attached to this vector.  \code{"coef"} contains the
  raw-variable coefficients.
  }
\author{Frank Harrell}
\seealso{\code{\link{prcomp}}}
\examples{
set.seed(1)
x1 <- rnorm(100)
x2 <- x1 + rnorm(100)
w <- pc1(cbind(x1,x2))
attr(w,'coef')
}
\keyword{multivariate}

\eof
\name{plotCorrPrecision}
\alias{plotCorrPrecision}
\title{Plot Precision of Estimate of Pearson Correlation Coefficient}
\description{
This function plots the precision (margin of error) of the
product-moment linear 
correlation coefficient r vs. sample size, for a given vector of
correlation coefficients \code{rho}.  Precision is defined as the larger
of the upper confidence limit minus rho and rho minus the lower confidence
limit.  \code{labcurve} is used to automatically label the curves.
}
\usage{
plotCorrPrecision(rho = c(0, 0.5), n = seq(10, 400, length = 100),
                  conf.int = 0.95)
}
\arguments{
  \item{rho}{single or vector of true correlations.  A worst-case
	precision graph results from rho=0}
  \item{n}{vector of sample sizes to use on the x-axis}
  \item{conf.int}{confidence coefficient; default uses 0.95 confidence limits}
}
\author{Xing Wang and Frank Harrell}
\seealso{\code{\link{rcorr}},\code{\link{cor}},\code{\link{cor.test}}}
\examples{
plotCorrPrecision()
plotCorrPrecision(rho=0)
}
\keyword{htest}

\eof
\name{plsmo}
\alias{plsmo}
\alias{panel.plsmo}
\title{
Plot smoothed estimates
}
\description{

  Plot smoothed estimates of x vs. y, handling missing data for lowess
  or supsmu, and adding axis labels.  Optionally suppresses plotting
  extrapolated estimates.  An optional \code{group} variable can be
  specified to compute and plot the smooth curves by levels of
  \code{group}.  When \code{group} is present, the \code{datadensity}
  option will draw tick marks showing the location of the raw
  \code{x}-values, separately for each curve.  \code{plsmo} has an
  option to plot connected points for raw data, with no smoothing.

\code{panel.plsmo} is a \code{panel} function for \code{trellis} for the
\code{xyplot} function that uses \code{plsmo} and its options to draw
one or more nonparametric function estimates on each panel.  This has
advantages over using \code{xyplot} with \code{panel.xyplot} and
\code{panel.loess}: (1) by default it will invoke \code{labcurve} to
label the curves where they are most separated, (2) the
\code{datadensity} option will put rug plots on each curve (instead of a
single rug plot at the bottom of the graph), and (3) when
\code{panel.plsmo} invokes \code{plsmo} it can use the "super smoother"
(\code{supsmu} function) instead of \code{lowess}.  \code{panel.plsmo}
senses when a \code{group} variable is specified to \code{xyplot} so
that it can invoke \code{panel.superpose} instead of
\code{panel.xyplot}.  Using \code{panel.plsmo} through \code{trellis}
has some advantages over calling \code{plsmo} directly in that
conditioning variables are allowed and \code{trellis} uses nicer fonts
etc.

When a \code{group} variable was used, \code{panel.plsmo} creates a function
\code{Key} in the session frame that the user can invoke to draw a key for
individual data point symbols used for the \code{group}s.  
By default, the key is positioned at the upper right
corner of the graph.  If \code{Key(locator(1))} is specified, the key will
appear so that its upper left corner is at the coordinates of the
mouse click.
}
\usage{
plsmo(x, y, method=c("lowess","supsmu","raw"), xlab, ylab, 
      add=FALSE, lty=1:nlev, col=par("col"), lwd=par("lwd"),
      iter=if(length(unique(y))>2) 3 else 0, bass=0, trim, 
      fun, group, prefix, xlim, ylim, 
      label.curves=TRUE, datadensity=FALSE, lines.=TRUE, subset=TRUE,
      grid=FALSE, \dots)


#To use panel function:
#xyplot(formula=y ~ x | conditioningvars, groups,
#       panel=panel.plsmo, type='b', 
#       label.curves=TRUE,
#       lwd = superpose.line$lwd, 
#       lty = superpose.line$lty, 
#       pch = superpose.symbol$pch, 
#       cex = superpose.symbol$cex, 
#       font = superpose.symbol$font, 
#       col = NULL, \dots)
}
\arguments{
\item{x}{
vector of x-values, NAs allowed
}
\item{y}{
vector of y-values, NAs allowed
}
\item{method}{
"lowess" (the default), "supsmu", or "raw" to not smooth at all
}
\item{xlab}{
x-axis label, used only if \code{add=FALSE}.  Defaults to \code{label(x)} or the argument name.
}
\item{ylab}{
y-axis label, like xlab.
}
\item{add}{
Set to T to call lines instead of plot.  Assumes axes already labeled.
}
\item{lty}{
line type, default=1,2,3,\dots, corresponding to \code{group}
}
\item{col}{
color for each curve, corresponding to \code{group}.  Default is
current \code{par("col")}. 
}
\item{lwd}{
vector of line widths for the curves, corresponding to \code{group}.
Default is current \code{par("lwd")}. 
\code{lwd} can also be specified as an element of \code{label.curves} if
\code{label.curves} is a list.
}
\item{iter}{
iter parameter if method="lowess", default=0 if \code{y} is binary, and 3 otherwise.
}
\item{bass}{
bass parameter if method="supsmu", default=0.
}
\item{trim}{
only plots smoothed estimates between trim and 1-trim quantiles
of x.  Default is to use 10th smallest to 10th largest x in the group if the number of observations in the group exceeds 200 (0 otherwise).
Specify trim=0 to plot over entire range.
}
\item{fun}{
after computing the smoothed estimates, if \code{fun} is given the y-values
are transformed by \code{fun()}
}
\item{group}{
a variable, either a \code{factor} vector or one that will be converted to
\code{factor} by \code{plsmo}, that is used to stratify the data so that separate
smooths may be computed
}
\item{prefix}{
a character string to appear in front of group labels.  The presence of
\code{prefix} ensures that \code{labcurve} will be called even when \code{add=TRUE}.
}
\item{xlim}{
a vector of 2 x-axis limits.  Default is observed range.
}
\item{ylim}{
a vector of 2 y-axis limits.  Default is observed range.
}
\item{label.curves}{
set to \code{FALSE} to prevent \code{labcurve} from being called to label multiple
curves corresponding to \code{group}s.  Set to a list to pass options to
\code{labcurve}.  \code{lty} and \code{col} are passed to \code{labcurve} automatically.
}
\item{datadensity}{
set to \code{TRUE} to draw tick marks on each curve, using x-coordinates
of the raw data \code{x} values.  This is done using \code{scat1d}.
}
\item{lines.}{
set to \code{FALSE} to suppress smoothed curves from being drawn.  This can
make sense if \code{datadensity=TRUE}.
}
\item{subset}{
a logical or integer vector specifying a subset to use for processing,
with respect to all variables being analyzed
}
\item{grid}{
  set to \code{TRUE} if the \R \code{grid} package drew the current plot}
\item{\dots}{
  optional arguments that are passed to \code{scat1d},
  or optional parameters to pass to \code{plsmo} from
  \code{panel.plsmo}.  See optional arguments for \code{plsmo} above.
}
\item{type}{
set to \code{p} to have \code{panel.plsmo} plot points (and not call \code{plsmo}), 
\code{l} to call \code{plsmo} and not plot points, or use the default \code{b} to plot both.
}
\item{pch}{}
\item{cex}{}
\item{font}{
vectors of graphical parameters corresponding to the \code{group}s (scalars
if \code{group} is absent).  By default, the parameters set up by
\code{trellis} will be used.
}
}
\value{
\code{plsmo} returns a list of curves (x and y coordinates) that was passed to \code{labcurve}
}
\section{Side Effects}{
plots, and \code{panel.plsmo} creates the \code{Key} function in the session frame.
}
\seealso{
\code{\link{lowess}}, \code{\link{supsmu}}, \code{\link{label}}, \code{\link{quantile}}, \code{\link{labcurve}}, \code{\link{scat1d}},
\code{\link[lattice]{xyplot}}, \code{\link{panel.superpose}}, \code{\link[lattice]{panel.xyplot}}
}
\examples{
set.seed(1)
x <- 1:100
y <- x + runif(100, -10, 10)
plsmo(x,y,"supsmu",xlab="Time of Entry") 
#Use label(y) or "y" for ylab


plsmo(x,y,add=TRUE,lty=2)
#Add lowess smooth to existing plot, with different line type


age <- rnorm(500, 50, 15)
survival.time <- rexp(500)
sex <- sample(c('female','male'), 500, TRUE)
race <- sample(c('black','non-black'), 500, TRUE)
plsmo(age, survival.time < 1, fun=qlogis, group=sex) # plot logit by sex


#Plot points and smooth trend line using trellis 
# (add type='l' to suppress points or type='p' to suppress trend lines)
if(.R.) library(lattice)
xyplot(survival.time ~ age, panel=panel.plsmo)


#Do this for multiple panels
xyplot(survival.time ~ age | sex, panel=panel.plsmo)


#Do this for subgroups of points on each panel, show the data
#density on each curve, and draw a key at the default location
xyplot(survival.time ~ age | sex, groups=race, panel=panel.plsmo,
       datadensity=TRUE)
Key()


#Use wtd.loess.noiter to do a fast weighted smooth
plot(x, y)
lines(wtd.loess.noiter(x, y))
lines(wtd.loess.noiter(x, y, weights=c(rep(1,50), 100, rep(1,49))), col=2)
points(51, y[51], pch=18)   # show overly weighted point
#Try to duplicate this smooth by replicating 51st observation 100 times
lines(wtd.loess.noiter(c(x,rep(x[51],99)),c(y,rep(y[51],99)),
      type='ordered all'), col=3)
#Note: These two don't agree exactly
}
\keyword{smooth}
\keyword{nonparametric}
\keyword{hplot}
\concept{trellis}
\concept{lattice}



\eof
\name{popower}
\alias{popower}
\alias{posamsize}
\alias{print.popower}
\alias{print.posamsize}
\title{Power and Sample Size for Ordinal Response}
\description{
\code{popower} computes the power for a two-tailed two sample comparison
of ordinal outcomes under the proportional odds ordinal logistic
model.  The power is the same as that of the Wilcoxon test but with
ties handled properly.  \code{posamsize} computes the total sample size
needed to achieve a given power.  Both functions compute the efficiency
of the design compared with a design in which the response variable
is continuous.  \code{print} methods exist for both functions.  Any of the
input arguments may be vectors, in which case a vector of powers or
sample sizes is returned.  These functions use the methods of
Whitehead (1993).
}
\usage{
popower(p, odds.ratio, n, n1, n2, alpha=0.05)
\method{print}{popower}(x, \dots)
posamsize(p, odds.ratio, fraction=.5, alpha=0.05, power=0.8)
\method{print}{posamsize}(x, \dots)
}
\arguments{
\item{p}{
a vector of marginal cell probabilities which must add up to one.
The \code{i}th element specifies the probability that a patient will be in response level
\code{i}, averaged over the two treatment groups.
}
\item{odds.ratio}{
the odds ratio to be able to detect.  It doesn't
matter which group is in the numerator.
}
\item{n}{
total sample size for \code{popower}.  You must specify either \code{n} or
\code{n1} and \code{n2}.  If you specify \code{n}, \code{n1} and \code{n2} are set to \code{n/2}.
}
\item{n1}{
for \code{popower}, the number of subjects in treatment group 1
}
\item{n2}{
for \code{popower}, the number of subjects in group 2
}
\item{alpha}{
type I error
}
\item{x}{an object created by \code{popower} or \code{posamsize}}
\item{fraction}{
for \code{posamsize}, the fraction of subjects that will be allocated to group 1
}
\item{power}{
for \code{posamsize}, the desired power (default is 0.8)
}
\item{\dots}{unused}
}
\value{
a list containing \code{power} and \code{eff} (relative efficiency) for \code{popower},
or containing \code{n} and \code{eff} for \code{posamsize}.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University School of Medicine
\cr
f.harrell@vanderbilt.edu
}
\references{
Whitehead J (1993): Sample size calculations for ordered categorical
data.  Stat in Med 12:2257--2271.


Julious SA, Campbell MJ (1996): Letter to the Editor.  Stat in Med 15:
1065--1066.  Shows accuracy of formula for binary response case.
}
\seealso{
\code{\link{bpower}}, \code{\link{cpower}}
}
\examples{
#For a study of back pain (none, mild, moderate, severe) here are the
#expected proportions (averaged over 2 treatments) that will be in
#each of the 4 categories:


p <- c(.1,.2,.4,.3)
popower(p, 1.2, 1000)   # OR=1.2, total n=1000
posamsize(p, 1.2)
popower(p, 1.2, 3148)
}
\keyword{htest}
\keyword{category}
\concept{power}
\concept{study design}
\concept{ordinal logistic model}
\concept{ordinal response}
\concept{proportional odds model}

\eof
\name{print.char.matrix}
\alias{print.char.matrix}
%- Also NEED an `\alias' for EACH other topic documented here.
\title{ Function to print a matrix with stacked cells }
\description{
   Prints a dataframe or matrix in stacked cells.  Line break characters
   in a matrix element will result in a line break in that cell, but tab
   characters are not supported.
}
\usage{
print.char.matrix(x, file = "", col.name.align = "cen", col.txt.align = "right", 
    cell.align = "cen", hsep = "|", vsep = "-", csep = "+", row.names = TRUE, 
    col.names = FALSE, append = FALSE,
    top.border = TRUE, left.border = TRUE, \dots)
}

\arguments{
  \item{x}{a matrix or dataframe}
  \item{file}{name of file if file output is desired.  If left empty,
    output will be to the screen}
  \item{col.name.align}{if column names are used, they can be aligned
    right, left or centre. Default \code{"cen"} results in names centred
    between the sides of the columns they name. If the width of the text
    in the columns is less than the width of the name, \code{col.name.align}
    will have no effect. Other options are \code{"right"} and \code{"left"}.}
  \item{col.txt.align}{how character columns are aligned.  Options
    are the same as for \code{col.name.align} with no effect when the width of
    the column is greater than its name.}
  \item{cell.align}{how numbers are displayed in columns}
  \item{hsep}{character string to use as horizontal separator,
    i.e. what separates columns}
  \item{vsep}{character string to use as vertical separator,
    i.e. what separates rows.  Length cannot be more than one.}

  \item{csep}{character string to use where vertical and horizontal
    separators cross.  If \code{hsep} is more than one character,
    \code{csep} will need to be the same length.  There is no provision
    for multiple vertical separators}

  \item{row.names}{logical: are we printing the names of the rows?}
  \item{col.names}{logical: are we printing the names of the columns?}
  \item{append}{logical: if \code{file} is not \code{""}, are we appending to
    the file or overwriting?}
  \item{top.border}{logical: do we want a border along the top above the
    columns?}
  \item{left.border}{logical: do we want a border along the left of the
    first column?}
  \item{\dots}{unused}
  }

\details{
  If any column of \code{x} is a mixture of character and numeric, the
  distinction between character and numeric columns will be lost. This
  is especially so if the matrix is of a form where you would not want
  to print the column names, the column information being in the rows at
  the beginning of the matrix.

  Row names, if not specified in the making of the matrix will simply be
  numbers. To prevent printing them, set \code{row.names = FALSE}.}

\value{
  No value is returned.  The matrix or dataframe will be printed to file
  or to the screen.
}
\author{Patrick Connolly \email{p.connolly@hortresearch.co.nz}}

\seealso{\code{write},  \code{write.table}}

\examples{
data(HairEyeColor)
print.char.matrix(HairEyeColor[ , , "Male"], col.names = TRUE)
print.char.matrix(HairEyeColor[ , , "Female"], col.txt.align = "left", col.names = TRUE)


z <- rbind(c("", "N", "y"),
           c("[ 1.34,40.3)\n[40.30,48.5)\n[48.49,58.4)\n[58.44,87.8]",
             " 50\n 50\n 50\n 50",
             "0.530\n0.489\n0.514\n0.507"),
           c("female\nmale", " 94\n106", "0.552\n0.473"  ),
           c("", "200", "0.510"))
dimnames(z) <- list(c("", "age", "sex", "Overall"),NULL)

print.char.matrix(z)
}
\keyword{print}
\keyword{array}

\eof
\name{prnz}
\alias{prn}
\title{
Print an Object with its Name
}
\description{
Prints an object with its name and with an optional descriptive
text string.  This is useful for annotating analysis output files and
for debugging.
}
\usage{
prn(x, txt)
}
\arguments{
\item{x}{
any object
}
\item{txt}{
optional text string
}}
\section{Side Effects}{
prints
}
\seealso{
\code{\link{print}}, \code{\link{cat}}
}
\examples{
x <- 1:5
prn(x)
# prn(fit, 'Full Model Fit')
}
\keyword{print}

\eof
\name{ps.slide}
\alias{ps.slide}
\alias{setps}
\alias{setpdf}
\alias{topdf}
\alias{tex}
\alias{showPsfrag}
\title{
Postscript and Adobe PDF Setup for 35mm Slides and Other Formats
}
\description{

The \code{ps.slide} function has nice defaults to create postscript
images with larger font, thicker lines, and better axis labeling.  These
images can be used to make nice slides.  There is an option to view the
constructed postscript file using \code{ghostview}, and an option to
initiate a background process to convert the postscript file to a PC
Paintbrush \code{.pcx} file for importing into various PC presentation
graphics packages although with a significant  loss in resolution.  This
option assumes you have installed various public-domain unix image
conversion programs. You can preview \code{.pcx} files using
e.g. \code{xli file.pcx &}. Specify \code{type=1} to make nice fullsize
graphs or \code{type=3} for making 5 x 7" landscape graphs using
14-point type (useful for submitting to journals).  \code{type=2} (the
default) is for color 35mm slides. Use \code{type=4} to make nice black
and white overhead projection transparencies (portrait mode).  This uses
line thickness 4, pointsize 14, height 8, width 7.  For \code{type=3},
numbers on the y-axis are written horizontally (\code{las} defaults to
\code{1} for \code{type=3}).

\code{ps.slide} calls \code{mgp.axis.labels} in Hmisc to set up axis-specific
defaults for the 2nd \code{mgp} graphical parameter.  See \code{Overview} for
Hmisc for help.  This is only used automatically for select high-level graphics
functions in Hmisc and Design, as S-Plus only supports a single
distance between tick marks and tick mark labels using \code{par}, and
when \code{las=1} a larger distance is needed for the y-axis.

See the body of the function for \code{type}-specific default values for many
of the parameters.  This function has not been tested for color output on
Windows systems.

\code{setps} is a function that makes small postscript plots with minimal
surrounding white space, suitable for inclusion in books and reports.
Internally \code{setps} uses (and defines) the \code{psfig} function by
Antonio Possolo (antonio@atc.boeing.com).  \code{setps} is especially good
for including plots in LaTeX.  \code{setps} creates a temporary function in the
session database that when invoked will convert a completed postscript
graphics file to an Adobe Acrobat .pdf if you have Ghostscript
installed and in your path (so that the \code{gs} command is available in
UNIX or \code{gswin32c} is available for Windows/NT).  Invoke \code{topdf} by
the command \code{topdf()}, or, if you want to convert a graphic other than
the last one created, run \code{topdf(filename)} to convert \code{filename.ps}
to \code{filename.pdf}.  If \code{trellis=TRUE}, \code{setps} invokes \code{trellis.device}
with a \code{postscript} device argument, and it does not set any of the
\code{par} parameters.  Arguments
3, 4, 5, 7, 9, and 10 to \code{setps} are ignored if \code{trellis=TRUE}.  If
\code{options(setpsPrefix="name")} is defined, the \code{"name"} string 
will be prefixed to the file name used by \code{setps}.  \code{setpdf} uses a
similar option \code{setpdfPrefix}.  \code{setps} and \code{setpdf} set
\code{par(mgp=c(2,0.4,0))} if \code{trellis=FALSE}.

\code{setpdf} is similar to \code{setps} but for making Adobe Acrobat PDF
graphics files directly.  There are a few problems with the S-Plus
\code{pdf.graph} function used by \code{setpdf}, though: (1) the default for
points (open circle) is too large, (2) graphs are not centered
properly, (3) gray scale does not work, and (4) there is some wasted
space at the bottom of the graph.  When drawing points, the user may
want to specify \code{cex=0.7}.  It may be better to use \code{setps} followed
by \code{topdf()}.

\code{tex} is a little function to save typing when including \code{\\tex}
commands in graphs that are used with the psfrag package in LaTeX to
typeset any LaTeX text inside a postscript graphic.  \code{tex} surrounds
the input character string with \\tex[options]\{\}.  This is especially
useful for getting Greek letters and math symbols in postscript
graphs.  By default \code{tex} returns a string with \code{psfrag} commands
specifying that the string be centered, not rotated, and not specially
enlarged or shrunk.

\code{showPsfrag} is used to display (using ghostview) a postscript image
that contained psfrag LaTeX strings, by building a small LaTeX script
and running latex and dvips.
}
\usage{
ps.slide(file, background = if (type != 2) "white" else "navy blue",
         foreground = if (type == 2) "yellow" else
                     (if(background == "white") "black" else "white"),
         font = "Helvetica", pointsize = c(24, 28, 14, 14)[type],
         hor = type != 4, lwd = c(2, 5, 2, 4)[type],
         mgp = if(under.unix) list(c(1.8, 0.4, 0), c(1.5, 0.2, 0),
                 c(2, 0.4, 0), c(1.5, 0.2, 0))[[type]] else
                 list(c(1.8, 0.5, 0), c(1.5, 0.4, 0), c(2, 0.5, 0),
                 c(1.5, 0.4, 0))[[type]],
         mar = list(c(4, 3, 2, 1) + 0.1, c(5, 4, 2.25, 2) + 0.1,
                 c(3, 3, 1, 1) + 0.1, c(5, 4, 2.25, 2) + 0.1)[[type]],
         pch = 202, view = FALSE, pcx = FALSE, tiff = FALSE,
         close = view | pcx | tiff, bty = "l",
         type = 2, height = switch(type, NULL, NULL, 5, 8),
         width = switch(type, NULL, NULL, 7, 7),
         tck = if (type == 3 || !under.unix) -0.013 else par("tck"),
         las = if (type == 3) 1 else 0, eps =  FALSE, ...)

setps(filename, w=0, h=3, pointsize=10, sublines=0, toplines=0,
      type="symbol", lwd=2, font="Helvetica",
      leftlines=0, las=1,
      trellis=!(missing(setTrellis.) & missing(strip.blank) &
                missing(lty.dot.line) & missing(lwd.dot.line)), 
      setTrellis.=TRUE, 
      strip.blank =TRUE, lty.dot.line = 1, lwd.dot.line = 1,
      seqno=NULL, color=FALSE)


setpdf(filename, w=0, h=4, pointsize=10, sublines=0, toplines=0,
       type="symbol", lwd=1.5, font=if(.R.)"Helvetica" else 1,
       ratio= if(.R.) 4/3 else (1 + sqrt(5))/2,
       leftlines=0, las=1, bty='l', hor=FALSE, 
       trellis=!(missing(setTrellis.) & missing(strip.blank) &
                 missing(lty.dot.line) & missing(lwd.dot.line)), 
       setTrellis.=TRUE, 
       strip.blank =TRUE, lty.dot.line = 1, lwd.dot.line =1,
       region=c(0, 0, h, w), color=FALSE, seqno=NULL, \dots)


tex(string, lref='c', psref='c', scale=1, srt=0)


showPsfrag(filename)
}
\arguments{
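\item{file}{
for \code{ps.slide}, a character string giving the file name prefix;
\code{.ps} is appended (e.g. \code{"myslide"} yields \code{myslide.ps})
}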
\item{filename}{
character string specifying file prefix.  For \code{setps} or \code{setpdf} omit
surrounding quotes unless \code{type="char"}.
}
\item{string}{
a character string to be processed by \code{psfrag} in LaTeX.
}
\item{background}{
default is yellow on navy blue background (black on white when \code{type} is not 2).  
\code{background} may also be
set to any legitimate background color listed in the S-supplied object
\code{ps.colors.rgb}.
}
\item{foreground}{
foreground color.  See \code{background} for allowable values.
}
\item{font}{
font for text.  Replaces the first font in the standard list of fonts
in \code{ps.options("fonts")}.  If \code{font="Times-Roman"}, the fifth font
(normally \code{Helvetica-Bold}) is set to \code{Times-Bold}.  For \code{setpdf},
\code{font} is a number, and the default is \code{1} for Helvetica.   All
default fonts are Helvetica for \code{setps}, \code{psfig}, and \code{ps.slide}.
}
\item{pointsize}{
postscript point size. Set to a larger number if using multiple plots
via \code{par(mfrow=)}.}
\item{hor}{
default is \code{TRUE} to make a horizontal graph
}
\item{lwd}{
line width
}
\item{mgp}{
see \code{par}.  Defaults are chosen according to \code{type}.
}
\item{mar}{
margins (see \code{par})
}
\item{pch}{
see \code{par}
}
\item{view}{
set to \code{TRUE} to initiate a \code{ghostview} run to view the postscript file.
This option will also close out the postscript file (this is done before
viewing).  If you have an active \code{ghostview} window for this file already,
you can just type \code{graphics.off()} or \code{dev.off()} to re-create the \code{.ps} file.
\code{ghostview} will then update the image automatically.
}
\item{pcx}{
set to \code{TRUE} to initiate conversion to \code{pcx} format.  Also implies
\code{close=TRUE}.
}
\item{tiff}{
set to \code{TRUE} to initiate conversion to \code{tiff} format.  Also implies
\code{close=TRUE}.
}
\item{close}{
set to \code{TRUE} to finish construction of the postscript file.
}
\item{bty}{
box type surrounding graph.  Default is \code{"l"} for \code{"L"} shape.  Use
\code{"c"} for complete box.
}
\item{type}{
set \code{type=1} to use black on white background, smaller pointsize,
and other settings that are good for making
overhead transparencies and graphs to include in reports.  Set \code{type=3}
for 5" x 7" landscape plots, and \code{type=4} for overheads.
For \code{setps} and \code{setpdf}, specifies whether \code{filename} is
quoted or not.
}
\item{height}{
defaults to 5 if \code{type=3}, otherwise no default (except for type=4)
}
\item{width}{
defaults to 7 if \code{type=3}, otherwise no default (except for type=4)
}
\item{tck}{
length of tick marks.  See \code{par}.
}
\item{las}{
set to \code{0} to have axis labels always parallel to the axis, \code{1} for
always horizontal, \code{2} for perpendicular to axis
}
\item{eps}{
set to \code{TRUE} if you are going to be importing the postscript file to a
system that really cares that it is marked to officially be encapsulated
postscript.  If you set \code{eps=TRUE}, you may put only one figure in the file
(see the \code{onefile} argument in \code{postscript}).  This applies to UNIX systems
only.
}
\item{...}{
other arguments to \code{ps.options} (or \code{postscript} for Windows or
\code{pdf.graph} for \code{setpdf})
}
\item{w}{
width of plot.  Default is chosen to scale nicely to \code{h} for a
landscape plot
}
\item{h}{
height of plot (default is 3in for \code{setps}, 4in for \code{setpdf})
}
\item{sublines}{
number of lines to reserve for subtitles
}
\item{toplines}{
number of lines to reserve for main title
}
\item{leftlines}{
number of lines to reserve for left margin
}
\item{trellis}{
set to \code{TRUE} to set up for postscript output for Trellis graphics.  This
makes \code{trellis.device("postscript", \dots)} be called instead of
\code{postscript(\dots)} directly, and leaves \code{par} parameters at defaults.
}
\item{setTrellis.}{
set to \code{FALSE} to prevent \code{setTrellis} from being called to set the
strip panel background and to set characteristics for dot plot
reference lines
}
\item{strip.blank}{
set to \code{FALSE} to keep shading in conditioning variable panel titles, if
\code{setTrellis.=TRUE}
}
\item{lty.dot.line}{
if \code{setTrellis.=TRUE}, the line type for dot plot reference lines (default = solid line)
}
\item{lwd.dot.line}{
if \code{setTrellis.=TRUE}, the line width for dot plot reference lines
(default = 1)
}
\item{seqno}{
if non-null, pastes the value of \code{seqno} at the end of the base of the
file name, for \code{setps} and \code{setpdf}
}
\item{color}{
set \code{color=TRUE} to use a color Trellis device instead of default of
black and white, for \code{setps}.  For \code{setpdf} set to \code{TRUE}
to get color pdf graphics.
}
\item{region}{
see \code{pdf.graph}.  Default is to use an image region that is just large
enough to contain the graphic.
}
\item{ratio}{
ratio of width to height of the plot when only one of those is
specified.  Defaults depend on whether S-Plus or R are being used.
}
\item{lref}{
LaTeX reference point for \code{string}.  See the \code{psfrag} documentation
referenced below.  Default is \code{"c"} for centered (this is also the
default for \code{psref}).
}
\item{psref}{
PostScript reference point.
}
\item{scale}{
scale factor, default is 1
}
\item{srt}{
rotation for \code{string} in degrees (default is zero)
}}
\value{
nothing, for most of the functions.  \code{tex} returns a modified
character string.
}
\section{Side Effects}{
Starts a postscript file or a process to convert it to pcx format, or
starts a Trellis postscript device.
\code{ps.slide} stores a system option \code{ps.slide.file}.  \code{setpdf} opens
a graphics file using \code{pdf.graph}.  \code{setps} creates a function \code{topdf}
in frame 0 (the session database).
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\references{
Grant MC, Carlisle (1998): The PSfrag System, Version 3.  Full
documentation is obtained by searching www.ctan.org for pfgguide.ps.
}
\seealso{
\code{\link{postscript}}, \code{\link{par}}, \code{\link{ps.options}},  
\code{\link{mgp.axis.labels}}, \code{\link{pdf}}, \code{\link{trellis.device}}, \code{\link{setTrellis}}
}
\examples{
\dontrun{
ps.slide("myslide")   # myslide is file name prefix
# use ps.slide("myslide",back="green") to use e.g. green background
plot(x, y)
title("My Title")


ps.slide(view=TRUE)   # makes myslide.ps file
                   # use ps.slide(close=TRUE) to close file without viewing with
                   # ghostview.
ps.slide(view=TRUE, pcx=TRUE)
                   # converts myslide.ps into myslide.pcx (PC Paintbrush
                   # format suitable for importing in PC graphics packages)
mgp.axis.labels(c(.4,1.2))  # override 2nd mgp parameters for x- and y axes
mgp.axis.labels(type='x')   # retrieve 3 mgp parameters for x-axis


setps(myfile)      # equiv. to setps('myfile', type='char')
                   # setps(myfile, trellis=TRUE, other args) for Trellis
# plotting commands
dev.off()
topdf()            # topdf created by setps
                   # makes Ghostscript create "myfile.pdf"
setpdf(myfile)
# plotting commands
dev.off()


# Put math and Greek symbols in a graph
setps(test)
x <- seq(0,15,length=100)
plot(x, dchisq(x, 5), xlab=tex('$x$'),
        ylab=tex('$f(x)$'), type='l')
title(tex('Density Function of the $\\chi_{5}^{2}$ Distribution'))
dev.off()
# To process this file in LaTeX do something like
#\documentclass{article}
#\usepackage[scanall]{psfrag}
#\begin{document}
#\begin{figure}
#\includegraphics{test.ps}
#\caption{This is an example}
#\end{figure}
#\end{document}
}
}
\keyword{hplot}
\keyword{device}
\concept{trellis}
\concept{lattice}

\eof
\name{pstamp}
\alias{pstamp}
\title{Date/Time/Directory Stamp the Current Plot}
\description{
Date-time stamp the current plot in the extreme lower right
corner. Optionally add the current working directory and arbitrary other
text to the stamp.
}
\usage{
pstamp(txt, pwd = FALSE, time. = TRUE)
}
\arguments{
  \item{txt}{an optional single text string}
  \item{pwd}{set to \code{TRUE} to add the current working directory
	name to the stamp}
  \item{time.}{set to \code{FALSE} to use the date without the time}
}
\details{
  Certain functions are not supported for S-Plus under Windows.  For \R,
  results may not be satisfactory if \code{par(mfrow=)} is in effect.
}
\author{Frank Harrell}
\examples{
plot(1:20)
pstamp(pwd=TRUE, time=FALSE)
}
\keyword{aplot}

\eof
\name{rMultinom}
\alias{rMultinom}
\title{Generate Multinomial Random Variables with Varying Probabilities}
\description{
Given a matrix of multinomial probabilities where rows correspond to
observations and columns to categories (and each row sums to 1),
generates a matrix with the same number of rows as has \code{probs} and
with \code{m} columns.  The columns represent multinomial cell numbers,
and within a row the columns are all samples from the same multinomial
distribution.  The code is a modification of that in the
\code{impute.polyreg} function in the \code{MICE} package.
}
\usage{
rMultinom(probs, m)
}
\arguments{
  \item{probs}{matrix of probabilities}
  \item{m}{number of samples for each row of \code{probs}}
}
\value{
  an integer matrix having \code{m} columns
}
\seealso{\code{\link{rbinom}}}
\examples{
set.seed(1)
w <- rMultinom(rbind(c(.1,.2,.3,.4),c(.4,.3,.2,.1)),200)
t(apply(w, 1, table)/200)
}
\keyword{distribution}


\eof
\name{rcorr}
\alias{rcorr}
\alias{print.rcorr}
\alias{spearman2}
\alias{spearman2.default}
\alias{spearman2.formula}
\alias{print.spearman2.formula}
\alias{plot.spearman2.formula}
\alias{spearman}
\alias{spearman.test}
\title{
Matrix of Correlations and Generalized Spearman Rank Correlation
}
\description{

\code{rcorr} Computes a matrix of Pearson's \code{r} or Spearman's
\code{rho} rank correlation coefficients for all possible pairs of
columns of a matrix.  Missing values are deleted in pairs rather than
deleting all rows of \code{x} having any missing variables.  Ranks are
computed using efficient algorithms (see reference 2), using midranks
for ties.

\code{spearman2} computes the square of Spearman's rho rank correlation
and a generalization of it in which \code{x} can relate
non-monotonically to \code{y}.  This is done by computing the Spearman
multiple rho-squared between \code{(rank(x), rank(x)^2)} and \code{y}.
When \code{x} is categorical, a different kind of Spearman correlation
used in the Kruskal-Wallis test is computed (and \code{spearman2} can do
the Kruskal-Wallis test).  This is done by computing the ordinary
multiple \code{R^2} between \code{k-1} dummy variables and
\code{rank(y)}, where \code{x} has \code{k} categories.  \code{x} can
also be a formula, in which case each predictor is correlated separately
with \code{y}, using non-missing observations for that predictor.
\code{print} and \code{plot} methods allow one to easily print or plot
the results of \code{spearman2(formula)}.  The adjusted \code{rho^2} is
also computed, using the same formula used for the ordinary adjusted
\code{R^2}.  The \code{F} test uses the unadjusted R2.  For \code{plot},
a dot chart is drawn which by default shows, in sorted order, the
adjusted \code{rho^2}.

\code{spearman} computes Spearman's rho on non-missing values of two
variables.  \code{spearman.test} is a simple version of \code{spearman2.default}.
}
\usage{
rcorr(x, y, type=c("pearson","spearman"))

\method{print}{rcorr}(x, ...)

spearman2(x, ...)

\method{spearman2}{default}(x, y, p=1, minlev=0, exclude.imputed=TRUE, ...)

\method{spearman2}{formula}(x, p=1, 
          data, subset, na.action, minlev=0, exclude.imputed=TRUE, ...)

\method{print}{spearman2.formula}(x, ...)

\method{plot}{spearman2.formula}(x, what=c('Adjusted rho2','rho2','P'),
     sort.=TRUE, main, xlab, \dots)

spearman(x, y)

spearman.test(x, y, p=1)
}
\arguments{
\item{x}{
a numeric matrix with at least 5 rows and at least 2 columns (if
\code{y} is absent).  For \code{spearman2}, the first argument may be a vector
of any type, including character or factor.  The first argument may also be a
formula, in which case all predictors are correlated individually with
the response variable.  \code{x} may be a formula for \code{spearman2}
in which case \code{spearman2.formula} is invoked.  Each
predictor in the right hand side of the formula is separately correlated
with the response variable.  For \code{print}, \code{x} is an object
produced by \code{rcorr} or \code{spearman2}. For \code{plot}, \code{x}
is a result returned by \code{spearman2}.  For \code{spearman} and
\code{spearman.test} \code{x} is a numeric vector, as is \code{y}.
}
\item{type}{
specifies the type of correlations to compute.  Spearman correlations
are the Pearson linear correlations computed on the ranks of non-missing
elements, using midranks for ties.
}
\item{y}{
a numeric vector or matrix which will be concatenated to \code{x}.  If
\code{y} is omitted for \code{rcorr}, \code{x} must be a matrix.
}
\item{p}{
for numeric variables, specifies the order of the Spearman \code{rho^2} to
use.  The default is \code{p=1} to compute the ordinary \code{rho^2}.  Use \code{p=2}
to compute the quadratic rank generalization to allow
non-monotonicity.  \code{p} is ignored for categorical predictors. 
}
\item{data, subset, na.action}{
the usual options for models.  Default for \code{na.action} is to retain
all values, NA or not, so that NAs can be deleted in only a pairwise
fashion.
}
\item{minlev}{
minimum relative frequency that a level of a categorical predictor
should have before it is pooled with other categories (see
\code{combine.levels}) in \code{spearman2}.  The default, \code{minlev=0} causes no pooling.
}
\item{exclude.imputed}{
set to \code{FALSE} to include imputed values (created by \code{impute}) in the calculations.
}
\item{what}{
specifies which statistic to plot
}
\item{sort.}{
set \code{sort.=FALSE} to suppress sorting variables by the statistic being plotted
}
\item{main}{
main title for plot.  Default title shows the name of the response
variable.
}
\item{xlab}{
x-axis label.  Default constructed from \code{what}.
}
\item{...}{
other arguments that are passed to \code{dotchart2}
}}
\value{
\code{rcorr} returns a list with elements \code{r}, the
matrix of correlations, \code{n} the
matrix of number of observations used in analyzing each pair of variables,
and \code{P}, the asymptotic P-values.
Pairs with fewer than 2 non-missing values have the r values set to NA.
The diagonals of \code{n} are the number of non-NAs for the single variable
corresponding to that row and column.  \code{spearman2.default} (the
function that is called for a single \code{x}, i.e., when there is no
formula) returns a vector of statistics for the variable.
\code{spearman2.formula} returns a matrix with rows corresponding to
predictors.
}
\details{
Uses midranks in case of ties, as described by Hollander and Wolfe.
P-values are approximated by using the \code{t} distribution.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
\email{f.harrell@vanderbilt.edu}
}
\references{
Hollander M. and Wolfe D.A. (1973).  Nonparametric Statistical Methods.
New York: Wiley.

Press WH, Flannery BP, Teukolsky SA, Vetterling, WT (1988): Numerical
Recipes in C.  Cambridge: Cambridge University Press.
}
\seealso{
\code{\link{hoeffd}}, \code{\link{cor}}, \code{\link{combine.levels}}, \code{\link{varclus}}, \code{\link{dotchart2}}, \code{\link{impute}}
}
\examples{
x <- c(-2, -1, 0, 1, 2)
y <- c(4,   1, 0, 1, 4)
z <- c(1,   2, 3, 4, NA)
v <- c(1,   2, 3, 4, 5)
rcorr(cbind(x,y,z,v))

spearman2(x, y)
plot(spearman2(z ~ x + y + v, p=2))
}
\keyword{nonparametric}
\keyword{htest}





\eof
\name{rcorr.cens}
\alias{rcorr.cens}
\title{
Rank Correlation for Censored Data
}
\description{
Computes the \code{c} index and the corresponding
generalization of Somers' Dxy rank correlation for a censored response
variable. Also works for uncensored and binary responses, 
although its use of all possible pairings
makes it slow for this purpose.
}
\usage{
rcorr.cens(x, S, outx=FALSE)
}
\arguments{
\item{x}{
a numeric predictor variable
}
\item{S}{
a \code{Surv} object or a vector.  If a vector, assumes that every
observation is uncensored.
}
\item{outx}{
set to \code{TRUE} to not count pairs of observations tied on \code{x} as a
relevant pair.  This results in a Goodman--Kruskal gamma type rank
correlation.
}}
\value{
 a vector with the following named elements:
\code{C Index, Dxy, S.D., n, missing, uncensored, Relevant Pairs, Concordant},
\code{Uncertain}
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{somers2}}
}
\examples{
set.seed(1)
x <- round(rnorm(200))
y <- rnorm(200)
rcorr.cens(x, y, outx=TRUE)   # can correlate non-censored variables
if(.R.) library(survival)
age <- rnorm(400, 50, 10)
d.time <- rexp(400)
cens   <- runif(400,.5,2)
death  <- d.time <= cens
d.time <- pmin(d.time, cens)
rcorr.cens(age, Surv(d.time, death))
}
\keyword{survival}
% Converted by Sd2Rd version 1.21.

\eof
\name{rcorrp.cens}
\alias{rcorrp.cens}
\title{
Rank Correlation for Paired Predictors with a Censored Response
}
\description{
Computes U-statistics to test for whether predictor X1 is more concordant
than predictor X2, extending rcorr.cens.  For method=1, estimates the fraction of
pairs for which the x1 difference is more impressive than the x2 difference.
For method=2, estimates the fraction of pairs for which x1 is concordant with
S but x2 is not.
}
\usage{
rcorrp.cens(x1, x2, S, outx=FALSE, method=1)
}
\arguments{
\item{x1}{
first predictor
}
\item{x2}{
second predictor
}
\item{S}{
a possibly right-censored \code{Surv} object.  If \code{S} is a vector instead,
it is converted to a \code{Surv} object and it is assumed that no observations
are censored.
}
\item{outx}{
set to \code{TRUE} to exclude pairs tied on \code{x1} or \code{x2} from consideration
}
\item{method}{
see above
}}
\value{
a vector of statistics
}
\author{
Frank Harrell
\cr
Department of Biostatistics, Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{rcorr.cens}}, \code{\link{somers2}}, \code{\link{Surv}}
}
\examples{
set.seed(1)
if(.R.) library(survival)
x1 <- rnorm(400)
x2 <- x1 + rnorm(400)
d.time <- rexp(400) + (x1 - min(x1))
cens   <- runif(400,.5,2)
death  <- d.time <= cens
d.time <- pmin(d.time, cens)
rcorrp.cens(x1, x2, Surv(d.time, death))
# rcorrp.cens(x1, x2, y) ## no censoring
}
\keyword{survival}
\keyword{nonparametric}
% Converted by Sd2Rd version 1.21.

\eof
\name{rcspline.eval}
\alias{rcspline.eval}
\title{
Restricted Cubic Spline Design Matrix
}
\description{
Computes matrix that expands a single variable into the terms needed to
fit a restricted cubic spline (natural spline) function using the
truncated power basis. Two normalization options are given for somewhat
reducing problems of ill-conditioning.  The antiderivative function can
be optionally created. If knot locations are not given, they will be
estimated from the marginal distribution of \code{x}.
}
\usage{
rcspline.eval(x, knots, nk=5, inclx=FALSE, knots.only=FALSE, 
              type="ordinary", norm=2, rpm=NULL)
}
\arguments{
\item{x}{
a vector representing a predictor variable
}
\item{knots}{
knot locations. If not given, knots will be estimated using default
quantiles of \code{x}. For 3-5 knots, the outer quantiles used are .05 and .95.
For \code{nk>5}, the outer quantiles are .025 and .975. The knots are
equally spaced between these on the quantile scale. For fewer than 100
non-missing values of \code{x}, the outer knots are the 5th smallest and
largest \code{x}.
}
\item{nk}{
number of knots. Default is 5. The minimum value is 3.
}
\item{inclx}{
set to \code{TRUE} to add \code{x} as the first column of the returned matrix
}
\item{knots.only}{
return the estimated knot locations but not the expanded matrix
}
\item{type}{
\code{"ordinary"} to fit the function, \code{"integral"} to fit its anti-derivative.
}
\item{norm}{
\code{0} to use the terms as originally given by Devlin and Weeks (1986),
\code{1} to normalize non-linear terms by the cube of the spacing between the last two
knots, \code{2} to normalize by the square of the spacing between the first
and last knots (the default).
\code{norm=2} has the advantage of making all
nonlinear terms be on the \code{x}-scale.
}
\item{rpm}{
If given, any NAs in \code{x} will be replaced with the value \code{rpm} after
estimating any knot locations.
}}
\value{
If \code{knots.only=TRUE}, returns a vector of knot locations. Otherwise returns
a matrix with \code{x} (if \code{inclx=TRUE}) followed by \code{nk-2} nonlinear terms.
The matrix has an attribute \code{knots} which is the vector of knots used.
}
\references{
Devlin TF and Weeks BJ (1986): Spline functions for logistic regression
modeling. Proc 11th Annual SAS Users Group Intnl Conf, p. 646--651.
Cary NC: SAS Institute, Inc.
}
\seealso{
\code{\link{ns}}, \code{\link{rcspline.restate}}, \code{\link[Design]{rcs}}
}
\examples{
x <- 1:100
rcspline.eval(x, nk=4, inclx=TRUE)
#lrm.fit(rcspline.eval(age,nk=4,inclx=TRUE), death)
}
\keyword{regression}
\keyword{smooth}
% Converted by Sd2Rd version 1.21.

\eof
\name{rcspline.plot}
\alias{rcspline.plot}
\title{
Plot Restricted Cubic Spline Function
}
\description{
Provides plots of the estimated restricted cubic spline function relating
a single predictor to the response for a logistic or Cox model.
The \code{rcspline.plot} function does not allow for interactions as do
\code{lrm} and \code{cph}, but it can provide detailed output for
checking spline fits. This function uses the \code{rcspline.eval},
\code{lrm.fit}, and Therneau's \code{coxph.fit} functions
and plots the estimated spline regression and confidence limits,
placing summary statistics on the graph. If there are no
adjustment variables, \code{rcspline.plot} can also plot two alternative
estimates of the regression function when \code{model="logistic"}: 
proportions or logit
proportions on grouped data, and a nonparametric estimate. The
nonparametric regression estimate is based on smoothing the binary
responses and taking the logit transformation of the smoothed
estimates, if desired. The smoothing uses \code{supsmu}.
}
\usage{
rcspline.plot(x,y,model="logistic",xrange,event,nk=5,knots=NULL,
             show="xbeta",adj=NULL,xlab,ylab,ylim,plim=c(0,1),plotcl=TRUE,
             showknots=TRUE,add=FALSE,subset,lty=1,noprint=FALSE,m,smooth=FALSE,bass=1,
             main="auto",statloc)
}
\arguments{
\item{x}{
a numeric predictor
}
\item{y}{
a numeric response. For binary logistic regression, \code{y} should be \code{0-1}.
}
\item{model}{
\code{"logistic"} or \code{"cox"}. For \code{"cox"}, uses the \code{coxph.fit}
function with \code{method="efron"}.
}
\item{xrange}{
range for evaluating \code{x}, default is \code{f} and \code{1-f} quantiles of \code{x},
where \code{f=10/max(n,200)}
}
\item{event}{
event/censoring indicator if \code{model="cox"}. If \code{event} is
present, \code{model} is assumed to be \code{"cox"}
}
\item{nk}{
number of knots
}
\item{knots}{
knot locations, default based on quantiles of \code{x} (by
\code{rcspline.eval})
}
\item{show}{
\code{"xbeta"} or \code{"prob"} - what is plotted on \code{y}-axis
}
\item{adj}{
optional matrix of adjustment variables
}
\item{xlab}{
\code{x}-axis label, default is \code{"label"} attribute of \code{x}
}
\item{ylab}{same for \code{y}}
\item{ylim}{
\code{y}-axis limits for logit or log hazard
}
\item{plim}{
\code{y}-axis limits for probability scale
}
\item{plotcl}{
plot confidence limits
}
\item{showknots}{
show knot locations with arrows
}
\item{add}{
add this plot to an already existing plot
}
\item{subset}{
subset of observations to process, e.g. \code{subset=sex=="male"}
}
\item{lty}{
line type for plotting estimated spline function
}
\item{noprint}{
suppress printing regression coefficients and standard
errors
}
\item{m}{
for \code{model="logistic"}, plot grouped estimates with triangles. Each
group contains \code{m} ordered observations on \code{x}.
}
\item{smooth}{
plot nonparametric estimate if \code{model="logistic"} and \code{adj} is
not specified
}
\item{bass}{
smoothing parameter (see \code{supsmu})
}
\item{main}{
main title, default is e.g. \code{"Estimated Spline Transformation"}
}
\item{statloc}{
location of summary statistics. Default positioning by
clicking left mouse button where upper left corner of statistics
should appear. Alternative is \code{"ll"} to place below the graph on the
lower left, or the actual \code{x} and \code{y} coordinates.
Use \code{"none"} to suppress statistics.
}}
\value{
list with components \code{knots, x, xbeta, lower, upper} which are respectively
the knot locations, design matrix, linear predictor, and lower and upper
confidence limits
}
\author{
Frank Harrell
\cr
Department of Biostatistics, Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link[Design]{lrm}}, \code{\link[Design]{cph}}, \code{\link{rcspline.eval}}, \code{\link{plot}}, \code{\link{supsmu}}, \code{\link[survival]{coxph.fit}}, \code{\link[Design]{lrm.fit}}
}
\examples{
# rcspline.plot(cad.dur, tvdlm, m=150)
# rcspline.plot(log10(cad.dur+1), tvdlm, m=150)
}
\keyword{regression}
\keyword{models}
% Converted by Sd2Rd version 1.21.

\eof
\name{rcspline.restate}
\alias{rcspline.restate}
\title{
Re-state Restricted Cubic Spline Function
}
\description{
This function re-states a restricted cubic spline function in
the un-linearly-restricted form. Coefficients for that form are
returned, along with an S functional representation of this function
and a LaTeX character representation of the function.
}
\usage{
rcspline.restate(knots, coef,
                 type=c("ordinary","integral"),
                 x="X", lx=nchar(x),
                 norm=2, columns=65, before="& &", after="\\",
                 begin="", nbegin=0, digits=max(8, .Options$digits))
}
\arguments{
\item{knots}{
vector of knots used in the regression fit
}
\item{coef}{
vector of coefficients from the fit. If the length of \code{coef} is
\code{k-1}, where \code{k=length(knots)}, the first coefficient must be
for the linear term and remaining \code{k-2} coefficients
must be for the constructed terms (e.g., from \code{rcspline.eval}).
If the length of \code{coef} is \code{k}, an intercept is assumed to be in
the first element.
}
\item{type}{
The default is to represent the cubic spline function corresponding
to the coefficients and knots.  Set \code{type="integral"} to instead represent
its anti-derivative.
}
\item{x}{
a character string to use as the variable name in the LaTeX expression
for the formula.
}
\item{lx}{
length of \code{x} to count with respect to \code{columns}. Default is length
of character string contained by \code{x}. You may want to set \code{lx}
smaller than this if it includes non-printable LaTeX commands.
}
\item{norm}{
normalization that was used in deriving the original nonlinear terms
used in the fit. See \code{rcspline.eval} for definitions.
}
\item{columns}{
maximum number of symbols in the LaTeX expression to allow before
inserting a newline (\\\\) command. Set to a very large number to
keep text all on one line.
}
\item{before}{
text to place before each line of LaTeX output. Use \code{"& &"} for an equation
array environment in LaTeX where you want to have a left-hand prefix
e.g. \code{f(X) & = &} or using \code{\\lefteqn}.
}
\item{after}{
text to place at the end of each line of output.
}
\item{begin}{
text with which to start the first line of output. Useful when adding
LaTeX output to part of an existing formula
}
\item{nbegin}{
number of columns of printable text in \code{begin}
}
\item{digits}{
number of significant digits to write for coefficients and knots
}}
\value{
a vector of coefficients. The coefficients are un-normalized
and two coefficients are added that are linearly dependent on the
other coefficients and knots. The vector of coefficients has four
attributes. \code{knots} is a vector of knots, \code{latex} is a vector of text strings
with the LaTeX representation of the formula.
\code{columns.used} is the number of columns used in the output string
since the last newline command.  \code{function} is an S function, which is
also returned in character string format as the \code{text} attribute.
}
\author{
Frank Harrell
\cr
Department of Biostatistics, Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{rcspline.eval}}, \code{\link{ns}}, \code{\link[Design]{rcs}}, \code{\link{latex}}, \code{\link{Function.transcan}}
}
\examples{
set.seed(1)
x <- 1:100
y <- x + rnorm(100, 0, 5)
xx <- rcspline.eval(x, inclx=TRUE, nk=4)
knots <- attr(xx, "knots")
coef <- lsfit(xx, y)$coef
options(digits=4)
# rcspline.restate must ignore intercept
w <- rcspline.restate(knots, coef[-1], x="{\\\\rm BP}")
# could also have used coef instead of coef[-1], to include intercept
cat(attr(w,"latex"), sep="\n")


xtrans <- eval(attr(w, "function"))
# This is an S function of a single argument
plot(x, xtrans(x), type="l")
# Plots fitted transformation


#x <- blood.pressure
xx.simple <- cbind(x, pmax(x-knots[1],0)^3, pmax(x-knots[2],0)^3,
                       pmax(x-knots[3],0)^3, pmax(x-knots[4],0)^3)
pred.value <- coef[1] + xx.simple \%*\% w
plot(x, pred.value, type='l')   # same as above
}
\keyword{regression}
\keyword{interface}
\keyword{character}
% Converted by Sd2Rd version 1.21.

\eof
\name{reShape}
\alias{reShape}
\title{Reshape Matrices and Serial Data}
\description{
If the first argument is a matrix, \code{reShape} strings out its values
and creates row and column vectors specifying the row and column each
element came from.  This is useful for sending matrices to Trellis
functions, for analyzing or plotting results of \code{table} or
\code{crosstabs}, or for reformatting serial data stored in a matrix (with
rows representing multiple time points) into vectors.  The number of
observations in the new variables will be the product of the number of
rows and number of columns in the input matrix.  If the first
argument is a vector, the \code{id} and \code{colvar} variables are used to
restructure it into a matrix, with NAs for elements that corresponded
to combinations of \code{id} and \code{colvar} values that did not exist in the
data.  When more than one vector is given, multiple matrices are
created.  This is useful for restructuring irregular serial data into
regular matrices.  It is also useful for converting data produced by
\code{expand.grid} into a matrix (see the last example).  The number of
rows of the new matrices equals the number of unique values of \code{id},
and the number of columns equals the number of unique values of
\code{colvar}.


A different behavior of \code{reShape} is achieved when \code{base} and \code{reps}
are specified.  In that case \code{x} must be a list or data frame, and
those data are assumed to contain one or more non-repeating
measurements (e.g., baseline measurements) and one or more repeated
measurements represented by variables named by pasting together the
character strings in the vector \code{base} with the integers 1, 2, \dots,
\code{reps}.  The input data are rearranged by repeating each value of the
baseline variables \code{reps} times and by transposing each observation's
values of one of the set of repeated measurements as \code{reps}
observations under the variable whose name does not have an integer
pasted to the end.  If \code{x} has a \code{row.names} attribute, those
observation identifiers are each repeated \code{reps} times in the output
object.  See the last example.
}
\usage{
reShape(x, \dots, id, colvar, base, reps, times=1:reps,
        timevar='seqno')
}
\arguments{
\item{x}{
a matrix or vector, or, when \code{base} is specified, a list or data frame
}
\item{...}{
other optional vectors, if \code{x} is a vector
}
\item{id}{
A numeric, character, category, or factor variable containing subject
identifiers.  Required if \code{x} is a vector, ignored otherwise.
}
\item{colvar}{
A numeric, character, category, or factor variable containing column
identifiers.  \code{colvar} is usually a "time of data collection" variable.
Required if \code{x} is a vector, ignored otherwise.
}
\item{base}{
vector of character strings containing base names of repeated
measurements
}
\item{reps}{
number of times variables named in \code{base} are repeated.  This must be
a constant.
}
\item{times}{
when \code{base} is given, \code{times} is the vector of times to create
if you do not want to use consecutive integers beginning with 1.
}
\item{timevar}{
specifies the name of the time variable to create if \code{times} is
given, if you do not want to use \code{seqno}
}
}
\value{
If \code{x} is a matrix, returns a list containing the row variable, the
column variable, and the \code{as.vector(x)} vector, named the same as the
calling argument was called for \code{x}.  If \code{x} is a vector and no other
vectors were specified as \code{\dots}, the result is a matrix.  If at least
one vector was given to \code{\dots}, the result is a list containing \code{k}
matrices, where \code{k} is one plus the number of vectors in \code{\dots}.  If \code{x}
is a list or data frame, the same type of object is returned.
}
\details{
In converting \code{dimnames} to vectors, the resulting variables are
numeric if all elements of the matrix dimnames can be converted to
numeric, otherwise the corresponding row or column variable remains
character.  When the \code{dimnames} of \code{x} have a \code{names} attribute, those
two names become the new variable names.  If \code{x} is a vector and
another vector is also given (in \code{\dots}), the matrices in the resulting
list are named the same as the input vector calling arguments.  You
can specify customized names for these on-the-fly by using
e.g. \code{reShape(X=x, Y=y, id= , colvar= )}.  The new names will then be
\code{X} and \code{Y} instead of \code{x} and \code{y}.   A new variable named \code{seqno} is
also added to the resulting object.  \code{seqno} indicates the sequential
repeated measurement number.  When \code{base} and \code{times} are
specified, this new 
variable is named the character value of \code{timevar} and the values
are given by a table lookup into the vector \code{times}.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University School of Medicine
\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{as.vector}}, \code{\link{matrix}}, \code{\link{dimnames}}, \code{\link{outer}}, \code{\link{table}}
}
\examples{
if(.R.) {
  set.seed(1)
  Solder  <- factor(sample(c('Thin','Thick'),200,TRUE),c('Thin','Thick'))
  Opening <- factor(sample(c('S','M','L'),  200,TRUE),c('S','M','L'))
} else attach(solder[solder$skips > 10, ])
tab <- table(Opening, Solder)
tab
reShape(tab)
# attach(tab)  # do further processing


if(!.R.) {
 g <- crosstabs( ~ Solder + Opening, data = solder, subset = skips > 10)
 rowpct <- 100*attr(g,'marginals')$"N/RowTotal"   # compute row pcts
 rowpct


 r <- reShape(rowpct)
 # note names "Solder" and "Opening" came originally from formula
 # given to crosstabs
 r    
 dotplot(Solder ~ rowpct, groups=Opening, panel=panel.superpose, data=r)
}


# An example where a matrix is created from irregular vectors
follow <- data.frame(id=c('a','a','b','b','b','d'),
                     month=c(1, 2,  1,  2,  3,  2),
                     cholesterol=c(225,226, 320,319,318, 270))
follow
attach(follow)
reShape(cholesterol, id=id, colvar=month)
detach('follow')
# Could have done :
# reShape(cholesterol, triglyceride=trig, id=id, colvar=month)


# Get predictions from a regression model for 2 systematically
# varying predictors.  Convert the predictions into a matrix, with
# rows corresponding to the predictor having the most values, and
# columns corresponding to the other predictor
# d <- expand.grid(x2=0:1, x1=1:100)
# pred <- predict(fit, d)
# reShape(pred, id=d$x1, colvar=d$x2)  # makes 100 x 2 matrix


# Reshape a wide data frame containing multiple variables representing
# repeated measurements (3 repeats on 2 variables; 4 subjects)
set.seed(33)
n <- 4
w <- data.frame(age=rnorm(n, 40, 10),
                sex=sample(c('female','male'), n,TRUE),
                sbp1=rnorm(n, 120, 15),
                sbp2=rnorm(n, 120, 15),
                sbp3=rnorm(n, 120, 15),
                dbp1=rnorm(n,  80, 15),
                dbp2=rnorm(n,  80, 15),
                dbp3=rnorm(n,  80, 15), row.names=letters[1:n])
options(digits=3)
w


u <- reShape(w, base=c('sbp','dbp'), reps=3)
u
reShape(w, base=c('sbp','dbp'), reps=3, timevar='week', times=c(0,3,12))
}
\keyword{manip}
\keyword{array}
\concept{trellis}
\concept{lattice}
\concept{repeated measures}
\concept{longitudinal data}

\eof
\name{reorder.factor}
\alias{reorder.factor}
\title{Reorder Factor Levels}
\description{
Reorders the levels of a factor variable by the values or the summarized
values of another variable
}
\usage{
reorder.factor(x, v, FUN = mean, ...)
}
\arguments{
  \item{x}{a factor variable}
  \item{v}{a numeric variable the same length as \code{x}}
  \item{FUN}{a statistical summarization function applied to \code{v} by
  levels of \code{x}}
  \item{\dots}{other arguments passed to \code{FUN}}
}
\value{
  a new factor vector
}
\seealso{\code{\link{factor}}}
\examples{
x <- factor(c('a','b','b','c'))
v <- c(3,-1,1,-5)
w <- reorder.factor(x, v)  # uses FUN=mean
w
levels(w)
class(w)
}
\keyword{manip}


\eof
\name{rlegend}
\alias{rlegend}
\title{Special Version of legend for R}
\description{
\code{rlegend} is a version of \code{legend} for \R that implements
\code{plot=FALSE}, adds \code{grid=TRUE}, and defaults \code{lty, lwd,
  pch} to \code{NULL} and checks for \code{length>0} rather than
\code{missing()}, so it's easier to deal with non-applicable
parameters.
}
\usage{
rlegend(x, y, legend, fill, col = "black", lty = NULL, lwd = NULL, pch = NULL, angle = NULL, density = NULL, bty = "o", bg = par("bg"), pt.bg = NA, cex = 1, xjust = 0, yjust = 1, x.intersp = 1, y.intersp = 1, adj = 0, text.width = NULL, merge = do.lines && has.pch, trace = FALSE, ncol = 1, horiz = FALSE, plot = TRUE, grid = FALSE, ...)
}
\arguments{
  \item{x}{}
  \item{y}{}
  \item{legend}{}
  \item{fill}{}
  \item{col}{}
  \item{lty}{}
  \item{lwd}{}
  \item{pch}{}
  \item{angle}{}
  \item{density}{}
  \item{bty}{}
  \item{bg}{}
  \item{pt.bg}{}
  \item{cex}{}
  \item{xjust}{}
  \item{yjust}{}
  \item{x.intersp}{}
  \item{y.intersp}{}
  \item{adj}{}
  \item{text.width}{}
  \item{merge}{}
  \item{trace}{}
  \item{ncol}{}
  \item{horiz}{see \code{\link{legend}}}
  \item{plot}{set to \code{FALSE} to suppress drawing the legend.  This
	is used to compute the size needed for when the legend is drawn
	with a later call to \code{rlegend}.}
  \item{grid}{set to \code{TRUE} if the \code{grid} package is in effect}
  \item{\dots}{see \code{legend}}
}
\value{
  a list with elements \code{rect} and \code{text}.  \code{rect} has
  elements \code{w, h, left, top} with size/position information.
}
\author{Frank Harrell and R-Core}
\seealso{\code{\link{legend}}}
\keyword{aplot}


\eof
\name{rm.boot}
\alias{rm.boot}
\alias{plot.rm.boot}
\title{
Bootstrap Repeated Measurements Model
}
\description{
For a dataset containing a time variable, a scalar response variable,
and an optional subject identification variable, obtains least squares
estimates of the coefficients of a restricted cubic spline function or
a linear regression in
time after adjusting for subject effects through the use of subject
dummy variables.  Then the fit is bootstrapped \code{B} times, either by
treating time and subject ID as fixed (i.e., conditioning the analysis
on them) or as random variables.  For the former, the residuals from
the original model fit are used as the basis of the bootstrap
distribution.  For the latter, samples are taken jointly from the
time, subject ID, and response vectors to obtain unconditional
distributions.  

If a subject \code{id} variable is given, the bootstrap sampling will be
based on samples with replacement from subjects rather than from
individual data points.  In other words, either none or all of a given
subject's data will appear in a bootstrap sample.  This cluster
sampling takes into account any correlation structure that might exist
within subjects, so that confidence limits are corrected for
within-subject correlation.  Assuming that ordinary least squares
estimates, which ignore the correlation structure, are consistent
(which is almost always true) and efficient (which would not be true
for certain correlation structures or for datasets in which the
number of observation times varies greatly from subject to subject), the
resulting analysis will be a robust, efficient repeated measures
analysis for the one-sample problem.

Predicted values of the fitted models are evaluated by default at a
grid of 100 equally spaced time points ranging from the minimum to
maximum observed time points.  Predictions are for the average subject
effect.  Pointwise confidence intervals are optionally computed separately for
each of the points on the time grid.  However, simultaneous confidence
regions that control the level of confidence for the entire regression
curve lying within a band are often more appropriate, as they allow
the analyst to draw conclusions about nuances in the mean time
response profile that were not stated a priori.  The method of Tibshirani
(1997) is used to easily obtain simultaneous confidence sets for the
set of coefficients of the spline or linear regression function as
well as the average 
intercept parameter (over subjects).  Here one computes the objective
criterion (here both the -2 log likelihood evaluated at the bootstrap
estimate of beta but with respect to the original design matrix and
response vector, and the sum of squared errors in predicting the
original response vector) for the original fit as well as for all of the
bootstrap fits.  The confidence set of the regression coefficients is
the set of all coefficients that are associated with objective
function values that are less than or equal to say the 0.95 quantile
of the vector of \code{B + 1} objective function values.  For the coefficients
satisfying this condition, predicted curves are computed at the time
grid, and minima and maxima of these curves are computed separately at
each time point to derive the final simultaneous confidence band.

By default, the log likelihoods that are computed for obtaining the
simultaneous confidence band assume independence within subject.  This
will cause problems unless such log likelihoods have very high rank
correlation with the log likelihood allowing for dependence.  To allow
for correlation or to estimate the correlation function, see
the \code{cor.pattern} argument below.
}
\usage{
rm.boot(time, y, id=seq(along=time), subset,
        plot.individual=FALSE,
        bootstrap.type=c('x fixed','x random'),
        nk=6, knots, B=500, smoother=supsmu, 
        xlab, xlim, ylim=range(y), 
        times=seq(min(time),max(time),length=100),
        absorb.subject.effects=FALSE, 
        rho=0, cor.pattern=c('independent','estimate'), ncor=10000,
        \dots)


\method{plot}{rm.boot}(x, obj2, conf.int=.95,
     xlab=x$xlab, ylab=x$ylab, 
     xlim, ylim=x$ylim,
     individual.boot=FALSE,
     pointwise.band=FALSE,
     curves.in.simultaneous.band=FALSE,
     col.pointwise.band=2,
     objective=c('-2 log L','sse','dep -2 log L'), add=FALSE, ncurves,
     multi=FALSE, multi.method=c('color','density'),
     multi.conf   =c(.05,.1,.2,.3,.4,.5,.6,.7,.8,.9,.95,.99),
     multi.density=c( -1,90,80,70,60,50,40,30,20,10,  7,  4),
     multi.col    =c(  1, 8,20, 5, 2, 7,15,13,10,11,  9, 14),
     subtitles=TRUE, \dots)
}
\arguments{
\item{time}{
numeric time vector
}
\item{y}{
continuous numeric response vector of length the same as \code{time}.
Subjects having multiple measurements have the measurements strung out.
}
\item{x}{
an object returned from \code{rm.boot}
}
\item{id}{
subject ID variable.  If
omitted, it is assumed that each time-response pair is measured on a
different subject.
}
\item{subset}{
subset of observations to process if not all the data
}
\item{plot.individual}{
set to \code{TRUE} to plot nonparametrically smoothed time-response curves for
each subject
}
\item{bootstrap.type}{
specifies whether to treat the time and subject ID variables as fixed
or random
}
\item{nk}{
number of knots in the restricted cubic spline function fit.  The
number of knots may be 0 (denoting linear regression) or an integer
greater than 2 in which \code{k} knots results in \code{k-1} regression
coefficients excluding the intercept.  The default is 6 knots.
}
\item{knots}{
vector of knot locations.  May be specified if \code{nk} is omitted.
}
\item{B}{
number of bootstrap repetitions.  Default is 500.
}
\item{smoother}{
a smoothing function that is used if \code{plot.individual=TRUE}.  Default is
\code{supsmu}.
}
\item{xlab}{
label for x-axis.  Default is \code{"units"} attribute of the original
\code{time} variable, or \code{"Time"} if no such attribute was defined using
the \code{units} function.
}
\item{xlim}{
specifies x-axis plotting limits.  Default is to use range of
times specified to \code{rm.boot}.
}
\item{ylim}{
for \code{rm.boot} this is a vector of y-axis limits used if
\code{plot.individual=TRUE}.  It is also passed along for later use by
\code{plot.rm.boot}.  For \code{plot.rm.boot}, \code{ylim} can be specified, to
override the value stored in the object stored by \code{rm.boot}.  The
default is the actual range of \code{y} in the input data.
}
\item{times}{
a sequence of times at which to evaluate fitted values and confidence
limits.  Default is 100 equally spaced points in the observed range of
\code{time}.
}
\item{absorb.subject.effects}{
If \code{TRUE}, adjusts the response vector \code{y} before re-sampling so that the
subject-specific effects in the initial model fit are all zero.  Then
in re-sampling, subject effects are not used in the models.  This will
downplay one of the sources of variation.  This option is used mainly
for checking for consistency of results, as the re-sampling analyses
are simpler when \code{absorb.subject.effects=TRUE}.
}
\item{rho}{
The log-likelihood function that is used as the basis of simultaneous
confidence bands assumes normality with independence within subject.
To check the robustness of this assumption, if \code{rho} is not zero, the
log-likelihood under multivariate normality within subject, with
constant correlation \code{rho} between any two time points, is also
computed.  If the two log-likelihoods have the same ranks across
re-samples, allowing for the correlation structure does not matter.  The
agreement in ranks is quantified using the Spearman rank correlation
coefficient.  The \code{plot} method allows the non-zero intra-subject
correlation log-likelihood to be used in deriving the simultaneous
confidence band.  Note that this approach does assume
homoscedasticity.
}
\item{cor.pattern}{
More generally than using an equal-correlation structure, you can
specify a function of two time vectors that generates as many
correlations as the length of these vectors.  For example,
\code{cor.pattern=function(time1,time2).2^(abs(time1-time2)/10)} would
specify a dampening serial correlation pattern.  \code{cor.pattern} can
also be a list containing vectors \code{x} (a vector of absolute time
differences) and \code{y} (a corresponding vector of correlations).  To
estimate the correlation function as a function of absolute time
differences within subjects, specify \code{cor.pattern="estimate"}.  The
products of all possible pairs of residuals (or at least up to \code{ncor}
of them) within subjects will be related to the absolute time
difference.  The correlation function is estimated by computing the
sample mean of the products of standardized residuals, stratified by
absolute time difference.  The correlation for a zero time difference
is set to 1 regardless of the \code{lowess} estimate.  NOTE: This approach
fails in the presence of large subject effects; correcting for such
effects removes too much of the correlation structure in the residuals.
}
\item{ncor}{
the maximum number of pairs of time values used in estimating the
correlation function if \code{cor.pattern="estimate"}
}
\item{...}{
other arguments to pass to \code{smoother} if \code{plot.individual=TRUE}
}
\item{obj2}{
a second object created by \code{rm.boot} that can also be passed to
\code{plot.rm.boot}.  This is used for two-sample problems
for which the time profiles are allowed to differ between the two
groups.  The bootstrapped predicted y values for the second fit
are subtracted from the fitted values for the first fit so that
the predicted mean response for group 1 minus the predicted mean response for
group 2 is what is plotted.  The confidence bands that are plotted are
also for this difference.  For the simultaneous confidence band, the
objective criterion is taken to be the sum of the objective criteria
(-2 log L or sum of squared errors) for the separate fits for the two groups.
The \code{times} vectors must have been identical for both calls to \code{rm.boot},
although \code{NA}s can be inserted by the user in one or both of the time
vectors in the \code{rm.boot} objects so as to suppress certain sections of the 
difference curve from being plotted.
}
\item{conf.int}{
the confidence level to use in constructing simultaneous, and
optionally pointwise, bands.  Default is \code{0.95}.
}
\item{ylab}{
label for y-axis.  Default is the \code{"label"} attribute of the
original \code{y} variable, or \code{"y"} if no label was assigned to \code{y}
(using the \code{label} function, for example).
}
\item{individual.boot}{
set to \code{TRUE} to plot the first 100 bootstrap regression fits
}
\item{pointwise.band}{
set to \code{TRUE} to draw a pointwise confidence band in addition to the
simultaneous band
}
\item{curves.in.simultaneous.band}{
set to \code{TRUE} to draw all bootstrap regression fits that had a sum of
squared errors (obtained by predicting the original \code{y} vector from
the original \code{time} vector and \code{id} vector) that was less than or
equal to the \code{conf.int} quantile of all bootstrapped models (plus the
original model).  This will show how the point by point max and min
were computed to form the simultaneous confidence band.
}
\item{col.pointwise.band}{
color for the pointwise confidence band.  Default is \code{2}, which
defaults to red for default Windows S-PLUS setups.
}
\item{objective}{
the default is to use the -2 log of the Gaussian likelihood for computing
the simultaneous confidence region.  If neither \code{cor.pattern} nor
\code{rho} was specified to \code{rm.boot}, the independent homoscedastic
Gaussian likelihood is used.
Otherwise the dependent homoscedastic likelihood is used according to
the specified or estimated correlation pattern.  
Specify \code{objective="sse"} to instead use the sum of squared errors.
}
\item{add}{
set to \code{TRUE} to add curves to an existing plot.  If you do this, titles and
subtitles are omitted.
}
\item{ncurves}{
when using \code{individual.boot=TRUE} or \code{curves.in.simultaneous.band=TRUE},
you can plot a random sample of \code{ncurves} of the fitted curves instead of plotting up to \code{B} of them.
}
\item{multi}{
set to \code{TRUE} to draw multiple simultaneous confidence bands shaded with
different colors.  Confidence levels vary over the values in the \code{multi.conf}
vector. 
}
\item{multi.method}{
specifies the method of shading when \code{multi=TRUE}.  Default is to use
colors, with the default colors chosen so that when the graph is
printed under S-Plus for Windows 4.0 to an HP LaserJet printer, the
confidence regions are naturally ordered by darkness of gray-scale.
Regions closer to the point estimates (i.e., the center) are darker.
Specify \code{multi.method="density"} to instead use densities of lines
drawn per inch in the confidence regions, with all regions drawn with
the default color.  The \code{polygon} function is used to shade the regions.
}
\item{multi.conf}{
vector of confidence levels, in ascending order.  Default is to use 12
confidence levels ranging from 0.05 to 0.99.
}
\item{multi.density}{
vector of densities in lines per inch corresponding to \code{multi.conf}.
As is the convention in the \code{polygon} function, a density of \code{-1}
indicates a solid region.
}
\item{multi.col}{
vector of colors corresponding to \code{multi.conf}.  See \code{multi.method}
for rationale.
}
\item{subtitles}{
set to \code{FALSE} to suppress drawing subtitles for the plot
}}
\value{
an object of class \code{rm.boot} is returned by \code{rm.boot}.  The principal
object stored in the returned object is a matrix of regression
coefficients for the original fit and all of the bootstrap repetitions
(object \code{Coef}), along with vectors of the corresponding -2 log likelihoods
and sums of squared errors.  The original fit object from \code{lm.fit.qr} is stored
in \code{fit}.  For this fit, a cell means model is used for the \code{id} effects.


\code{plot.rm.boot} returns a list containing the vector of times used for
plotting along with the overall fitted values, lower and upper
simultaneous confidence limits, and optionally the pointwise confidence
limits.
}
\details{
Observations having missing \code{time} or \code{y} are excluded from the
analysis.


As most repeated measurement studies consider the
times as design points, the fixed covariable case is the default.
Bootstrapping the residuals from the initial fit assumes
that the model is correctly specified.  Even if the covariables are
fixed, doing an unconditional bootstrap is still appropriate, and for
large sample sizes unconditional confidence intervals are
only slightly wider than conditional ones.  For moderate to small
sample sizes, the \code{"x random"} method can be fairly conservative.


If not all subjects have the same number of observations (after
deleting observations containing missing values) and if
\code{bootstrap.type="x fixed"}, bootstrapped residual vectors may have a
length \code{m} that is different from the number of original observations
\code{n}.  If \code{m > n} for a bootstrap repetition, the
first \code{n} elements of the randomly drawn residuals are used.  
If \code{m < n}, the residual vector is
appended with a random sample with replacement of length \code{n - m} from
itself.  A warning message is issued if this happens.  If the number
of time points per subject varies, the bootstrap results for \code{"x
fixed"} can still be invalid, as this method assumes that a vector
(over subjects) of all residuals can be added to the original yhats,
and varying number of points will cause mis-alignment.


For \code{bootstrap.type="x random"} in the presence of significant subject
effects, the analysis is approximate as the subjects used in any one
bootstrap fit will not be the entire list of subjects.  The average
(over subjects used in the bootstrap sample) intercept is used from
that bootstrap sample as a predictor of average subject effects in the
overall sample.


Once the bootstrap coefficient matrix is stored by \code{rm.boot},
\code{plot.rm.boot} can be run multiple times with different options
(e.g., different confidence levels).


See \code{bootcov} in the \code{Design} library for a general approach to handling
repeated measurement data for ordinary linear models, binary and
ordinal models, and survival models, using the unconditional
bootstrap.  \code{bootcov} does not handle bootstrapping residuals.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University School of Medicine
\cr
f.harrell@vanderbilt.edu
}
\references{
Feng Z, McLerran D, Grizzle J (1996): A comparison of statistical methods for
clustered data analysis with Gaussian error.  Stat in Med 15:1793--1806.


Tibshirani R, Knight K (1997): Model search and inference by bootstrap 
"bumping".  Technical Report, Department of Statistics, University of Toronto.
\cr
http://www-stat.stanford.edu/~tibs. Presented at the Joint Statistical
Meetings, Chicago, August 1996.


Efron B, Tibshirani R (1993): An Introduction to the Bootstrap.
New York: Chapman and Hall.


Diggle PJ, Verbyla AP (1998): Nonparametric estimation of covariance
structure in longitudinal data.  Biometrics 54:401--415.


Chapman IM, Hartman ML, et al (1997): Effect of aging on the
sensitivity of growth hormone secretion to insulin-like growth
factor-I negative feedback.  J Clin Endocrinol Metab 82:2996--3004.
}
\seealso{
\code{\link{rcspline.eval}}, \code{\link{lm}}, \code{\link{lowess}}, \code{\link{supsmu}}, \code{\link[Design]{bootcov}},
\code{\link{units}}, \code{\link{label}}, \code{\link{polygon}}, \code{\link{reShape}}
}
\examples{
# Generate multivariate normal responses with equal correlations (.7)
# within subjects and no correlation between subjects
# Simulate realizations from a piecewise linear population time-response
# profile with large subject effects, and fit using a 6-knot spline
# Estimate the correlation structure from the residuals, as a function
# of the absolute time difference


# Function to generate n p-variate normal variates with mean vector u and
# covariance matrix S
# Slight modification of function written by Bill Venables
# See also the built-in function rmvnorm
mvrnorm <- function(n, p = 1, u = rep(0, p), S = diag(p)) {
  # Z: n*p independent standard normal draws arranged as a p x n matrix
  Z <- matrix(rnorm(n * p), p, n)
  # Premultiply Z by the transposed Cholesky factor of S, add u to every
  # column, then transpose so that the result is n x p (one draw per row)
  t(u + t(chol(S)) \%*\% Z)
}


n     <- 20         # Number of subjects
sub   <- .5*(1:n)   # Subject effects


# Specify functional form for time trend and compute non-stochastic component
times <- seq(0, 1, by=.1)
g     <- function(times) 5*pmax(abs(times-.5),.3)
ey    <- g(times)


# Generate multivariate normal errors for 20 subjects at 11 times
# Assume equal correlations of rho=.7, independent subjects


nt    <- length(times)
rho   <- .7


        
set.seed(19)        
errors <- mvrnorm(n, p=nt, S=diag(rep(1-rho,nt))+rho)
# Note:  first random number seed used gave rise to mean(errors)=0.24!


# Add E[Y], error components, and subject effects
y      <- matrix(rep(ey,n), ncol=nt, byrow=TRUE) + errors + 
          matrix(rep(sub,nt), ncol=nt)


# String out data into long vectors for times, responses, and subject ID
y      <- as.vector(t(y))
times  <- rep(times, n)
id     <- sort(rep(1:n, nt))


# Show lowess estimates of time profiles for individual subjects
f <- rm.boot(times, y, id, plot.individual=TRUE, B=25, cor.pattern='estimate',
             smoother=lowess, bootstrap.type='x fixed', nk=6)
# In practice use B=400 or 500
# This will compute a dependent-structure log-likelihood in addition
# to one assuming independence.  By default, the dep. structure
# objective will be used by the plot method  (could have specified rho=.7)
# NOTE: Estimating the correlation pattern from the residual does not
# work in cases such as this one where there are large subject effects


# Plot fits for a random sample of 10 of the 25 bootstrap fits
plot(f, individual.boot=TRUE, ncurves=10, ylim=c(6,8.5))


# Plot pointwise and simultaneous confidence regions
plot(f, pointwise.band=TRUE, col.pointwise=1, ylim=c(6,8.5))


# Plot population response curve at average subject effect
ts <- seq(0, 1, length=100)
lines(ts, g(ts)+mean(sub), lwd=3)


\dontrun{
#
# Handle a 2-sample problem in which curves are fitted 
# separately for males and females and we wish to estimate the
# difference in the time-response curves for the two sexes.  
# The objective criterion will be taken by plot.rm.boot as the 
# total of the two sums of squared errors for the two models
#
knots <- rcspline.eval(c(time.f,time.m), nk=6, knots.only=TRUE)
# Use same knots for both sexes, and use a times vector that 
# uses a range of times that is included in the measurement 
# times for both sexes
#
tm <- seq(max(min(time.f),min(time.m)),
          min(max(time.f),max(time.m)),length=100)


f.female <- rm.boot(time.f, bp.f, id.f, knots=knots, times=tm)
f.male   <- rm.boot(time.m, bp.m, id.m, knots=knots, times=tm)
plot(f.female)
plot(f.male)
# The following plots female minus male response, with 
# a sequence of shaded confidence band for the difference
plot(f.female,f.male,multi=TRUE)


# Do 1000 simulated analyses to check simultaneous coverage 
# probability.  Use a null regression model with Gaussian errors


n.per.pt <- 30
n.pt     <- 10


null.in.region <- 0


for(i in 1:1000) {
  y    <- rnorm(n.pt*n.per.pt)
  time <- rep(1:n.per.pt, n.pt)
#  Add the following line and add ,id=id to rm.boot to use clustering
#  id   <- sort(rep(1:n.pt, n.per.pt))
#  Because we are ignoring patient id, this simulation is effectively
#  using 1 point from each of 300 patients, with times 1,2,3,,,30 


  f <- rm.boot(time, y, B=500, nk=5, bootstrap.type='x fixed')
  g <- plot(f, ylim=c(-1,1), pointwise=FALSE)
  null.in.region <- null.in.region + all(g$lower<=0 & g$upper>=0)
  prn(c(i=i,null.in.region=null.in.region))
}


# Simulation Results: 905/1000 simultaneous confidence bands 
# fully contained the horizontal line at zero
}
}
\keyword{regression}
\keyword{multivariate}
\keyword{htest}
\keyword{hplot}
\concept{bootstrap}
\concept{repeated measures}
\concept{longitudinal data}

\eof
\name{samplesize.bin}
\alias{samplesize.bin}
\title{
Sample Size for 2-sample Binomial
}
\description{
Computes sample size(s) for 2-sample binomial problem given vector
or scalar probabilities in the two groups.
}
\usage{
samplesize.bin(alpha, beta, pit, pic, rho=0.5)
}
\arguments{
\item{alpha}{
scalar ONE-SIDED test size, or two-sided size/2
}
\item{beta}{
scalar or vector of powers
}
\item{pit}{
hypothesized treatment probability of success
}
\item{pic}{
hypothesized control probability of success
}
\item{rho}{
proportion of the sample devoted to the treated group (0 < rho < 1)
}}
\value{
TOTAL sample size(s)
}
\section{AUTHOR}{
Rick Chappell


Dept. of Statistics and Human Oncology


University of Wisconsin at Madison


chappell@stat.wisc.edu
}
\examples{
alpha <- .05
beta <- c(.70,.80,.90,.95)


# N1 is a matrix of total sample sizes whose
# rows vary by hypothesized treatment success probability and
# columns vary by power
# See Meinert's book for formulae.


N1 <- samplesize.bin(alpha, beta, pit=.55, pic=.5)
N1 <- rbind(N1, samplesize.bin(alpha, beta, pit=.60, pic=.5))
N1 <- rbind(N1, samplesize.bin(alpha, beta, pit=.65, pic=.5))
N1 <- rbind(N1, samplesize.bin(alpha, beta, pit=.70, pic=.5))
attr(N1,"dimnames") <- NULL


#Accounting for 5\% noncompliance in the treated group
inflation <- (1/.95)**2
print(round(N1*inflation+.5,0))
}
\keyword{htest}
\keyword{category}
\concept{study design}
\concept{power}

\eof
\name{sasxport.get}
\alias{sasxport.get}
\alias{sasdsLabels}
\title{Enhanced Importing of SAS Transport Files using read.xport}
\description{
Uses the \code{read.xport} and \code{lookup.xport} functions in the
\code{foreign} library to import SAS datasets.  SAS date, time, and
date/time variables are converted to the appropriate POSIX objects in R,
variable names are converted to lower case, SAS labels are associated
with variables, and (by default) integer-valued variables are converted
from storage mode \code{double} to \code{integer}.  If the user ran
\code{PROC FORMAT CNTLOUT=} in SAS and included the resulting dataset in
the SAS version 5 transport file, variables having customized formats
that do not include any ranges (i.e., variables having standard
\code{PROC FORMAT; VALUE} label formats) will have their format labels looked
up, and these variables are converted to S \code{factor}s.

\code{sasdsLabels} reads a file containing \code{PROC CONTENTS}
printed output to parse dataset labels, assuming that \code{PROC
CONTENTS} was run on an entire library.
}
\usage{
sasxport.get(file, force.single = TRUE,
             method=c('read.xport','dataload'), formats=NULL)

sasdsLabels(file)
}
\arguments{
  \item{file}{name of a file containing the SAS transport file.
	\code{file} may be a URL beginning with \code{http://}.  For
\code{sasdsLabels}, \code{file} is the name of a file containing a
\code{PROC CONTENTS} output listing.
}
  \item{force.single}{set to \code{FALSE} to keep integer-valued
	variables not exceeding \eqn{2^{31}-1}{2^31-1} in value from being converted to
	\code{integer} storage mode}
  \item{method}{set to \code{"dataload"} if you have the \code{dataload}
	executable installed and want to use it instead of
	\code{read.xport}.  This seems to correct some errors in which
	rarely some factor variables are always missing when read by
	\code{read.xport} when in fact they have some non-missing values.}
  \item{formats}{a data frame or list (like that created by
	\code{read.xport}) containing \code{PROC FORMAT}
	output, if such output is not stored in the main transport file.}
}
\value{
  If there is more than one dataset in the transport file other than the
  \code{PROC FORMAT} file, the result is a list of data frames
  containing all the non-\code{PROC FORMAT} datasets.  Otherwise the
  result is the single data frame.  \code{sasdsLabels} returns a named
  vector of dataset labels, with names equal to the dataset names.
}
\details{See \code{\link{contents.list}} for a way to print the
directory of SAS datasets when more than one was imported.}
\author{Frank E Harrell Jr}
\seealso{\code{\link{read.xport}},\code{\link{label}},\code{\link{sas.get}},
  \code{\link{DateTimeClasses}},\code{\link{lookup.xport}},
  \code{\link{contents}},\code{\link{describe}}}
\examples{
\dontrun{
# SAS code to generate test dataset:
# libname y SASV5XPT "test2.xpt";
#
# PROC FORMAT; VALUE race 1=green 2=blue 3=purple; RUN;
# PROC FORMAT CNTLOUT=format;RUN;  * Name, e.g. 'format', unimportant;
# data test;
# LENGTH race 3 age 4;
# age=30; label age="Age at Beginning of Study";
# race=2;
# d1='3mar2002'd ;
# dt1='3mar2002 9:31:02'dt;
# t1='11:13:45't;
# output;
#
# age=31;
# race=4;
# d1='3jun2002'd ;
# dt1='3jun2002 9:42:07'dt;
# t1='11:14:13't;
# output;
# format d1 mmddyy10. dt1 datetime. t1 time. race race.;
# run;
# data z; LENGTH x3 3 x4 4 x5 5 x6 6 x7 7 x8 8;
#    DO i=1 TO 100;
#        x3=ranuni(3);
#        x4=ranuni(5);
#        x5=ranuni(7);
#        x6=ranuni(9);
#        x7=ranuni(11);
#        x8=ranuni(13);
#        output;
#        END;
#    DROP i;
#    RUN;
# PROC MEANS; RUN;
# PROC COPY IN=work OUT=y;SELECT test format z;RUN; *Creates test2.xpt;
w <- sasxport.get('test2.xpt')
# To use an existing copy of test2.xpt available on the web:
w <- sasxport.get('http://hesweb1.med.virginia.edu/biostat/s/data/sas/test2.xpt')

describe(w$test)   # see labels, format names for dataset test
# Note: if only one dataset (other than format) had been exported,
# just do describe(w) as sasxport.get would not create a list for that
lapply(w, describe)# see descriptive stats for both datasets
contents(w$test)   # another way to see variable attributes
lapply(w, contents)# show contents of both datasets
options(digits=7)  # compare the following matrix with PROC MEANS output
t(sapply(w$z, function(x)
 c(Mean=mean(x),SD=sqrt(var(x)),Min=min(x),Max=max(x))))
}
}
\keyword{interface}
\keyword{manip}

\eof
\name{scat1d}
\alias{scat1d}
\alias{jitter2}
\alias{jitter2.default}
\alias{jitter2.data.frame}
\alias{datadensity}
\alias{datadensity.data.frame}
\alias{histSpike}
\title{
One-Dimensional Scatter Diagram, Spike Histogram, or Density
}
\description{
\code{scat1d} adds tick marks (bar codes, rug plot) on any of the four
sides of an existing plot, corresponding with non-missing values of a
vector \code{x}.  This is used to show the data density.  Can also place
the tick marks along a curve by specifying y-coordinates to go along
with the \code{x} values. 


If any two values of \code{x} are within \code{eps*w} of each other, where \code{eps}
defaults to .001 and \code{w} is the span of the intended axis, values of
\code{x} are jittered by adding a value uniformly distributed in
\code{[-jitfrac*w, jitfrac*w]}, where \code{jitfrac} defaults to .008.
Specifying \code{preserve=TRUE} invokes \code{jitter2} with a different logic of
jittering. Allows plotting random sub-segments to handle very large
\code{x} vectors (see \code{tfrac}).


\code{jitter2} is a generic method for jittering, which does not add
random noise. It retains unique values and ranks, and randomly
spreads duplicate values at equidistant positions within limits of
enclosing values. \code{jitter2} is especially useful for numeric
variables with discrete values, like rating scales. Missing values
are allowed and are returned. Currently implemented methods are
\code{jitter2.default} for vectors and \code{jitter2.data.frame} which returns
a data.frame with each numeric column jittered.


\code{datadensity} is a generic method used to show data densities in more
complex situations.  In the Design library there is a \code{datadensity}
method for use with \code{plot.Design}.  Here, another \code{datadensity} method
is defined for data frames.  Depending on the \code{which} argument, some
or all of the variables in a data frame will be displayed, with
\code{scat1d} used to display continuous variables and, by default, bars
used to display frequencies of categorical, character, or discrete
numeric variables.  For such variables, when the total length of value
labels exceeds 200, only the first few characters from each level are used.
By default, \code{datadensity.data.frame} will construct
one axis (i.e., one strip) per variable in the data frame.  Variable
names appear to the left of the axes, and the number of missing values
(if greater than zero) appears to the right of the axes.  An optional
\code{group} variable can be used for stratification, where the different
strata are depicted using different colors.  If the \code{q} vector is
specified, the desired quantiles (over all \code{group}s) are displayed
with solid triangles below each axis.


When the sample size exceeds 2000 (this value may be modified using
the \code{nhistSpike} argument), \code{datadensity} calls \code{histSpike} instead of
\code{scat1d} to show the data density for numeric variables.  This results
in a histogram-like display that makes the resulting graphics file
much smaller.  In this case, \code{datadensity} uses the \code{minf} argument
(see below) so that very infrequent data values will not be lost on
the variable's axis, although this will slightly distort the histogram.


\code{histSpike} is another method for showing a high-resolution data
distribution that is particularly good for very large datasets (say
\code{n} > 1000).  By
default, \code{histSpike} bins the continuous \code{x} variable into 100
equal-width bins and then computes the frequency counts within bins.
If \code{add=FALSE} (the default), the function displays either proportions or
frequencies as in a vertical histogram.  Instead of bars, spikes are
used to depict the frequencies.  If \code{add=TRUE}, the function assumes you
are adding small density displays that are intended to take up a small
amount of space in the margins of the overall plot.  The \code{frac}
argument is used as with \code{scat1d} to determine the relative length of
the whole plot that is used to represent the maximum frequency.  No
jittering is done by \code{histSpike}.


\code{histSpike} can also graph a kernel density estimate for \code{x}, or add a
small density curve to any of 4 sides of an existing plot.  When \code{y}
or \code{curve} is specified, the density or spikes are drawn with respect
to the curve rather than the x-axis.
}
\usage{
scat1d(x, side=3, frac=0.02, jitfrac=0.008, tfrac,
       eps=ifelse(preserve,0,.001),
       lwd=0.1, col=par("col"),
       y=NULL, curve=NULL,
       bottom.align=FALSE,
       preserve=FALSE, fill=1/3, limit=TRUE, nhistSpike=2000, nint=100,
       type=c('proportion','count','density'), grid=FALSE, \dots)

jitter2(x, ...)

\method{jitter2}{default}(x, fill=1/3, limit=TRUE, eps=0, presorted=FALSE, ...)

\method{jitter2}{data.frame}(x, ...)

datadensity(object, ...)

\method{datadensity}{data.frame}(object, group,
            which=c("all","continuous","categorical"),
            method.cat=c("bar","freq"),
            col.group=1:10,
            n.unique=10, show.na=TRUE, nint=1, naxes,
            q, bottom.align=nint>1,
            cex.axis=sc(.5,.3), cex.var=sc(.8,.3),
            lmgp=NULL, tck=sc(-.009,-.002),
            ranges=NULL, labels=NULL, \dots)
# sc(a,b) means default to a if number of axes <= 3, b if >=50, use
# linear interpolation within 3-50

histSpike(x, side=1, nint=100, frac=.05, minf=NULL, mult.width=1,
          type=c('proportion','count','density'),
          xlim=range(x), ylim=c(0,max(f)), xlab=deparse(substitute(x)), 
          ylab=switch(type,proportion='Proportion',
                           count     ='Frequency',
                           density   ='Density'),
          y=NULL, curve=NULL, add=FALSE, 
          bottom.align=type=='density', col=par('col'), lwd=par('lwd'),
          grid=FALSE, ...)
}
\arguments{
\item{x}{
a vector of numeric data, or a data frame (for \code{jitter2})
}
\item{object}{
a data frame or list (even with unequal number of observations per
variable, as long as \code{group} is not specified)
}
\item{side}{
axis side to use (1=bottom (default for \code{histSpike}), 2=left, 
3=top (default for \code{scat1d}), 4=right)
}
\item{frac}{
fraction of smaller of vertical and horizontal axes for tick mark lengths.
Can be negative to move tick marks outside of plot.  For \code{histSpike},
this is the relative length to be used for the largest frequency.
When \code{scat1d} calls \code{histSpike}, it multiplies its \code{frac} argument by 2.5.
}
\item{jitfrac}{
fraction of axis for jittering.  If <=0, no jittering is done. If
\code{preserve=TRUE}, the amount of jittering is independent of \code{jitfrac}.
}
\item{tfrac}{
fraction of tick mark to actually draw.  If \code{tfrac<1},
will draw a random fraction \code{tfrac} of the line segment at each point.
This is useful for very large samples or ones with some very dense points.
The default value is 1 if the number of non-missing observations \code{n}
is less than 125, and \code{max(.1, 125/n)} otherwise.
}
\item{eps}{
fraction of axis for determining overlapping points in \code{x}. For
\code{preserve=TRUE} the default is 0 and original unique values are
retained; bigger values of \code{eps} tend to bias observations from dense
to sparse regions, but ranks are still preserved.
}
\item{lwd}{
line width for tick marks, passed to \code{segments}
}
\item{col}{
color for tick marks, passed to \code{segments}
}
\item{y}{
specify a vector the same length as \code{x} to draw tick marks along
a curve instead of by one of the axes.  The \code{y} values are often
predicted values from a model.  The \code{side} argument is ignored
when \code{y} is given.  If the curve is already represented as a table
look-up, you may specify it using the \code{curve} argument instead.  \code{y}
may be a scalar to use a constant vertical placement.
}
\item{curve}{
a list containing elements \code{x} and \code{y} for which linear interpolation
is used to derive \code{y} values corresponding to values of \code{x}.  This
results in tick marks being drawn along the curve.  For \code{histSpike},
interpolated \code{y} values are derived for bin midpoints.
}
\item{bottom.align}{
set to \code{TRUE} to have the bottoms of tick marks (for \code{side=1} or
\code{side=3}) aligned at the y-coordinate.  The default behavior is to
center the tick marks.  For \code{datadensity.data.frame}, \code{bottom.align}
defaults to \code{TRUE} if \code{nint>1}.  In other words, if you are only labeling
the first and last axis tick mark, the \code{scat1d} tick marks are
centered on the variable's axis.
}
\item{preserve}{
set to \code{TRUE} to invoke \code{jitter2}
}
\item{fill}{
maximum fraction of the axis filled by jittered values. If \code{d} are
duplicated values between a lower value \code{l} and upper value \code{u}, then
\code{d} will be spread within \code{+/- fill*min(u-d,d-l)/2}.
}
\item{limit}{
specifies a limit for maximum shift in jittered values. Duplicate
values will be spread within \code{+/- fill*min(limit,min(u-d,d-l)/2)}. The
default \code{TRUE} restricts jittering to the smallest min(u-d,d-l)/2 observed and
results in equal amount of jittering for all d. Setting to \code{FALSE}
allows for locally different amount of jittering, using maximum
space available.
}
\item{nhistSpike}{
If the number of observations exceeds or equals \code{nhistSpike}, \code{scat1d}
will automatically call \code{histSpike} to draw the data density, to
prevent the graphics file from being too large.
}
\item{type}{
used by or passed to \code{histSpike}.  Set to \code{"count"} to display
frequency counts rather than relative frequencies, or \code{"density"} to
display a kernel density estimate computed using the \code{density} function.
}
\item{grid}{
  set to \code{TRUE} if the \R \code{grid} package is in effect for the
  current plot
}
\item{nint}{
number of intervals to divide each continuous variable's axis for
\code{datadensity}. 
For \code{histSpike}, is the number of equal-width intervals for which to
bin \code{x}, and if instead \code{nint} is a character string (e.g.,
\code{nint="all"}), the frequency tabulation is done with no binning.  In
other words, frequencies for all unique values of \code{x} are derived and
plotted.
}
\item{...}{
optional arguments passed to \code{scat1d} from \code{datadensity} or to
\code{histSpike} from \code{scat1d}
}
\item{presorted}{
set to \code{TRUE} to prevent sorting when determining the order l<d<u.
This is useful if an existing meaningful local order would be
destroyed by sorting, as in sin(pi*sort(round(runif(1000,0,10),1))).
}
\item{group}{
an optional stratification variable, which is converted to a \code{factor}
vector if it is not one already
}
\item{which}{
set \code{which="continuous"} to only plot continuous variables, or
\code{which="categorical"} to only plot categorical, character, or discrete
numeric ones.  By default, all types of variables are depicted.
}
\item{method.cat}{
set \code{method.cat="freq"} to depict frequencies of categorical variables
with digits representing the cell frequencies, with size proportional
to the square root of the frequency.  By default, vertical bars are used.
}
\item{col.group}{
colors representing the \code{group} strata.  The vector of colors is
recycled to be the same length as the levels of \code{group}.
}
\item{n.unique}{
number of unique values a numeric variable must have before it is
considered to be a continuous variable
}
\item{show.na}{
set to \code{FALSE} to suppress drawing the number of \code{NA}s to the right of
each axis
}
\item{naxes}{
number of axes to draw on each page before starting a new plot.  You
can set \code{naxes} larger than the number of variables in the data frame
if you want to compress the plot vertically.
}
\item{q}{
a vector of quantiles to display.  By default, quantiles are not shown.
}
\item{cex.axis}{
character size for drawing labels for axis tick marks
}
\item{cex.var}{
character size for variable names and frequency of \code{NA}s
}
\item{lmgp}{
spacing between numeric axis labels and axis (see \code{par} for \code{mgp})
}
\item{tck}{
see \code{tck} under \code{par}
}
\item{ranges}{
a list containing ranges for some or all of the numeric variables.  If
\code{ranges} is not given or if a certain variable is not found in the
list, the empirical range, modified by \code{pretty}, is used.  Example:
\code{ranges=list(age=c(10,100), pressure=c(50,150))}.
}
\item{labels}{
a vector of labels to use in labeling the axes for
\code{datadensity.data.frame}.  Default is to use the names of the
variables in the input data frame.  Note: margin widths computed for
setting aside names of variables use the names, and not these labels.
}
\item{minf}{
For \code{histSpike}, if \code{minf} is specified low bin frequencies are set to
a minimum value of \code{minf} times the maximum bin frequency, so that
rare data points will remain visible.  A good choice of \code{minf} is
0.075.  \code{datadensity.data.frame} passes \code{minf=0.075} to \code{scat1d} to
pass to \code{histSpike}.  Note that specifying \code{minf} will cause the shape
of the histogram to be distorted somewhat.
}
\item{mult.width}{
multiplier for the smoothing window width computed by \code{histSpike} when
\code{type="density"}
}
\item{xlim}{
a 2-vector specifying the outer limits of \code{x} for binning (and
plotting, if \code{add=FALSE} and \code{nint} is a number)
}
\item{ylim}{
\code{y}-axis range for plotting (if \code{add=FALSE})
}
\item{xlab}{
\code{x}-axis label (\code{add=FALSE}); default is name of input argument \code{x}
}
\item{ylab}{
\code{y}-axis label (\code{add=FALSE})
}
\item{add}{
set to \code{TRUE} to add the spike-histogram to an existing plot, to show
marginal data densities
}
}
\value{
\code{histSpike} returns the actual range of \code{x} used in its binning
}
\section{Side Effects}{
\code{scat1d} adds line segments to plot.  \code{datadensity.data.frame} draws a
complete plot.  \code{histSpike} draws a complete plot or adds to an
existing plot.
}
\details{
For \code{scat1d} the length of line segments used is \code{frac*min(par()$pin)
/ par()$uin[opp]} data units, where \code{opp} is the index of the opposite
axis and \code{frac} defaults to .02.  Assumes that \code{plot} has already been
called.  Current \code{par("usr")} is used to determine the range of data
for the axis of the current plot.  This range is used in jittering and
in constructing line segments.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
Nashville TN, USA
\cr
f.harrell@vanderbilt.edu


Martin Maechler (improved \code{scat1d})
\cr
Seminar fuer Statistik
\cr
ETH Zurich SWITZERLAND
\cr
maechler@stat.math.ethz.ch


Jens Oehlschlaegel-Akiyoshi (wrote \code{jitter2})
\cr
Center for Psychotherapy Research
\cr
Christian-Belser-Strasse 79a
\cr
D-70597 Stuttgart Germany
\cr
oehl@psyres-stuttgart.de
}
\seealso{
\code{\link{segments}}, \code{\link{jitter}}, \code{\link{rug}}, \code{\link{plsmo}}, \code{\link{stripplot}}, 
\code{\link{hist.data.frame}},\code{\link{ecdf}},
\code{\link{hist}}, \code{\link{histogram}}, \code{\link{table}}, \code{\link{density}}
}
\examples{
plot(x <- rnorm(50), y <- 3*x + rnorm(50)/2 )
scat1d(x)                 # density bars on top of graph
scat1d(y, 4)              # density bars at right
histSpike(x, add=TRUE)       # histogram instead, 100 bins
histSpike(y, 4, add=TRUE)
histSpike(x, type='density', add=TRUE)  # smooth density at bottom
histSpike(y, 4, type='density', add=TRUE)


smooth <- lowess(x, y)    # add nonparametric regression curve
lines(smooth)             # Note: plsmo() does this
scat1d(x, y=approx(smooth, xout=x)$y) # data density on curve
scat1d(x, curve=smooth)   # same effect as previous command
histSpike(x, curve=smooth, add=TRUE) # same as previous but with histogram
histSpike(x, curve=smooth, type='density', add=TRUE)  
# same but smooth density over curve


plot(x <- rnorm(250), y <- 3*x + rnorm(250)/2)
scat1d(x, tfrac=0)        # dots randomly spaced from axis
scat1d(y, 4, frac=-.03)   # bars outside axis
scat1d(y, 2, tfrac=.2)    # same bars with smaller random fraction


x <- c(0:3,rep(4,3),5,rep(7,10),9)
plot(x, jitter2(x))       # original versus jittered values
abline(0,1)               # unique values unjittered on abline
points(x+0.1, jitter2(x, limit=FALSE), col=2)
                          # allow locally maximum jittering
points(x+0.2, jitter2(x, fill=1), col=3); abline(h=seq(0.5,9,1), lty=2)
                          # fill 3/3 instead of 1/3
x <- rnorm(200,0,2)+1; y <- x^2
x2 <- round((x+rnorm(200))/2)*2
x3 <- round((x+rnorm(200))/4)*4
dfram <- data.frame(y,x,x2,x3)
plot(dfram$x2, dfram$y)   # jitter2 via scat1d
scat1d(dfram$x2, y=dfram$y, preserve=TRUE, col=2)
scat1d(dfram$x2, preserve=TRUE, frac=-0.02, col=2)
scat1d(dfram$y, 4, preserve=TRUE, frac=-0.02, col=2)


pairs(jitter2(dfram))     # pairs for jittered data.frame
# This gets reasonable pairwise scatter plots for all combinations of
# variables where
#
# - continuous variables (with unique values) are not jittered at all, thus
#   all relations between continuous variables are shown as they are,
#   extreme values have exact positions.
#
# - discrete variables get a reasonable amount of jittering, whether they
#   have 2, 3, 5, 10, 20 \dots levels
#
# - different from adding noise, jitter2() will use the available space
#   optimally and no value will randomly mask another
#
# If you want a scatterplot with lowess smooths on the *exact* values and
# the point clouds shown jittered, you just need
#
pairs( dfram ,panel=function(x,y) { points(jitter2(x),jitter2(y))
                                    lines(lowess(x,y)) } )




datadensity(dfram)     # graphical snapshot of entire data frame
datadensity(dfram, group=cut2(dfram$x2,g=3))
                          # stratify points and frequencies by
                          # x2 tertiles and use 3 colors


# datadensity.data.frame(split(x, grouping.variable))
# need to explicitly invoke datadensity.data.frame when the
# first argument is a list
}
\keyword{dplot}
\keyword{aplot}
\keyword{hplot}
\keyword{distribution}
% Converted by Sd2Rd version 1.21.

\eof
\name{score.binary}
\alias{score.binary}
\title{
Score a Series of Binary Variables
}
\description{
Creates a new variable from a series of logical conditions.  The new
variable can be a hierarchical category or score derived from considering
the rightmost \code{TRUE} value among the input variables, an additive point
score, a union, or any of several others by specifying a function using the
\code{fun} argument.
}
\usage{
score.binary(\dots, fun=max, points=1:p, 
             na.rm=funtext == "max", retfactor=TRUE)
}
\arguments{
\item{...}{
a list of variables or expressions which are considered to be binary
or logical
}
\item{fun}{
a function to compute on each row of the matrix represented by
a specific observation of all the variables in \code{\dots}
}
\item{points}{
points to assign to successive elements of \code{\dots} .  The default is
\code{1, 2, \dots, p}, where \code{p} is the number of elements.  If you specify
one number for \code{points}, that number will be duplicated (i.e., equal weights
are assumed).
}
\item{na.rm}{
set to \code{TRUE} to remove \code{NA}s from consideration when processing
each row of the matrix of variables in \code{\dots} .  For \code{fun=max},
\code{na.rm=TRUE} is the default since \code{score.binary} assumes that a
hierarchical scale is based on available information.  Otherwise,
\code{na.rm=FALSE} is assumed.  For \code{fun=mean} you may want to specify
\code{na.rm=TRUE}.
}
\item{retfactor}{
applies if \code{fun=max}, in which case \code{retfactor=TRUE} makes \code{score.binary}
return a \code{factor} object since a hierarchical scale implies
a unique choice.
}}
\value{
a \code{factor} object if \code{retfactor=TRUE} and \code{fun=max} or a numeric vector
otherwise.  Will not contain NAs if \code{na.rm=TRUE} unless every variable in
a row is \code{NA}.  If a \code{factor} object
is returned, it has levels \code{"none"} followed by character
string versions of the arguments given in \code{\dots} .
}
\seealso{
\code{\link{any}}, \code{\link{sum}}, \code{\link{max}}, \code{\link{factor}}
}
\examples{
set.seed(1)
age <- rnorm(25, 70, 15)
previous.disease <- sample(0:1, 25, TRUE)
#Hierarchical scale, highest of 1:age>70  2:previous.disease
score.binary(age>70, previous.disease, retfactor=FALSE)
#Same as above but return factor variable with levels "none" "age>70" 
# "previous.disease"
score.binary(age>70, previous.disease)


#Additive scale with weights 1:age>70  2:previous.disease
score.binary(age>70, previous.disease, fun=sum)
#Additive scale, equal weights
score.binary(age>70, previous.disease, fun=sum, points=c(1,1))
#Same as saying points=1


#Union of variables, to create a new binary variable
score.binary(age>70, previous.disease, fun=any)
}
\keyword{manip}
% Converted by Sd2Rd version 1.21.

\eof
\name{sedit}
\alias{sedit}
\alias{substring.location}
\alias{substring2}
\alias{substring2<-}
\alias{replace.substring.wild}
\alias{numeric.string}
\alias{all.digits}
\title{
Character String Editing and Miscellaneous Character Handling Functions
}
\description{
This suite of functions was written to implement many of the features
of the UNIX \code{sed} program entirely within S-PLUS (function \code{sedit}).
The \code{substring.location} function returns the first and last position
numbers that a sub-string occupies in a larger string.  The \code{substring2<-}
function does the opposite of the builtin function \code{substring}.
It is named \code{substring2} because for S-Plus 5.x there is a built-in
function \code{substring}, but it does not handle multiple replacements in
a single string.
\code{replace.substring.wild} edits character strings in the fashion of
"change xxxxANYTHINGyyyy to aaaaANYTHINGbbbb", if the "ANYTHING"
passes an optional user-specified \code{test} function.  Here, the
"yyyy" string is searched for from right to left to handle
balancing parentheses, etc.  \code{numeric.string}
and \code{all.digits} are two examples of \code{test} functions, to check,
respectively, if each of a vector of strings is a legal numeric or if it contains only
the digits 0-9.  For the case where \code{old="*$"} or \code{old="^*"}, or for
\code{replace.substring.wild} with the same values of \code{old} or with
\code{front=TRUE} or \code{back=TRUE}, \code{sedit} (if \code{wild.literal=FALSE}) and
\code{replace.substring.wild} will edit the largest substring
satisfying \code{test}.

\code{substring2} is just a copy of \code{substring} so that
\code{substring2<-} will work.
}
\usage{
sedit(text, from, to, test, wild.literal=FALSE)
substring.location(text, string, restrict)
# substring(text, first, last) <- setto   # S-Plus only
replace.substring.wild(text, old, new, test, front=FALSE, back=FALSE)
numeric.string(string)
all.digits(string)
substring2(text, first, last=1e6)
substring2(text, first, last) <- value
}
\arguments{
\item{text}{
a vector of character strings for \code{sedit, substring2, substring2<-}
or a single character string for \code{substring.location,
  replace.substring.wild}.
}
\item{from}{
a vector of character strings to translate from, for \code{sedit}.
A single asterisk wild card, meaning allow any sequence of characters
(subject to the \code{test} function, if any) in place of the \code{"*"}.
An element of \code{from} may begin with \code{"^"} to force the match to
begin at the beginning of \code{text}, and an element of \code{from} can end with
\code{"$"} to force the match to end at the end of \code{text}.
}
\item{to}{
a vector of character strings to translate to, for \code{sedit}.
If a corresponding element in \code{from} had an \code{"*"}, the element
in \code{to} may also have an \code{"*"}.  Only single asterisks are allowed.
If \code{to} is not the same length as \code{from}, the \code{rep} function
is used to make it the same length.
}
\item{string}{
a single character string, for \code{substring.location}, \code{numeric.string},
\code{all.digits}
}
\item{first}{
a vector of integers specifying the first position to replace for
\code{substring2<-}.  \code{first} may also be a vector of character strings
that are passed to \code{sedit} to use as patterns for replacing
substrings with \code{setto}.  See one of the last examples below.
}
\item{last}{
a vector of integers specifying the ending positions of the character
substrings to be replaced.  The default is to go to the end of
the string.  When \code{first} is character, \code{last} must be
omitted.
}
\item{setto}{
a character string or vector of character strings used as replacements,
in \code{substring2<-}
}
\item{old}{
a character string to translate from for \code{replace.substring.wild}.
May be \code{"*$"} or \code{"^*"} or any string containing a single \code{"*"} but
not beginning with \code{"^"} or ending with \code{"$"}.
}
\item{new}{
a character string to translate to for \code{replace.substring.wild}
}
\item{test}{
a function of a vector of character strings returning a logical vector
whose elements are \code{TRUE} or \code{FALSE} according
to whether that string element qualifies as the wild card string for
\code{sedit, replace.substring.wild}
}
\item{wild.literal}{
set to \code{TRUE} to not treat asterisks as wild cards and to not look for
\code{"^"} or \code{"$"} in \code{old}
}
\item{restrict}{
a vector of two integers for \code{substring.location} which specifies a
range to which the search for matches should be restricted
}
\item{front}{
specifying \code{front=TRUE} and \code{old="*"} is the same as specifying \code{old="^*"}
}
\item{back}{
specifying \code{back=TRUE} and \code{old="*"} is the same as specifying \code{old="*$"}
}
\item{value}{a character vector}
}
\value{
\code{sedit} returns a vector of character strings the same length as \code{text}.
\code{substring.location} returns a list with components named \code{first}
and \code{last}, each specifying a vector of character positions corresponding
to matches.  \code{replace.substring.wild} returns a single character string.
\code{numeric.string} and \code{all.digits} return a single logical value.
}
\section{Side Effects}{
\code{substring2<-} modifies its first argument
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University School of Medicine
\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{grep}}, \code{\link{substring}}
}
\examples{
x <- 'this string'
substring2(x, 3, 4) <- 'IS'
x
substring2(x, 7) <- ''
x


substring.location('abcdefgabc', 'ab')
substring.location('abcdefgabc', 'ab', restrict=c(3,999))


replace.substring.wild('this is a cat','this*cat','that*dog')
replace.substring.wild('there is a cat','is a*', 'is not a*')
replace.substring.wild('this is a cat','is a*', 'Z')


qualify <- function(x) x==' 1.5 ' | x==' 2.5 '
replace.substring.wild('He won 1.5 million $','won*million',
                       'lost*million', test=qualify)
replace.substring.wild('He won 1 million $','won*million',
                       'lost*million', test=qualify)
replace.substring.wild('He won 1.2 million $','won*million',
                       'lost*million', test=numeric.string)


x <- c('a = b','c < d','hello')
sedit(x, c('=','he*o'),c('==','he*'))


sedit('x23', '*$', '[*]', test=numeric.string)
sedit('23xx', '^*', 'Y_{*} ', test=all.digits)


replace.substring.wild("abcdefabcdef", "d*f", "xy")


x <- "abcd"
substring2(x, "bc") <- "BCX"
x
substring2(x, "B*d") <- "B*D"
x
}
\keyword{manip}
\keyword{character}
% Converted by Sd2Rd version 1.21.

\eof
\name{show.pch}
\alias{show.pch}
\alias{show.col}
\alias{character.table}
\title{Display Colors, Plotting Symbols, and Symbol Numeric Equivalents}
\description{
\code{show.pch} plots the definitions of the \code{pch} parameters.
\code{show.col} plots definitions of integer-valued colors.
\code{character.table} draws numeric equivalents of all latin
characters; the character on line \code{xy} and column \code{z} of the
table has numeric code \code{"xyz"}, which you would surround in quotes
and precede by a backslash.
}
\usage{
show.pch(object = par("font"))
show.col(object=NULL)
character.table(font=1)
}
\arguments{
  \item{object}{font for \code{show.pch}, ignored for \code{show.col}.}
  \item{font}{font}
}
\author{Pierre Joyet \email{pierre.joyet@bluewin.ch}, Frank Harrell}
\seealso{\code{\link{points}}, \code{\link{text}}}
\examples{
\dontrun{
show.pch()
show.col()
character.table()
}}
\keyword{aplot}

\eof
\name{smean.sd}
\alias{smean.cl.normal}
\alias{smean.sd}
\alias{smean.sdl}
\alias{smean.cl.boot}
\alias{smedian.hilow}
\title{
Compute Summary Statistics on a Vector
}
\description{
A number of statistical summary functions is provided for use
with \code{summary.formula} and \code{summarize} (as well as
\code{tapply} and by themselves).
\code{smean.cl.normal} computes 3 summary variables: the sample mean and
lower and upper Gaussian confidence limits based on the t-distribution.
\code{smean.sd} computes the mean and standard deviation.
\code{smean.sdl} computes the mean plus or minus a constant times the
standard deviation.
\code{smean.cl.boot} is a very fast implementation of the basic
nonparametric bootstrap for obtaining confidence limits for the
population mean without assuming normality.
These functions all delete NAs automatically.
\code{smedian.hilow} computes the sample median and a selected pair of
outer quantiles having equal tail areas.
}
\usage{
smean.cl.normal(x, mult=qt((1+conf.int)/2,n-1), conf.int=.95, na.rm=TRUE)

smean.sd(x, na.rm=TRUE)

smean.sdl(x, mult=2, na.rm=TRUE)

smean.cl.boot(x, conf.int=.95, B=1000, na.rm=TRUE, reps=FALSE)

smedian.hilow(x, conf.int=.95, na.rm=TRUE)
}
\arguments{
\item{x}{
for summary functions \code{smean.*}, \code{smedian.hilow}, a numeric vector
from which NAs will be removed automatically
}
\item{na.rm}{
defaults to \code{TRUE} unlike built-in S-Plus functions, so that by
default \code{NA}s are automatically removed
}
\item{mult}{
for \code{smean.cl.normal} is the multiplier of the standard error of the
mean to use in obtaining confidence limits of the population mean
(default is appropriate quantile of the t distribution).  For
\code{smean.sdl}, \code{mult} is the multiplier of the standard deviation used
in obtaining a coverage interval about the sample mean.  The default
is \code{mult=2} to use plus or minus 2 standard deviations.
}
\item{conf.int}{
for \code{smean.cl.normal} and \code{smean.cl.boot} specifies the confidence
level (0-1) for interval estimation of the population mean.  For
\code{smedian.hilow}, \code{conf.int} is the coverage probability the outer
quantiles should target.  When the default, 0.95, is used, the lower
and upper quantiles computed are 0.025 and 0.975.
}
\item{B}{
number of bootstrap resamples for \code{smean.cl.boot}
}
\item{reps}{
set to \code{TRUE} to have \code{smean.cl.boot} return the vector of bootstrapped
means as the \code{reps} attribute of the returned object
}
}
\value{
a vector of summary statistics
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\seealso{
  \code{\link{summarize}}, \code{\link{summary.formula}}
}
\examples{
set.seed(1)
x <- rnorm(100)
smean.sd(x)
smean.sdl(x)
smean.cl.normal(x)
smean.cl.boot(x)
smedian.hilow(x, conf.int=.5)  # 25th and 75th percentiles
}
\keyword{nonparametric}
\keyword{htest}
\concept{bootstrap}

\eof
\name{solvet}
\alias{solvet}
\title{
solve Function with tol argument
}
\description{
A slightly modified version of \code{solve} that allows a tolerance argument
for singularity (\code{tol}) which is passed to \code{qr}.
}
\usage{
solvet(a, b, tol=1e-09)
}
\arguments{
  \item{a}{a square numeric matrix}
  \item{b}{a numeric vector or matrix}
  \item{tol}{tolerance for detecting linear dependencies in columns of
	\code{a}}
  }
\seealso{
\code{\link{solve}}
}
\keyword{array}
\keyword{algebra}


\eof
\name{somers2}
\alias{somers2}
\title{
Somers' Dxy Rank Correlation
}
\description{
Computes Somers' Dxy rank correlation between a variable \code{x} and a
binary (0-1) variable \code{y}, and the corresponding receiver operating
characteristic curve area \code{c}. Note that \code{Dxy = 2(c-0.5)}.  
\code{somers2} allows for a \code{weights} variable, which specifies frequencies
to associate with each observation.
}
\usage{
somers2(x, y, weights=NULL, normwt=FALSE, na.rm=TRUE)
}
\arguments{
\item{x}{
typically a predictor variable. \code{NA}s are allowed.
}
\item{y}{
a numeric outcome variable coded \code{0-1}. \code{NA}s are allowed.
}
\item{weights}{
a numeric vector of observation weights (usually frequencies).  Omit
or specify a zero-length vector to do an unweighted analysis.
}
\item{normwt}{
set to \code{TRUE} to make \code{weights} sum to the actual number of non-missing
observations.
}
\item{na.rm}{
set to \code{FALSE} to suppress checking for NAs.
}}
\value{
a vector with the named elements \code{C}, \code{Dxy}, \code{n} (number of non-missing
pairs), and \code{Missing}. Uses the formula 
\code{C = (mean(rank(x)[y == 1]) - (n1 + 1)/2)/(n - n1)}, where \code{n1} is the
frequency of \code{y=1}.
}
\details{
The \code{rcorr.cens} function, although slower than \code{somers2} for large
sample sizes, can also be used to obtain Dxy for non-censored binary
\code{y}, and it has the advantage of computing the standard deviation of
the correlation index.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University School of Medicine
\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{rcorr.cens}}, \code{\link{rank}}, \code{\link{wtd.rank}}
}
\examples{
set.seed(1)
predicted <- runif(200)
dead      <- sample(0:1, 200, TRUE)
roc.area <- somers2(predicted, dead)["C"]
}
\keyword{nonparametric}
\concept{logistic regression model}
\concept{predictive accuracy}




\eof
\name{spower}
\alias{spower}
\alias{Quantile2}
\alias{print.Quantile2}
\alias{plot.Quantile2}
\alias{logrank}
\alias{Gompertz2}
\alias{Lognorm2}
\alias{Weibull2}
\title{
Simulate Power of 2-Sample Test for Survival under Complex Conditions
}
\description{
Given functions to generate random variables for survival times and censoring
times, \code{spower} simulates the power of a user-given 2-sample test for
censored data.  By default, the logrank (Cox 2-sample) test is used,
and a \code{logrank} function for comparing 2 groups is provided.  For
composing S-Plus functions to generate random survival times under
complex conditions, the \code{Quantile2} function allows the user to
specify the intervention:control hazard ratio as a function of time,
the probability of a control subject actually receiving the
intervention (dropin) as a function of time, and the probability that
an intervention subject receives only the control agent as a function of time
(non-compliance, dropout).  \code{Quantile2} returns a function that
generates either control or intervention uncensored survival times subject to
non-constant treatment effect, dropin, and dropout.  There is a \code{plot}
method for plotting the results of \code{Quantile2}, which will aid in
understanding the effects of the two types of non-compliance and
non-constant treatment effects.  \code{Quantile2} assumes that the hazard
function for either treatment group is a mixture of the control and
intervention hazard functions, with mixing proportions defined by the
dropin and dropout probabilities.  It computes hazards and survival
distributions by numerical differentiation and integration using a
grid of (by default) 7500 equally-spaced time points.

The \code{logrank} function is intended to be used with \code{spower}
but it can be used by itself as long as the \code{group} variable has
only the values \code{1} and \code{2} and there are no missing data.  It
returns the 1 degree of freedom chi-square statistic.

The \code{Weibull2} function accepts as input two vectors, one
containing two times and one containing two survival probabilities, and
it solves for the scale and shape parameters of the Weibull distribution
(\code{S(t)=exp(-alpha*t^ gamma)}) which will yield those estimates.  It
creates an S-Plus function to evaluate survival probabilities from this
Weibull distribution.  \code{Weibull2} is useful in creating functions
to pass as the first argument to \code{Quantile2}.

The \code{Lognorm2} and \code{Gompertz2} functions are similar to
\code{Weibull2} except that they produce survival functions for the
log-normal and Gompertz distributions.
}
\usage{
spower(rcontrol, rinterv, rcens, nc, ni, 
       test=logrank, nsim=500, alpha=0.05, pr=TRUE)

Quantile2(scontrol, hratio, 
          dropin=function(times)0, dropout=function(times)0,
          m=7500, tmax, qtmax=.001, mplot=200, pr=TRUE, \dots)

\method{print}{Quantile2}(x, \dots)

\method{plot}{Quantile2}(x, 
     what=c('survival','hazard','both','drop','hratio','all'),
     dropsep=FALSE, lty=1:4, col=1, xlim, ylim=NULL,
     label.curves=NULL, \dots)

logrank(S, group)

Gompertz2(times, surv)
Lognorm2(times, surv)
Weibull2(times, surv)


}
\arguments{
\item{rcontrol}{
a function of \code{n} which returns \code{n} random uncensored failure times for
the control group.  \code{spower} assumes that non-compliance (dropin) has
been taken into account by this function.
}
\item{rinterv}{
similar to \code{rcontrol} but for the intervention group
}
\item{rcens}{
a function of \code{n} which returns \code{n} random censoring times.  It is
assumed that both treatment groups have the same censoring distribution.
}
\item{nc}{
number of subjects in the control group
}
\item{ni}{
number in the intervention group
}
\item{scontrol}{
a function of a time vector which returns the survival probabilities
for the control group at those times assuming that all patients are compliant
}
\item{hratio}{
a function of time which specifies the intervention:control hazard
ratio (treatment effect)
}
\item{x}{
an object of class \code{"Quantile2"} created by \code{Quantile2}
}
\item{S}{
a \code{Surv} object or other two-column matrix for right-censored survival
times
}
\item{group}{
group indicators have length equal to the number of rows in \code{S}.  Only
values allowed are 1 and 2.
}
\item{times}{
a vector of two times
}
\item{surv}{
a vector of two survival probabilities
}
\item{test}{
any function of a \code{Surv} object and a grouping variable which computes
a chi-square for a two-sample censored data test.  The default is \code{logrank}.
}
\item{nsim}{
number of simulations to perform (default=500)
}
\item{alpha}{
type I error (default=.05)
}
\item{pr}{
set to \code{FALSE} to cause \code{spower} to suppress progress notes for
simulations.
Set to \code{FALSE} to prevent \code{Quantile2} from printing \code{tmax} when it
calculates \code{tmax}.
}
\item{dropin}{
a function of time specifying the probability that a control subject
actually becomes an intervention subject at the corresponding time
}
\item{dropout}{
a function of time specifying the probability of an intervention
subject dropping out to control conditions
}
\item{m}{
number of time points used for approximating functions (default is 7500)
}
\item{tmax}{
maximum time point to use in the grid of \code{m} times.  Default is the
time such that \code{scontrol(time)} is \code{qtmax}.
}
\item{qtmax}{
survival probability corresponding to the last time point used for
approximating survival and hazard functions.  Default is \code{.001}.
A fraction \code{qtmax} of the time, the simulated time needed
corresponds to a survival probability of less than \code{qtmax}; in
those cases the simulated value will be \code{tmax}.
}
\item{mplot}{
number of points used for approximating functions for use in plotting
(default is 200 equally spaced points)
}
\item{...}{
optional arguments passed to the \code{scontrol} function when it's
evaluated by \code{Quantile2}
}
\item{what}{
a single character constant (may be abbreviated) specifying which
functions to plot.  The default is \code{"both"} meaning both survival and
hazard functions.  Specify \code{what="drop"} to just plot the dropin and
dropout functions, \code{what="hratio"} to plot the hazard ratio functions,
or \code{"all"} to make 4 separate plots showing all functions (6 plots if
\code{dropsep=TRUE}).
}
\item{dropsep}{
set \code{dropsep=TRUE} to make \code{plot.Quantile2} separate pure and
contaminated functions onto separate plots
}
\item{lty}{
vector of line types
}
\item{col}{
vector of colors
}
\item{xlim}{
optional x-axis limits
}
\item{ylim}{
optional y-axis limits
}
\item{label.curves}{
optional list which is passed as the \code{opts} argument to \code{labcurve}.
}}
\value{
\code{spower} returns the power estimate (fraction of simulated chi-squares
greater than the alpha-critical value).  \code{Quantile2} returns an S-Plus
function of class \code{"Quantile2"} with attributes that drive the \code{plot} method.  The major
attribute is a list containing several lists.  Each of these
sub-lists contains a \code{Time} vector along with one of the following:
survival probabilities for either treatment group and with or without
contamination caused by non-compliance, hazard rates in a similar way,
intervention:control hazard ratio function with and without
contamination, and dropin and dropout functions.  \code{logrank} returns a
single chi-square statistic, and \code{Weibull2}, \code{Lognorm2} and \code{Gompertz2}
return an S function with
three arguments, only the first of which (the vector of \code{times}) is
intended to be specified by the user.
}
\section{Side Effects}{
\code{spower} prints the iteration number every 10 iterations if \code{pr=TRUE}.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University School of Medicine
\cr
f.harrell@vanderbilt.edu
}
\references{
Lakatos E (1988): Sample sizes based on the log-rank statistic in complex
clinical trials.  Biometrics 44:229--241 (Correction 44:923).


Cuzick J, Edwards R, Segnan N (1997): Adjusting for non-compliance and 
contamination in randomized clinical trials. Stat in Med 16:1017--1029.

Cook, T (2003): Methods for mid-course corrections in clinical trials
with survival outcomes.  Stat in Med 22:3431--3447.
}
\seealso{
\code{\link{cpower}}, \code{\link{ciapower}}, \code{\link{bpower}}, \code{\link[Design]{cph}}, \code{\link[survival]{coxph}}, \code{\link{labcurve}}
}
\examples{
# Simulate a simple 2-arm clinical trial with exponential survival so
# we can compare power simulations of logrank-Cox test with cpower()
# Hazard ratio is constant and patients enter the study uniformly
# with follow-up ranging from 1 to 3 years
# Drop-in probability is constant at .1 and drop-out probability is
# constant at .175.  Two-year survival of control patients in absence
# of drop-in is .8 (mortality=.2).  Note that hazard rate is -log(.8)/2
# Total sample size (both groups combined) is 1000
# \% mortality reduction by intervention (if no dropin or dropout) is 25
# This corresponds to a hazard ratio of 0.7283 (computed by cpower)


cpower(2, 1000, .2, 25, accrual=2, tmin=1, 
       noncomp.c=10, noncomp.i=17.5)


ranfun <- Quantile2(function(x)exp(log(.8)/2*x),
                    hratio=function(x)0.7283156,
                    dropin=function(x).1,
                    dropout=function(x).175)


rcontrol <- function(n) ranfun(n, what='control')
rinterv  <- function(n) ranfun(n, what='int')
rcens    <- function(n) runif(n, 1, 3)


set.seed(11)   # So can reproduce results
spower(rcontrol, rinterv, rcens, nc=500, ni=500, 
       test=logrank, nsim=50)  # normally use nsim=500 or 1000


# Simulate a 2-arm 5-year follow-up study for which the control group's
# survival distribution is Weibull with 1-year survival of .95 and
# 3-year survival of .7.  All subjects are followed at least one year,
# and patients enter the study with linearly increasing probability  after that
# Assume there is no chance of dropin for the first 6 months, then the
# probability increases linearly up to .15 at 5 years
# Assume there is a linearly increasing chance of dropout up to .3 at 5 years
# Assume that the treatment has no effect for the first 9 months, then
# it has a constant effect (hazard ratio of .75)


# First find the right Weibull distribution for compliant control patients
sc <- Weibull2(c(1,3), c(.95,.7))
sc


# Inverse cumulative distribution for case where all subjects are followed
# at least a years and then between a and b years the density rises
# as (time - a) ^ d is a + (b-a) * u ^ (1/(d+1))


rcens <- function(n) 1 + (5-1) * (runif(n) ^ .5)
# To check this, type hist(rcens(10000), nclass=50)


# Put it all together


f <- Quantile2(sc, 
      hratio=function(x)ifelse(x<=.75, 1, .75),
      dropin=function(x)ifelse(x<=.5, 0, .15*(x-.5)/(5-.5)),
      dropout=function(x).3*x/5)


par(mfrow=c(2,2))
# par(mfrow=c(1,1)) to make legends fit
plot(f, 'all', label.curves=list(keys='lines'))


rcontrol <- function(n) f(n, 'control')
rinterv  <- function(n) f(n, 'intervention')


set.seed(211)
spower(rcontrol, rinterv, rcens, nc=350, ni=350, 
       test=logrank, nsim=50)  # normally nsim=500 or more
par(mfrow=c(1,1))
}
\keyword{htest}
\keyword{survival}
\concept{power}
\concept{study design}

\eof
\name{spss.get}
\alias{spss.get}
\title{Enhanced Importing of SPSS Files}
\description{
\code{spss.get} invokes the \code{read.spss} function in the
\code{foreign} package to read an SPSS file, with a default output
format of \code{"data.frame"}.  The \code{label} function is used to
attach labels to individual variables instead of to the data frame as
done by \code{read.spss}.  By default, integer-valued variables are
converted to a storage mode of integer unless
\code{force.single=FALSE}.  Date variables are converted to R POSIXct
variables (or S-Plus \code{chron} variables if \code{typeDate="chron"}).
}
\usage{
spss.get(file, datevars = NULL, use.value.labels = TRUE, to.data.frame =
TRUE, max.value.labels = Inf, typeDate = c('POSIX','chron'), force.single=TRUE)
}
\arguments{
  \item{file}{input SPSS save file}
  \item{datevars}{vector of variable names containing dates to be
	converted to R or S-Plus internal format}
  \item{use.value.labels}{see \code{\link{read.spss}}}
  \item{to.data.frame}{see \code{\link{read.spss}}; default is
	\code{TRUE} for \code{spss.get}}
  \item{max.value.labels}{see \code{\link{read.spss}}}
  \item{typeDate}{set to \code{"chron"} to convert variables in
	\code{datevars} to \code{chron} objects (suitable for S-Plus)
	instead of the default of \code{"POSIX"}}
  \item{force.single}{set to \code{FALSE} to prevent integer-valued
	variables from being converted from storage mode \code{double} to
	\code{integer}}
}
\value{
  a data frame or list
}
\author{Frank Harrell}
\seealso{\code{\link{read.spss}},\code{\link{cleanup.import}},\code{\link{sas.get}}}

\examples{
\dontrun{
w <- spss.get('/tmp/my.sav', datevars=c('birthdate','deathdate'),
              typeDate='chron')
# Create a file S-Plus can read
dump('w', '/tmp/w')
# To read into S-Plus 6 do:
# library(Hmisc,T); source('/tmp/w'); w <- cleanup.import(w)
  }
}
\keyword{interface}
\keyword{manip}

\eof
\name{src}
\alias{src}
\title{Source a File from the Current Working Directory}
\description{
\code{src} concatenates \code{".s"} to its argument, quotes the result,
and \code{source}s in the file.  It sets \code{options(last.source)} to
this file name so that \code{src()} can be issued to re-\code{source}
the file when it is edited.
}
\usage{
src(x)
}
\arguments{
  \item{x}{an unquoted file name aside from \code{".s"}.  This base file
  name must be a legal S name.}
}
\section{Side Effects}{
  Sets system option \code{last.source}
}
\author{Frank Harrell}
\seealso{\code{\link{source}}}
\examples{
\dontrun{
src(myfile)   # source("myfile.s")
src()         # re-source myfile.s
}
}
\keyword{file}
\keyword{programming}
\keyword{utilities}

\eof
\name{store}
\alias{store}
\alias{stores}
\alias{storeTemp}
\title{
Store an Object Permanently
}
\description{
By default, \code{store} will copy the object to \code{.Data} under the
same name. 
This function is most useful when you have attached a data frame or a
temporary directory
in position 1.  \code{store} is also useful for setting up to store later
objects in a temporary work area (\code{.Data.tempnnnn}, where \code{nnnn} is a
number computed by the system) so that they are not stored on
disk.  For this usage, just invoke \code{store} with no arguments, i.e.,
\code{store()}.  After that, you can still invoke \code{store} with arguments
so that the object is copied to permanent storage.  Another function,
\code{stores} is useful for storing a series of temporary objects in
\code{.Data} with one call.  \code{store} and \code{stores} are not available
for \R.  See Details below for a method of approximating the use of
\code{store} in \R.

\code{storeTemp} stores an object in frame 0 for S-Plus or in a temporary
environment \code{.GlobalTemp} in \R, attaching that environment if it is
not already attached, so that the objects are easily available.
}
\usage{
store(object, name=as.character(substitute(object)),
      where=if (under.unix || .SV4.) ".Data" else "_Data")
stores(\dots)
storeTemp(object, name=deparse(substitute(object)))
}
\arguments{
\item{object}{
object to store (omit to set search list position one to a 
temporary directory created by \code{store})
}
\item{name}{
name under which to store the object. Default is name of object in
call to \code{store()}.
}
\item{where}{
directory in which to store object. Default is \code{.Data} underneath current
directory (for UNIX) or position 2 in the search list (for Windows).
For \R the default is \code{.GlobalEnv}.
}
\item{...}{
a list of objects to store in \code{.Data} or \code{.GlobalEnv} permanently,
using names which are the same as the argument names
}}
\section{Side Effects}{
uses \code{assign} and \code{attach} functions.  \code{store} with no arguments also
stores a function \code{.Last} in \code{.Data.tempnnnn}, which
will cause \code{.Data.tempnnnn} to be removed when the S session ends.
For S-Plus, \code{store()}
causes creation of a system option named \code{.store.temp} which contains
the name of the temporary directory created.
}
\seealso{
\code{\link{assign}}, \code{\link{.Last}}, \code{\link{attach}}, \code{\link{search}}
}
\details{
To almost mimic the functionality of \code{store} or \code{stores} in \R,
you can do the following.  Use \code{save(x,y,z,file="Permdata")} to save
permanent objects in \code{"Permdata"}.  When you exit \R, do not save the
workspace.  Then all temporary objects will disappear.  In your
\code{.Rprofile} put the command \code{load("Permdata")} so that the next time
you invoke \R the permanent objects will be available.
}
\examples{
\dontrun{
attach(database, 1)     #this database takes precedence
store()                 #new objects to go under database in memory
                        #this doesn't work in R
f <- lm(y ~ x)
store(f)                #store f under name "f" in .Data or .GlobalEnv
                        #uses assign() with immediate=T
store(f, "final.model") #immediately store f under "final.model" in .Data
store()                 #store future objects in .Data.tempnnnn
x <- runif(1000)        #x will disappear at session end unless
store(x)                #this statement appears -> store in .Data
stores(x, y, z)         #store x,y,z in .Data under names x,y,z
storeTemp(x)            #put x as name 'x' in frame 0
                        #for R, attach .GlobalTemp and store it there
storeTemp(x,'X')        #same as previous but under the name X
}
}
\keyword{data}



\eof
\name{summarize}
\alias{summarize}
\alias{mApply}
\alias{asNumericMatrix}
\alias{matrix2dataFrame}
\alias{subsAttr}
\title{Summarize Scalars or Matrices by Cross-Classification}
\description{
\code{summarize} is a fast version of \code{summary(formula,
method="cross",overall=FALSE)} for producing stratified summary statistics
and storing them in a data frame for plotting (especially with trellis
\code{xyplot} and \code{dotplot} and Hmisc \code{xYplot}).  Unlike
\code{aggregate}, \code{summarize} accepts a matrix as its first
argument and a multi-valued \code{FUN}
argument and \code{summarize} also labels the variables in the new data
frame using their original names.  Unlike methods based on
\code{tapply}, \code{summarize} stores the values of the stratification
variables using their original types, e.g., a numeric \code{by} variable
will remain a numeric variable in the collapsed data frame.
\code{summarize} also retains \code{"label"} attributes for variables.
\code{summarize} works especially well with the Hmisc \code{xYplot}
function for displaying multiple summaries of a single variable on each
panel, such as means and upper and lower confidence limits.

\code{mApply} is like \code{tapply} except that the first argument can
be a matrix, and the output is cleaned up if \code{simplify=TRUE}.  It
uses code adapted from Tony Plate (\email{tplate@blackmesacapital.com}) to
operate on grouped submatrices.

As \code{mApply} can be much faster than using \code{by}, it is often
worth the trouble of converting a data frame to a numeric matrix for
processing by \code{mApply}.  \code{asNumericMatrix} will do this, and
\code{matrix2dataFrame} will convert a numeric matrix back into a data
frame if attributes and storage modes of the original variables are
saved by calling \code{subsAttr}.  \code{subsAttr} saves attributes that
are commonly preserved across row subsetting (i.e., it does not save
\code{dim}, \code{dimnames}, or \code{names} attributes).
}
\usage{
summarize(X, by, FUN, \dots, 
          stat.name=deparse(substitute(X)),
          type=c('variables','matrix'), subset=TRUE)

mApply(X, INDEX, FUN=NULL, \dots, simplify=TRUE)

asNumericMatrix(x)

subsAttr(x)

matrix2dataFrame(x, at, restoreAll=TRUE)
}
\arguments{
\item{X}{
a vector or matrix capable of being operated on by the
function specified as the \code{FUN} argument
}
\item{by}{
one or more stratification variables.  If a single
variable, \code{by} may be a vector, otherwise it should be a list.
Using the Hmisc \code{llist} function instead of \code{list} will result
in individual variable names being accessible to \code{summarize}.  For
example, you can specify \code{llist(age.group,sex)} or
\code{llist(Age=age.group,sex)}.  The latter gives \code{age.group} a
new temporary name, \code{Age}. 
}
\item{FUN}{
a function of a single vector argument, used to create the statistical
summaries for \code{summarize}.  \code{FUN} may compute any number of
statistics. 
}
\item{simplify}{set to \code{FALSE} to suppress simplification of the
  result into an array, matrix, etc.}
\item{...}{extra arguments are passed to \code{FUN}}
\item{stat.name}{
the name to use when creating the main summary variable.  By default,
the name of the \code{X} argument is used.
}
\item{type}{
Specify \code{type="matrix"} to store the summary variables (if there are
more than one) in a matrix.
}
\item{subset}{
a logical vector or integer vector of subscripts used to specify the
subset of data to use in the analysis.  The default is to use all
observations in the data frame.
}
\item{INDEX}{
vector or list of vectors to cross-classify on, similar to \code{by}.
See \code{tapply}.}
\item{x}{
  a data frame (for \code{asNumericMatrix}) or a numeric matrix (for
  \code{matrix2dataFrame}).  For \code{subsAttr}, \code{x} may be a data
  frame, list, or a vector.
}
\item{at}{
  result of \code{subsAttr}
}
\item{restoreAll}{
  set to \code{FALSE} to only restore attributes \code{label},
  \code{units}, and \code{levels} instead of all attributes
}
}
\value{
For \code{summarize}, a data frame containing the \code{by} variables and the
statistical summaries (the first of which is named the same as the \code{X}
variable unless \code{stat.name} is given).  If \code{type="matrix"}, the
summaries are stored in a single variable in the data frame, and this
variable is a matrix.  For \code{mApply}, the returned value is a vector,
matrix, or list.  If \code{FUN} returns more than one number, the result
is an array if \code{simplify=TRUE} and is a list otherwise.  If a
matrix is returned, its rows correspond to unique combinations of
\code{INDEX}.  If \code{INDEX} is a list with more than one vector,
\code{FUN} returns more than one number, and \code{simplify=FALSE}, the
returned value is a list that is an array with the first dimension
corresponding to the last vector in \code{INDEX}, the second dimension
corresponding to the next to last vector in \code{INDEX}, etc., and the
elements of the list-array correspond to the values computed by
\code{FUN}.  In this situation the returned value is a regular array if
\code{simplify=TRUE}.   The order of dimensions is as previously but the
additional (last) dimension corresponds to values computed by
\code{FUN}.  \code{asNumericMatrix} returns a numeric matrix, and
\code{matrix2dataFrame} returns a data frame.  \code{subsAttr} returns a
list of attribute lists if its argument is a list or data frame, and
otherwise a list containing attributes of a single variable.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{label}}, \code{\link{cut2}}, \code{\link{llist}}, \code{\link{by}}
}
\examples{
\dontrun{
s <- summarize(ap>1, llist(size=cut2(sz, g=4), bone), mean,
               stat.name='Proportion')
dotplot(Proportion ~ size | bone, data=s)
}

set.seed(1)
temperature <- rnorm(300, 70, 10)
month <- sample(1:12, 300, TRUE)
year  <- sample(2000:2001, 300, TRUE)
g <- function(x)c(Mean=mean(x,na.rm=TRUE),Median=median(x,na.rm=TRUE))
summarize(temperature, month, g)
mApply(temperature, month, g)

mApply(temperature, month, mean, na.rm=TRUE)
w <- summarize(temperature, month, mean, na.rm=TRUE)
if(.R.) library(lattice)
xyplot(temperature ~ month, data=w) # plot mean temperature by month

w <- summarize(temperature, llist(year,month), 
               quantile, probs=c(.5,.25,.75), na.rm=TRUE, type='matrix')
xYplot(Cbind(temperature[,1],temperature[,-1]) ~ month | year, data=w)
mApply(temperature, llist(year,month),
       quantile, probs=c(.5,.25,.75), na.rm=TRUE)

# Compute the median and outer quartiles.  The outer quartiles are
# displayed using "error bars"
set.seed(111)
dfr <- expand.grid(month=1:12, year=c(1997,1998), reps=1:100)
attach(dfr)
y <- abs(month-6.5) + 2*runif(length(month)) + year-1997
s <- summarize(y, llist(month,year), smedian.hilow, conf.int=.5)
s
mApply(y, llist(month,year), smedian.hilow, conf.int=.5)

xYplot(Cbind(y,Lower,Upper) ~ month, groups=year, data=s, 
       keys='lines', method='alt')
# Can also do:
s <- summarize(y, llist(month,year), quantile, probs=c(.5,.25,.75),
               stat.name=c('y','Q1','Q3'))
xYplot(Cbind(y, Q1, Q3) ~ month, groups=year, data=s, keys='lines')
# To display means and bootstrapped nonparametric confidence intervals
# use for example:
s <- summarize(y, llist(month,year), smean.cl.boot)
xYplot(Cbind(y, Lower, Upper) ~ month | year, data=s)

# For each subject use the trapezoidal rule to compute the area under
# the (time,response) curve using the Hmisc trap.rule function
x <- cbind(time=c(1,2,4,7, 1,3,5,10),response=c(1,3,2,4, 1,3,2,4))
subject <- c(rep(1,4),rep(2,4))
trap.rule(x[1:4,1],x[1:4,2])
summarize(x, subject, function(y) trap.rule(y[,1],y[,2]))

\dontrun{
# Another approach would be to properly re-shape the mm array below
# This assumes no missing cells.  There are many other approaches.
# mApply will do this well while allowing for missing cells.
m <- tapply(y, list(year,month), quantile, probs=c(.25,.5,.75))
mm <- array(unlist(m), dim=c(3,2,12), 
            dimnames=list(c('lower','median','upper'),c('1997','1998'),
                          as.character(1:12)))
# aggregate will help but it only allows you to compute one quantile
# at a time; see also the Hmisc mApply function
dframe <- aggregate(y, list(Year=year,Month=month), quantile, probs=.5)

# Compute expected life length by race assuming an exponential
# distribution - can also use summarize
g <- function(y) { # computations for one race group
  futime <- y[,1]; event <- y[,2]
  sum(futime)/sum(event)  # assume event=1 for death, 0=alive
}
mApply(cbind(followup.time, death), race, g)

# To run mApply on a data frame:
m <- mApply(asNumericMatrix(x), race, h)
# Here assume h is a function that returns a matrix similar to x
at <- subsAttr(x)  # get original attributes and storage modes
matrix2dataFrame(m, at)


# Get stratified weighted means
g <- function(y) wtd.mean(y[,1],y[,2])
summarize(cbind(y, wts), llist(sex,race), g, stat.name='y')
mApply(cbind(y,wts), llist(sex,race), g)

# Compare speed of mApply vs. by for computing stratified medians
d <- data.frame(sex=sample(c('female','male'),100000,TRUE),
                country=sample(letters,100000,TRUE),
                y1=runif(100000), y2=runif(100000))
g <- function(x) {
  y <- c(median(x[,'y1']-x[,'y2']),
         med.sum =median(x[,'y1']+x[,'y2']))
  names(y) <- c('med.diff','med.sum')
  y
}

system.time(by(d, llist(sex=d$sex,country=d$country), g))
system.time({
             x <- asNumericMatrix(d)
             a <- subsAttr(d)
             m <- mApply(x, llist(sex=d$sex,country=d$country), g)
            })
system.time({
             x <- asNumericMatrix(d)
             summarize(x, llist(sex=d$sex, country=d$country), g)
            })

# An example where each subject has one record per diagnosis but sex of
# subject is duplicated for all the rows a subject has.  Get the cross-
# classified frequencies of diagnosis (dx) by sex and plot the results
# with a dot plot

count <- rep(1,length(dx))
d <- summarize(count, llist(dx,sex), sum)
Dotplot(dx ~ count | sex, data=d)
}
detach('dfr')
}
\keyword{category}
\keyword{manip}
\concept{grouping}
\concept{stratification}
\concept{aggregation}
\concept{cross-classification}


\eof
\name{summary.formula}
\alias{summary.formula}
\alias{stratify}
\alias{print.summary.formula.response}
\alias{plot.summary.formula.response}
\alias{latex.summary.formula.response}
\alias{print.summary.formula.reverse}
\alias{plot.summary.formula.reverse}
\alias{latex.summary.formula.reverse}
\alias{[.summary.formula.response}
\alias{print.summary.formula.cross}
\alias{latex.summary.formula.cross}
\alias{formula.summary.formula.cross}
\alias{na.retain}
\alias{cumcategory}
\alias{mChoice}
\alias{as.character.mChoice}
\title{
Summarize Data for Making Tables and Plots
}
\description{
\code{summary.formula} summarizes the variables listed in an S-Plus formula,
computing descriptive statistics (including ones in a
user-specified function).  The summary statistics may be passed to
\code{print} methods, \code{plot} methods for making annotated dot charts, and
\code{latex} methods for typesetting tables using LaTeX. 
\code{summary.formula} has three methods for computing descriptive
statistics on univariate or multivariate responses, subsetted by
categories of other variables.  The method of summarization is
specified in the parameter \code{method} (see details below).  For the
\code{response} and \code{cross} methods, the statistics used to
summarize the data 
may be specified in a very flexible way (e.g., the geometric mean,
33rd percentile, Kaplan-Meier 2-year survival estimate, mixtures of
several statistics).  The default summary statistic for these methods
is the mean (the proportion of positive responses for a binary
response variable).  The \code{cross} method is useful for creating data
frames which contain summary statistics that are passed to \code{trellis}
as raw data (to make multi-panel dot charts, for example).  The
\code{print} methods use the \code{print.char.matrix} function to print boxed
tables, if it is available (it is included in S-Plus versions 3.2 and
later).


For \code{method="response"} and \code{method="reverse"} the right hand side of
\code{formula} may contain "multiple choice" variables.  These are denoted
by matrices whose elements are logical (\code{FALSE},\code{TRUE}) values, 0/1 values,
or character strings in which values of \code{"present"} or \code{"yes"} (case
is ignored) denote positive and anything else denotes a negative
answer.  The columns of such matrices correspond to basic categories
(e.g., symptoms), and the matrices are often created by applying the
\code{mChoice} function to a series of factor or character vectors.  See
the first example.  When \code{test=TRUE} each choice is tested separately as
a binary categorical response.


The \code{plot} method for \code{method="reverse"} creates a temporary
function \code{Key} in frame 0 as is done by the \code{xYplot} and
\code{ecdf.formula} functions.  After \code{plot} runs, you can type
\code{Key()} to put a legend in a default location, or
e.g. \code{Key(locator(1))} to draw a legend where you click the left
mouse button.  This key is for categorical variables, so to have the
opportunity to put the key on the graph you will probably want to use
the command \code{plot(object, which="categorical")} [Note however that
in Windows S-Plus you can switch back and forth between multiple pages
on a graph sheet, and issue a \code{Key()} or \code{Key2()} command
according to which graph sheet page is active.].  A second function
\code{Key2} is created if continuous variables are being plotted.  It is
used the same as \code{Key}.  If the \code{which} argument is not
specified to \code{plot}, two pages of plots will be produced.  If you
don't define \code{par(mfrow=)} yourself,
\code{plot.summary.formula.reverse} will try to lay out a multi-panel
graph to best fit all the individual dot charts for continuous
variables.

There is a subscripting method for objects created with
\code{method="response"}. 
This can be used to print or plot selected variables or summary statistics
where there would otherwise be too many on one page.

\code{cumcategory} is a utility function useful when summarizing an ordinal
response variable.  It converts such a variable having \code{k} levels to a
matrix with \code{k-1} columns, where column \code{i} is a vector of zeros and
ones indicating that the categorical response is in level \code{i+1} or
greater.  When the left hand side of \code{formula} is \code{cumcategory(y)},
the default \code{fun} will summarize it by computing all of the relevant
cumulative proportions.

\code{mChoice} is a function that is useful for defining a group of
variables on the right side of the formula.  The variables can represent
individual choices on a multiple choice question.  These choices are
typically factor or character values but may be of any type.  Levels
of component factor variables need not be the same; all unique levels
(or unique character values) are collected over all of the multiple
variables.  Then a new matrix is formed that has one column per unique
value of all of these variables.  For each column, the row values are
logical \code{TRUE} if any of the component choice variables equals the
level for the new matrix's current column, and \code{FALSE} otherwise.  By default, NAs in the
choice variables are ignored.  Set \code{na.result=TRUE} to set results to NA
for a row and column where at least one of the choice
variables is NA but none of them equals the current column category.
When a matrix like one created by \code{mChoice} appears in a formula
processed by \code{summary.formula} you can easily obtain descriptive
statistics for categories where subjects can be in more than one
category.

\code{as.character.mChoice} will convert a matrix representing an
\code{mChoice} object into a character vector by concatenating the
categories present per observation.  This makes \code{summarize} work
when stratifying by \code{mChoice} variables.
}
\synopsis{
\method{summary}{formula}(formula, data, subset, na.action, 
           fun=NULL,
           method=c('response','reverse','cross'),
           overall=method=='response'|method=='cross', 
           continuous=10, na.rm=method=='reverse', g=4, 
           quant=c(.025,.05,.125,.25,.375,.5,.625,.75,.875,.95,.975),
           nmin=0, test=FALSE,
           conTest=function(group,x) {
             st <- spearman2(group,x)
             list(P=st['P'], stat=st['F'],
                  df=st[c('df1','df2')],
                  testname=if(st['df1']==1)'Wilcoxon' else
                  'Kruskal-Wallis',
                  statname='F', latexstat='F_{df}',
                  plotmathstat='F[df]')
           },
           catTest=function(tab) {
             st <- if(!is.matrix(tab) || nrow(tab) < 2)
               list(p.value=NA, statistic=NA, parameter=NA) else
             chisq.test(tab, correct=FALSE)
             list(P=st$p.value, stat=st$statistic,
                  df=st$parameter,
                  testname='Pearson', statname='Chi-square',
                  latexstat='\\chi^{2}_{df}',
                  plotmathstat='chi[df]^2')
           }, \dots)
}
\usage{
\method{summary}{formula}(formula, data, subset, na.action, fun, method='response',
              overall=TRUE, continuous=10, na.rm=FALSE, g=4, nmin=0, \dots)
\method{print}{summary.formula.response}(x, vnames=c('labels','names'), prUnits=TRUE,
      abbreviate.dimnames=FALSE,
      prefix.width, min.colwidth, formatArgs, \dots)
\method{plot}{summary.formula.response}(x, which=1, vnames=c('labels','names'), xlim, xlab, 
     pch=c(16,1,2,17,15,3,4,5,0), superposeStrata=TRUE,
     dotfont=1, add=FALSE, main, subtitles=TRUE, xaxis=TRUE,
     \dots)
\method{latex}{summary.formula.response}(object, title=first.word(expr=substitute(object)),
      caption, trios, vnames=c('labels','names'), prUnits=TRUE,
      rowlabel='', cdec=2, ncaption=FALSE,
      \dots)
x[i,j]


\method{summary}{formula}(formula, data, subset, na.action, method='reverse',
              overall=FALSE, continuous=10, na.rm=TRUE,
              quant=c(0.025, 0.05, 0.125, 0.25, 0.375, 0.5,
                      0.625, 0.75, 0.875, 0.95, 0.975),
              test=FALSE,
              conTest=function(group,x) {
                 st <- spearman2(group,x)
                 list(P=st['P'], stat=st['F'], df=st[c('df1','df2')],
                      testname=if(st['df1']==1)'Wilcoxon' else
                                            'Kruskal-Wallis',
                      statname='F', latexstat='F_{df}', plotmathstat='F[df]')
                 },
              catTest=function(tab) {
                 st <- if(!is.matrix(tab) || nrow(tab) < 2)
                   list(p.value=NA, statistic=NA, parameter=NA) else
                   chisq.test(tab, correct=FALSE)
                 list(P=st$p.value, stat=st$statistic,
                      df=st$parameter,
                      testname='Pearson', statname='Chi-square',
                      latexstat='\\chi^{2}_{df}', plotmathstat='chi[df]^2')
              })

\method{print}{summary.formula.reverse}(x, digits, prn=!all(n==N), pctdig=0,
      npct=c('numerator','both','denominator','none'),
      exclude1=TRUE, vnames=c('labels','names'), prUnits=TRUE, sep='/',
      abbreviate.dimnames=FALSE, prefix.width=max(nchar(lab)),
      min.colwidth, formatArgs,  prtest=c('P','stat','df','name'),
      prmsd=FALSE, long=FALSE, pdig=3, eps=.001, \dots)

\method{plot}{summary.formula.reverse}(x, vnames=c('labels','names'), what=c('proportion','\%'),
  which=c('both','categorical','continuous'),
  xlim=if(what=='proportion') c(0,1) else c(0,100), 
  xlab=if(what=='proportion')'Proportion' else 'Percentage', 
  pch=c(16,1,2,17,15,3,4,5,0), exclude1=TRUE,
  dotfont=1, main, subtitles=TRUE,
  prtest=c('P','stat','df','name'), pdig=3, eps=.001,
  conType=c('dot','bp'), cex.means=.5, \dots)

\method{latex}{summary.formula.reverse}(object,
     title=first.word(expr=substitute(object)), digits,
     prn=!all(n==N), pctdig=0,
     npct=c('numerator','both','denominator','none'),
     npct.size='scriptsize', Nsize='scriptsize',
     exclude1=TRUE, vnames=c('labels','names'),
     middle.bold=FALSE, outer.size='scriptsize',
     caption, rowlabel='', insert.bottom=TRUE, dcolumn=FALSE,
     prtest=c('P','stat','df','name'),
     prmsd=FALSE, msdsize=NULL, long=FALSE, pdig=3, eps=.001, \dots)


\method{summary}{formula}(formula, data, subset, na.action, fun, method='cross',
              overall=TRUE, continuous=10, g=4, \dots)
\method{print}{summary.formula.cross}(x, twoway=nvar==2, prnmiss=any(x$Missing>0), prn=TRUE,
      abbreviate.dimnames=FALSE, prefix.width=max(nchar(v)), 
      min.colwidth, formatArgs, \dots) 
\method{latex}{summary.formula.cross}(object,
      title=first.word(expr=substitute(object)),
      twoway=nvar==2, prnmiss=TRUE, prn=TRUE,
      caption=attr(object,'heading'),
      vnames=c('labels','names'), rowlabel='', \dots)


stratify(\dots, na.group=FALSE, shortlabel=TRUE)

\method{formula}{summary.formula.cross}(x, ...)

cumcategory(y)


mChoice(\dots, label='', 
        sort.levels=c('original','alphabetic'), 
        add.none=TRUE, none.name='none', na.result=FALSE, drop=TRUE)

\method{as.character}{mChoice}(x)
}
\arguments{
\item{formula}{
An S formula with additive effects.  For \code{method="response"} or
\code{"cross"}, the dependent variable has the usual connotation.  For
\code{method="reverse"}, the dependent variable is what is usually thought
of as an independent variable, and it is one that is used to stratify
all of the right hand side variables.  For \code{method="response"}
(only), the \code{formula} may contain one or more invocations of the
\code{stratify} function whose arguments are defined below.  This causes
the entire analysis to be stratified by cross-classifications of the
combined list of stratification factors.  This stratification will be
reflected as major column groupings in the resulting table, or as more
response columns for plotting.  If \code{formula} has no dependent variable
\code{method="reverse"} is the only legal value and so \code{method} defaults to
\code{"reverse"} in this case.
}
\item{x}{an object created by \code{summary.formula} or \code{mChoice}}
\item{y}{
a numeric, character, category, or factor vector for \code{cumcategory}.
Is converted to a categorical variable if needed.
}
\item{data}{
name or number of a data frame.  Default is the current frame.
}
\item{subset}{
a logical vector or integer vector of subscripts used to specify the
subset of data to use in the analysis.  The default is to use all
observations in the data frame.
}
\item{na.action}{
function for handling missing data in the input data.  The default is
a function defined here called \code{na.retain}, which keeps all
observations for processing, with missing variables or not.
}
\item{fun}{
function for summarizing data in each cell.  Default is to take the
mean of each column of the possibly multivariate response variable.
You can specify \code{fun="\%"} to compute percentages (100 times the mean of a 
series of logical or binary variables).
User-specified functions can also return a matrix.  For example, you might 
compute quartiles on a bivariate response.
}
\item{method}{
The default is \code{"response"}, in which case the response variable may
be multivariate and any number of statistics may be used to summarize
them.  Here the responses are summarized separately for each of any
number of independent variables.  Continuous independent variables
(see the \code{continuous} parameter below) are automatically stratified
into \code{g} (see below) quantile groups (if you want to control the
discretization for selected variables, use the \code{cut2} function on them).  
Otherwise, the data are
subsetted by all levels of discrete right hand side variables.  For
multivariate responses, subjects are considered to be missing if any
of the columns is missing.  


The \code{method="reverse"} option is
typically used to make baseline characteristic tables, for example.
The single left hand side variable must be categorical (e.g.,
treatment), and the right hand side variables are broken down one at a
time by the "dependent" variable.  Continuous variables are described
by three quantiles (quartiles by default) along with 
outer quantiles (used only for scaling x-axes when plotting quartiles;
all are used when plotting box-percentile plots), and
categorical ones are
described by counts and percentages.  If there is no left hand side
variable, \code{summary} assumes that there is only one group in the data,
so that only one column of summaries will appear.
If there is no dependent variable in \code{formula}, \code{method} defaults to
\code{"reverse"} automatically.


The \code{method="cross"} option allows for a multivariate dependent
variable and for up to three independents.  Continuous independent
variables (those with at least \code{continuous} unique values) are
automatically divided into \code{g} quantile groups.
The independents are cross-classified, and marginal statistics may optionally be computed.
The output of \code{summary.formula} in this case is a data frame
containing the independent variable combinations (with levels of
\code{"All"} corresponding to marginals) and the corresponding summary
statistics in the matrix \code{S}.  The output data frame is suitable for
direct use in \code{trellis}.  The \code{print} and \code{latex} typesetting methods for this
method allow for a special two-way format if there are two right
hand side variables.
}
\item{overall}{
For \code{method="reverse"}, setting \code{overall=TRUE} makes a new column with
overall statistics for the whole sample.  For \code{method="cross"},
\code{overall=TRUE} (the default) results in all marginal statistics being
computed.  For \code{trellis} displays (usually multi-panel dot plots), 
these marginals just form other categories.  For \code{"response"}, the
default is \code{overall=TRUE}, causing a final row of global summary
statistics to appear in tables and dot charts.  If \code{test=TRUE} these
marginal statistics are ignored in doing statistical tests.
}
\item{continuous}{
specifies the threshold for when a variable is considered to be
continuous (when there are at least \code{continuous} unique values).
\code{factor} variables are always considered to be categorical no matter
how many levels they have.
}
\item{na.rm}{
for \code{method="response"}, set \code{na.rm=TRUE} to exclude missing values from
being counted as their own category when subsetting the response(s)
by levels of a categorical variable.  For \code{method="reverse"} set
\code{na.rm=FALSE} to keep missing values of categorical variables from
being excluded from the table.  \code{na.rm} also applies to summary
statistic functions such as \code{smean.cl.normal}.  For these \code{na.rm}
defaults to \code{TRUE} unlike built-in S-Plus functions.
}
\item{g}{
number of quantile groups to use when variables are automatically
categorized with \code{method="response"} or \code{"cross"} using \code{cut2}
}
\item{nmin}{
if fewer than \code{nmin} observations exist in a category for \code{"response"}
(over all strata combined), that category will be ignored
}
\item{test}{
applies if \code{method="reverse"}.  Set to \code{TRUE} to compute test
statistics using tests specified in \code{conTest} and \code{catTest}.
}
\item{conTest}{
a function of two arguments (grouping variable and a continuous
variable) that returns a list with components \code{P} (the computed
P-value), \code{stat} (the test statistic, either chi-square or F),
\code{df} (degrees of freedom), \code{testname} (test name), \code{statname}
(statistic name), an optional component \code{latexstat} (LaTeX
representation of \code{statname}), an optional component
\code{plotmathstat} (for R - the \code{plotmath} representation of
\code{statname}, as a character string),  and an
optional component \code{note} 
that contains a character string note about the test (e.g., \code{"test not
done because n < 5"}).  \code{conTest} is applied to continuous variables
on the right-hand-side of the formula when \code{method="reverse"}.  The
default uses the \code{spearman2} function to run the Wilcoxon or
Kruskal-Wallis test using the F distribution.
}
\item{catTest}{
a function of a frequency table (an integer matrix) that returns a
list with the same components as created by \code{conTest}.  By default,
the Pearson chi-square test is done, without continuity correction
(the continuity correction would make the test conservative like the
Fisher exact test).
}
\item{...}{
for \code{summary.formula} these are optional
arguments for \code{cut2} when variables are automatically categorized.
For \code{plot} methods these arguments are passed to \code{dotchart2}.
For \code{Key} and \code{Key2} these arguments are passed to \code{key},
\code{text}, or \code{mtitle}.  For \code{print} methods these are
optional arguments to \code{print.char.matrix}. For \code{latex} methods
these are passed to \code{latex.default}.  One of the most important of
these is \code{file}.  Specifying \code{file=""} will cause LaTeX code
to just be printed to standard output rather than be stored in a
permanent file.
For \code{mChoice} these are a series of vectors, and for
\code{stratify} they are one or more stratification variables
(separated by commas), which will be cross-classified.
}
\item{object}{an object created by \code{summary.formula}}
\item{quant}{
vector of quantiles to use for summarizing data with
\code{method="reverse"}.  This must be numbers between 0 and 1
inclusive and must include the numbers 0.5, 0.25, and 0.75 which are
used for printing and for plotting 
quantile intervals.  The outer quantiles are used for scaling the x-axes
for such plots.  Specify outer quantiles as \code{0} and \code{1} to
scale the x-axes using the whole observed data ranges instead of the
default (a 0.95 quantile interval).  Box-percentile plots are drawn
using all but the outer quantiles.
}
\item{vnames}{
By default, tables and plots are usually labeled with variable labels
(see the \code{label} and \code{sas.get} functions).  To use the shorter
variable names, specify \code{vnames="names"}.
}
\item{pch}{
  vector of plotting characters to represent different groups, in order
  of group levels.  For \code{method="response"} the characters
  correspond to levels of the \code{stratify} variable if
  \code{superposeStrata=TRUE}, and if no 
  \code{strata} are used or if \code{superposeStrata=FALSE}, the 
  \code{pch} vector corresponds to the \code{which} argument for
  \code{method="response"}.
}
\item{superposeStrata}{
  If \code{stratify} was used, set \code{superposeStrata=FALSE} to make
  separate dot charts for each level of the \code{stratification}
  variable, for \code{method='response'}.  The default is to
  superpose all strata on one dot chart.
}
\item{dotfont}{font for plotting points}
\item{xaxis}{set to \code{FALSE} to suppress drawing x-axis in
  \code{plot.summary.formula.response}}

\item{abbreviate.dimnames}{see \code{print.char.matrix}}
\item{prefix.width}{see \code{print.char.matrix}}
\item{min.colwidth}{
minimum column width to use for boxes printed with \code{print.char.matrix}.
The default is the maximum of the minimum column label length and the minimum
length of entries in the data cells.
}
\item{formatArgs}{
a list containing other arguments to pass to \code{format.default} such as
\code{scientific}, e.g., \code{formatArgs=list(scientific=c(-5,5))}.  For
\code{print.summary.formula.reverse}, \code{formatArgs} applies only to
statistics computed on continuous variables, not to percents,
numerators, and denominators.
}
\item{digits}{
number of significant digits to print.  Default is to use the current
value of the \code{digits} system option.
}
\item{prn}{
set to \code{TRUE} to print the number of non-missing observations on the
current (row) variable.  The default is to print these only if any of
the counts of non-missing values differs from the total number of
non-missing values of the left-hand-side variable.
For \code{method="cross"} the default is to always print \code{N}.
}
\item{prnmiss}{
set to \code{FALSE} to suppress printing counts of missing values for \code{"cross"}
}
\item{pctdig}{
number of digits to the right of the decimal place for printing
percentages. The default is zero, so percents will be rounded to the
nearest percent.
}
\item{npct}{
specifies which counts are to be printed to the right of percentages.
The default is to print the frequency (numerator of the percent) in
parentheses.  You can specify \code{"both"} to print both numerator and
denominator, \code{"denominator"}, or \code{"none"}.
}
\item{npct.size}{
the size for typesetting \code{npct} information which appears after percents.
The default is \code{"scriptsize"}.
}
\item{Nsize}{
  When a second row of column headings is added showing sample sizes,
  \code{Nsize} specifies the LaTeX size for these subheadings.  Default
  is \code{"scriptsize"}.
  }
\item{exclude1}{
by default, \code{method="reverse"} objects will be printed, plotted,  or typeset by
removing redundant entries from percentage tables for categorical
variables.  For example, if you print the percent of females, you
don't need to print the percent of males.  To override this, set \code{exclude1=FALSE}.
}
\item{prUnits}{
  set to \code{FALSE} to suppress printing or latexing \code{units}
  attributes of variables, when \code{method='reverse'} or \code{'response'}
  }
\item{sep}{
character to use to separate quantiles when printing
\code{method="reverse"} tables
}
\item{prtest}{
a vector of test statistic components to print if \code{test=TRUE} was in
effect when \code{summary.formula} was called.  Defaults to printing all
components.  Specify \code{prtest=FALSE} or \code{prtest="none"} to not
print any tests.  This applies to \code{print}, \code{latex}, and
\code{plot} methods for \code{method='reverse'}.
}
\item{prmsd}{
  set to \code{TRUE} to print mean and SD after the three quantiles, for
  continuous variables with \code{method="reverse"}
}
\item{msdsize}{
  defaults to \code{NULL} to use the current font size for the mean and
  standard deviation if \code{prmsd} is \code{TRUE}.  Set to a character
  string to specify an alternate LaTeX font size.
  }
\item{long}{
  set to \code{TRUE} to print the results for the first category on its own
  line, not on the same line with the variable label (for
  \code{method="reverse"} with \code{print} and \code{latex} methods)
}
\item{pdig}{
  number of digits to the right of the decimal place for printing
  P-values.  Default is \code{3}.  This is passed to \code{format.pval}.
}
\item{eps}{
  P-values less than \code{eps} will be printed as \code{< eps}.  See
  \code{format.pval}.
  }
\item{what}{
for \code{method="reverse"} specifies whether proportions or percentages
are to be plotted
}
\item{twoway}{
for \code{method="cross"} with two right hand side variables, \code{twoway}
controls whether the resulting table will be printed in enumeration
format or as a two-way table (the default)
}
\item{which}{
For \code{method="response"} specifies the sequential number or a vector of
subscripts of response variables to plot.  If you had any \code{stratify}
variables, these are counted as if multiple response variables were
analyzed.  For \code{method="reverse"} specifies whether to plot results
for categorical variables, continuous variables, or both (the default).
}
\item{conType}{
  For plotting \code{method="reverse"} plots for continuous variables,
  dot plots showing quartiles are drawn by default.  Specify
  \code{conType='bp'} to draw box-percentile plots using all the
  quantiles in \code{quant} except the outermost ones.  Means are drawn
  with a solid dot and vertical reference lines are placed at the three
  quartiles.}
\item{cex.means}{
character size for means in box-percentile plots; default is .5}
\item{xlim}{
vector of length two specifying x-axis limits.  For
\code{method="reverse"}, this is only used for plotting categorical
variables.  Limits for continuous variables are determined by the
outer quantiles specified in \code{quant}.
}
\item{xlab}{
x-axis label
}
\item{add}{
set to \code{TRUE} to add to an existing plot
}
\item{main}{
a main title.  For \code{method="reverse"} this applies only to the plot
for categorical variables.
}
\item{subtitles}{
set to \code{FALSE} to suppress automatic subtitles
}
\item{label}{
a character string \code{label} attribute to attach to the matrix created
by \code{mChoice}
}
\item{sort.levels}{
set \code{sort.levels="alphabetic"} to sort the columns of the matrix
created by \code{mChoice} alphabetically by category rather than by the
original order of levels in component factor variables (if there were
any input variables that were factors)
}
\item{add.none}{
set to \code{FALSE} to keep \code{mChoice} from adding a final column to the
matrix named \code{none.name}.  The logical values in this column are
set to \code{TRUE} when none of the defined choices apply for the
observation and \code{FALSE} otherwise. If every observation used at least
one of the choices, the "none" column is not added regardless of the
value of \code{add.none}. 
}
\item{none.name}{
a character string defining the name of the column added if
\code{add.none=TRUE} and some observations did not select any choices.  The
default column name is \code{none.name="none"}.
}
\item{na.result}{
set to \code{TRUE} to set elements of columns of the matrix computed by
\code{mChoice} to \code{NA} when no input variable values equalled the current
category and at least one of them was NA
}
\item{drop}{
set \code{drop=FALSE} to keep unused factor levels as columns of the matrix
produced by \code{mChoice}
}
\item{caption}{
character string containing LaTeX table captions.
}
\item{title}{
name of resulting LaTeX file omitting the \code{.tex} suffix.  Default
is the name of the \code{summary} object.  If \code{caption} is specified,
\code{title} is also used for the table's symbolic reference label. 
}
\item{trios}{
If for \code{method="response"} you summarized the response(s) by using
three quantiles, specify \code{trios=TRUE} or \code{trios=v} to group each set of
three statistics into one column for \code{latex} output, using the format
a B c, where the outer quantiles are in smaller font
(\code{scriptsize}).  For \code{trios=TRUE}, the overall column names are taken
from the column names of the original data matrix.  To give new
column names, specify \code{trios=v}, where \code{v} is a vector of column
names, of length \code{m/3}, where \code{m} is the original number of columns
of summary statistics.
}
\item{rowlabel}{
see \code{latex.default} (under the help file \code{latex})
}
\item{cdec}{
number of decimal places to the right of the decimal point for
\code{latex}.  This value should be a scalar (which will be properly
replicated), or a vector with length equal to the number of columns
in the table.  For \code{"response"} tables, this length does not count
the column for \code{N}.
}
\item{ncaption}{
  set to \code{FALSE} to not have \code{latex.summary.formula.response}
  put sample sizes in captions
  }
\item{i}{
a vector of integers, or character strings containing variable names
to subset on.  Note that each row subsetted on in a \code{summary.formula.reverse}
object subsets on all the levels that make up the corresponding variable
(automatically).
}
\item{j}{
a vector of integers representing column numbers
}
\item{middle.bold}{
set to \code{TRUE} to have LaTeX use bold face for the middle quantile for
\code{method="reverse"} 
}
\item{outer.size}{
the font size for outer quantiles for \code{"reverse"} tables
}
\item{insert.bottom}{
  set to \code{FALSE} to suppress inclusion of definitions placed at the
  bottom of LaTeX tables for \code{method="reverse"}
  }
\item{dcolumn}{
  see \code{latex}
  }
\item{na.group}{
set to \code{TRUE} to have missing stratification variables given their own
category (\code{NA})
}
\item{shortlabel}{
set to \code{FALSE} to include stratification variable names and equal signs
in labels for strata levels
}
}
\value{
\code{summary.formula} returns a data frame or list depending on
\code{method}.  \code{plot.summary.formula.reverse} returns the number
of pages of plots that were made.
}
\section{Side Effects}{
\code{plot.summary.formula.reverse} creates functions \code{Key} and
\code{Key2} in frame 0 that will draw legends.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\references{
Harrell FE (1999): Statistical tables and plots using S-Plus and LaTeX.
Document available from hesweb1.med.virginia.edu/biostat/s.
}
\seealso{
\code{\link{smean.sd}}, \code{\link{summarize}},
\code{\link{label}}, \code{\link{strata}}, \code{\link{dotchart2}},
\code{\link{print.char.matrix}}, \code{\link{update}},
\code{\link{formula}}, \code{\link{cut2}}, \code{\link{llist}},
\code{\link{format.default}}, \code{\link{latex}},
\code{\link{latexTranslate}}, \code{\link{bpplt}}
}
\examples{
options(digits=3)
set.seed(173)
sex <- factor(sample(c("m","f"), 500, rep=TRUE))
age <- rnorm(500, 50, 5)
treatment <- factor(sample(c("Drug","Placebo"), 500, rep=TRUE))


# Generate a 3-choice variable; each of 3 variables has 5 possible levels
symp <- c('Headache','Stomach Ache','Hangnail',
          'Muscle Ache','Depressed')
symptom1 <- sample(symp, 500,TRUE)
symptom2 <- sample(symp, 500,TRUE)
symptom3 <- sample(symp, 500,TRUE)
Symptoms <- mChoice(symptom1, symptom2, symptom3, label='Primary Symptoms')
table(as.character(Symptoms))

# Note: In this example, some subjects have the same symptom checked
# multiple times; in practice these redundant selections would be NAs
# mChoice will ignore these redundant selections
# If the multiple choices to a single survey question were already
# stored as a series of T/F yes/no present/absent questions we could do:
# Symptoms <- cbind(headache,stomach.ache,hangnail,muscle.ache,depressed)
# where the 5 input variables are all of the same type: 0/1,logical,char.
# These variables cannot be factors in this case as cbind would
# store integer codes instead of character strings.
# To give better column names can use 
# cbind(Headache=headache, 'Stomach Ache'=stomach.ache, \dots)


# Following 8 commands only for checking mChoice
data.frame(symptom1,symptom2,symptom3)[1:10,]
Symptoms[1:10,]  # Print first 10 subjects' new binary indicators


meanage <- if(.R.)double(5) else single(5)
for(j in 1:5) meanage[j] <- mean(age[Symptoms[,j]])
names(meanage) <- dimnames(Symptoms)[[2]]
meanage


# Manually compute mean age for 2 symptoms
mean(age[symptom1=='Headache' | symptom2=='Headache' | symptom3=='Headache'])
mean(age[symptom1=='Hangnail' | symptom2=='Hangnail' | symptom3=='Hangnail'])


#Frequency table sex*treatment, sex*Symptoms
summary(sex ~ treatment + Symptoms, fun=table)
# could also do summary(sex ~ treatment + mChoice(symptom1,\dots),\dots)


#Compute mean age, separately by 3 variables
summary(age ~ sex + treatment + Symptoms)


summary(age ~ sex + treatment, method="cross")
# Note: method="cross" will not allow mChoice variables


f <- summary(treatment ~ age + sex + Symptoms, method="reverse", test=TRUE)
f
# trio of numbers represent 25th, 50th, 75th percentile
print(f, long=TRUE)
plot(f)
plot(f, conType='bp', prtest='P')
bpplt()    # annotated example showing layout of bp plot

#Compute predicted probability from a logistic regression model
#For different stratifications compute receiver operating
#characteristic curve areas (C-indexes)
predicted <- plogis(.4*(sex=="m")+.15*(age-50))
positive.diagnosis <- ifelse(runif(500)<=predicted, 1, 0)
# ROC area (C-index) for a two-column matrix z, used below with
# cbind(predicted, positive.diagnosis): column 1 holds the predictions,
# column 2 the 0/1 outcome.  Returns c(ROC=NA) when fewer than 2 rows.
roc <- function(z) {
  pred    <- z[, 1]
  outcome <- z[, 2]
  nobs    <- length(pred)
  if (nobs < 2) return(c(ROC = NA))
  npos    <- sum(outcome == 1)
  # mean rank of the positives, shifted by its minimum possible value
  # and scaled by the number of non-positives
  c(ROC = (mean(rank(pred)[outcome == 1]) - (npos + 1) / 2) / (nobs - npos))
}
y <- cbind(predicted, positive.diagnosis)
options(digits=2)
summary(y ~ age + sex, fun=roc)


options(digits=3)
summary(y ~ age + sex, fun=roc, method="cross")


#Plot estimated mean life length (assuming an exponential distribution) 
#separately by levels of 4 other variables.  Repeat the analysis
#by levels of a stratification variable, drug.  Automatically break
#continuous variables into tertiles.
#We are using the default, method='response'
\dontrun{
life.expect <- function(y) c(Years=sum(y[,1])/sum(y[,2]))
attach(pbc)
S <- Surv(follow.up.time, death)
s2 <- summary(S ~ age + albumin + ascites + edema + stratify(drug),
                         fun=life.expect, g=3)


#Note: You can summarize other response variables using the same 
#independent variables using e.g. update(s2, response~.), or you 
#can change the list of independent variables using e.g. 
#update(s2, response ~.- ascites) or update(s2, .~.-ascites)
#You can also print, typeset, or plot subsets of s2, e.g.
#plot(s2[c('age','albumin'),]) or plot(s2[1:2,])


s2    # invokes print.summary.formula.response


#Plot results as a separate dot chart for each of the 3 strata levels
par(mfrow=c(2,2))
plot(s2, cex.labels=.6, xlim=c(0,40), superposeStrata=FALSE)


#Typeset table, creating s2.tex
w <- latex(s2, cdec=1)
#Typeset table but just print LaTeX code
latex(s2, file="")    # useful for Sweave


#Take control of groups used for age.  Compute 3 quartiles for
#both cholesterol and bilirubin (excluding observations that are missing
#on EITHER ONE)


age.groups <- cut2(age, c(45,60))
g <- function(y) apply(y, 2, quantile, c(.25,.5,.75))
y <- cbind(Chol=chol,Bili=bili)
label(y) <- 'Cholesterol and Bilirubin'
#You can give new column names that are not legal S-Plus names
#by enclosing them in quotes, e.g. 'Chol (mg/dl)'=chol


# Summarize both columns of y (3 quartiles each, via g) by age group and
# ascites.  The result is stored as s3 because the plot() and latex()
# calls below refer to s3.
s3 <- summary(y ~ age.groups + ascites, fun=g)


par(mfrow=c(1,2), oma=c(3,0,3,0))   # allow outer margins for overall
for(ivar in 1:2) {                  # title 
  isub <- (1:3)+(ivar-1)*3          # *3=number of quantiles/var.
  plot(s3, which=isub, main='', 
       xlab=c('Cholesterol','Bilirubin')[ivar],
       pch=c(91,16,93))            # [, closed circle, ]
  }
mtext(paste('Quartiles of', label(y)), adj=.5, outer=TRUE, cex=1.75)  
#Overall (outer) title


prlatex(latex(s3, trios=TRUE)) 
# trios -> collapse 3 quartiles
# trios -> collapse 3 quartiles


#Summarize only bilirubin, but do it with two statistics:
#the mean and the median.  Make separate tables for the two randomized
#groups and make plots for the active arm.


g <- function(y) c(Mean=mean(y), Median=median(y))


# One pass per randomized group: summarize bilirubin (mean and median, via g),
# print the table, plot the statistics for the D-penicillamine arm only, and
# append a LaTeX table for each arm to my.tex.
for(sub in c("D-penicillamine", "placebo")) {
  ss <- summary(bili ~ age.groups + ascites + chol, fun=g,
                subset=drug==sub)
  cat('\n',sub,'\n\n')
  print(ss)


  if(sub=='D-penicillamine') {
    par(mfrow=c(1,1))
    # plot the summary just computed for this arm (ss)
    plot(ss, which=1:2, dotfont=c(1,-1), subtitles=FALSE, main='')
    #1=mean, 2=median     -1 font = open circle
    title(sub='Closed circle: mean;  Open circle: median', adj=0)
    title(sub=sub, adj=1)
  }


  w <- latex(ss, append=TRUE, fi='my.tex', 
             label=if(sub=='placebo') 's4b' else 's4a',
             caption=paste(label(bili),' {\\\\em (',sub,')}', sep=''))
  #Note symbolic labels for tables for two subsets: s4a, s4b
  prlatex(w)
}


#Now consider examples in 'reverse' format, where the lone dependent
#variable tells the summary function how to stratify all the 
#'independent' variables.  This is typically used to make tables 
#comparing baseline variables by treatment group, for example.


s5 <- summary(drug ~ bili + albumin + stage + protime + sex + 
                     age + spiders,
              method='reverse')
#To summarize all variables, use summary(drug ~., data=pbc)
#To summarize all variables with no stratification, use
#summary(~a+b+c) or summary(~.,data=\dots)


options(digits=1)
print(s5, npct='both')
#npct='both' : print both numerators and denominators
plot(s5, which='categorical')
Key(locator(1))  # draw legend at mouse click
par(oma=c(3,0,0,0))  # leave outer margin at bottom
plot(s5, which='continuous')
Key2()           # draw legend at lower left corner of plot
                 # oma= above makes this default key fit the page better


options(digits=3)
w <- latex(s5, npct='both', here=TRUE)     
# creates s5.tex


#Turn to a different dataset and do cross-classifications on possibly 
#more than one independent variable.  The summary function with 
#method='cross' produces a data frame containing the cross-
#classifications.  This data frame is suitable for multi-panel 
#trellis displays, although `summarize' works better for that.


attach(prostate)
size.quartile <- cut2(sz, g=4)
bone <- factor(bm,labels=c("no mets","bone mets"))


s7 <- summary(ap>1 ~ size.quartile + bone, method='cross')
#In this case, quartiles are the default so could have said sz + bone


options(digits=3)
print(s7, twoway=FALSE)
s7   # same as print(s7)
w <- latex(s7, here=TRUE)   # Make s7.tex


library(trellis,TRUE)
invisible(ps.options(reset=TRUE))
trellis.device(postscript, file='demo2.ps')


dotplot(S ~ size.quartile|bone, data=s7, #s7 is name of summary stats
                  xlab="Fraction ap>1", ylab="Quartile of Tumor Size")
#Can do this more quickly with summarize:
# s7 <- summarize(ap>1, llist(size=cut2(sz, g=4), bone), mean,
#                 stat.name='Proportion')
# dotplot(Proportion ~ size | bone, data=s7)


summary(age ~ stage, method='cross')
summary(age ~ stage, fun=quantile, method='cross')
summary(age ~ stage, fun=smean.sd, method='cross')
summary(age ~ stage, fun=smedian.hilow, method='cross')
summary(age ~ stage, fun=function(x) c(Mean=mean(x), Median=median(x)),
        method='cross')
#The next statements print real two-way tables
summary(cbind(age,ap) ~ stage + bone, 
        fun=function(y) apply(y, 2, quantile, c(.25,.75)),
        method='cross')
options(digits=2)
summary(log(ap) ~ sz + bone,
        fun=function(y) c(Mean=mean(y), quantile(y)),
        method='cross')


#Summarize an ordered categorical response by all of the needed
#cumulative proportions
summary(cumcategory(disease.severity) ~ age + sex)

}
}
\keyword{category}
\keyword{interface}
\keyword{hplot}
\keyword{manip}
\concept{grouping}
\concept{stratification}
\concept{aggregation}
\concept{cross-classification}




\eof
\name{symbol.freq}
\alias{symbol.freq}
\title{Graphic Representation of a Frequency Table}
\description{
This function can be used to represent
contingency tables graphically.  Frequency counts are represented as
the heights of "thermometers" by default; you can also specify
\code{symbol='circle'} to the function.  There is an option to include
marginal frequencies, which are plotted on a halved scale so as to not
overwhelm the plot.   If you do not ask for marginal frequencies to be
plotted using \code{marginals=TRUE}, \code{symbol.freq} will ask you to click
the mouse where a reference symbol is to be drawn to assist in reading
the scale of the frequencies.

\code{label} attributes, if present, are used for x- and y-axis labels.
Otherwise, names of calling arguments are used.
}
\usage{
symbol.freq(x, y, symbol = c("thermometer", "circle"),
            marginals = FALSE, orig.scale = FALSE,
            inches = 0.25, width = 0.15, subset, srtx = 0, ...)
}
\arguments{
  \item{x}{first variable to cross-classify}
  \item{y}{second variable}
  \item{symbol}{specify \code{"thermometer"} (the default) or \code{"circle"}}
  \item{marginals}{set to \code{TRUE} to add marginal frequencies
	(scaled by half) to the plot}
  \item{orig.scale}{set to \code{TRUE} when the first two arguments are
	numeric variables; this uses their original values for x and y
	coordinates}
  \item{inches}{see \code{\link{symbols}}}
  \item{width}{see \code{thermometers} option in \code{symbols}}
  \item{subset}{the usual subsetting vector}
  \item{srtx}{rotation angle for x-axis labels}
  \item{\dots}{other arguments to pass to \code{symbols}}
}
\author{Frank Harrell}
\seealso{\code{\link{symbols}}}
\examples{
\dontrun{
# Cross-classify age tertile by passenger class, with marginal frequencies
# and x-axis labels rotated 45 degrees
getHdata(titanic)
attach(titanic)
age.tertile <- cut2(titanic$age, g=3)
# spell out TRUE: T is an ordinary variable that can be reassigned
symbol.freq(age.tertile, pclass, marginals=TRUE, srtx=45)
detach(2)
}}
\keyword{hplot}


\eof
\name{sys}
\alias{sys}
\title{
Run Unix or Dos Depending on System
}
\description{
Runs \code{unix} or \code{dos} depending on the current operating system.  For
\R, just runs \code{system} with optional concatenation of first two
arguments which are assumed named \code{command} and \code{text}.
}
\usage{
sys(command, text=NULL, output=TRUE)
# S-Plus: sys(\dots, minimized=FALSE)
}
\arguments{
\item{command}{
system command to execute
}
\item{text}{
text to concatenate to system command, if any (typically options or file
names or both)
}
\item{output}{
  set to \code{FALSE} to not return output of command as a character
  vector
}
}
\value{
see \code{unix} or \code{dos}
}
\section{Side Effects}{
executes system commands
}
\seealso{
\code{\link{unix}}, \code{\link{system}}
}
\keyword{interface}

\eof
\name{t.test.cluster}
\alias{t.test.cluster}
\alias{print.t.test.cluster}
\title{t-test for Clustered Data}
\description{
Does a 2-sample t-test for clustered data.
}
\usage{
t.test.cluster(y, cluster, group, conf.int = 0.95)
\method{print}{t.test.cluster}(x, digits, \dots)
}
\arguments{
  \item{y}{normally distributed response variable to test}
  \item{cluster}{cluster identifiers, e.g. subject ID}
  \item{group}{grouping variable with two values}
  \item{conf.int}{confidence coefficient to use for confidence limits}
  \item{x}{an object created by \code{t.test.cluster}}
  \item{digits}{number of significant digits to print}
  \item{\dots}{unused}
}
\value{
  a matrix of statistics of class \code{t.test.cluster}
}
\references{
  Donner A, Birkett N, Buck C, Am J Epi 114:906-914, 1981.

  Donner A, Klar N, J Clin Epi 49:435-439, 1996.
  
  Hsieh FY, Stat in Med 8:1195-1201, 1988.
}
\author{Frank Harrell}
\seealso{\code{\link{t.test}}}
\examples{
set.seed(1)
y <- rnorm(800)
group <- sample(1:2, 800, TRUE)
cluster <- sample(1:40, 800, TRUE)
table(cluster,group)
t.test(y ~ group)   # R only
t.test.cluster(y, cluster, group)
# Note: negate estimates of differences from t.test to
# compare with t.test.cluster
}
\keyword{htest}


\eof
\name{transace}
\alias{transace}
\alias{areg.boot}
\alias{print.areg.boot}
\alias{plot.areg.boot}
\alias{predict.areg.boot}
\alias{summary.areg.boot}
\alias{print.summary.areg.boot}
\alias{Function.areg.boot}
\alias{Mean}
\alias{Mean.areg.boot}
\alias{Quantile}
\alias{Quantile.areg.boot}
\alias{monotone}
\alias{smearingEst}
\title{
Additive Regression and Transformations using ace or avas
}
\description{
\code{transace} is \code{ace} packaged for easily automatically
transforming all variables in a matrix.  \code{transace} is a fast
one-iteration version of \code{transcan} without imputation of NAs.  

\code{areg.boot} uses \code{avas} or \code{ace} to fit additive regression models
allowing all variables in the model (including the right-hand-side) to
be transformed, with transformations chosen so as to optimize
certain criteria.  The default method uses \code{avas}, which
explicitly tries to transform the response variable so as to stabilize
the variance of the residuals.  All-variables-transformed models tend
to inflate \code{R^2} and it can be difficult to get confidence limits for
each transformation.  \code{areg.boot} solves both of these problems using
the bootstrap.  As with the \code{validate} function in the Design library,
the Efron bootstrap is used to estimate the optimism in the apparent \code{R^2},
and this optimism is subtracted from the apparent \code{R^2} to obtain a
bias-corrected \code{R^2}.  This is done however on the transformed response
variable scale.


Tests with 3 predictors show that the AVAS and ACE estimates used by
\code{areg.boot} are unstable unless the sample size exceeds 350.  Apparent
\code{R^2} with low sample sizes can be very inflated, and bootstrap
estimates of \code{R^2} can be even more unstable in such cases,
resulting in optimism-corrected \code{R^2} that are much lower even than the
actual \code{R^2}.  The situation can be improved a little by restricting
predictor transformations to be monotonic.


For \code{method="avas"} the response transformation is restricted to be
monotonic.  You can specify restrictions for transformations of
predictors (and linearity for the response, or its monotonicity if
using \code{ace}).  When the first argument is a formula, the function
automatically determines which variables are categorical (i.e., \code{factor},
\code{category}, or character vectors).  Specify linear transformations by
enclosing variables in the identity function (\code{I()}), and specify
monotonicity by using \code{monotone(variable)}.


The \code{summary} method for \code{areg.boot} computes bootstrap estimates of
standard errors of differences in predicted responses (usually 
on the original scale)
for selected levels of each predictor against the lowest level of the
predictor.  The smearing estimator (see below) can be used here to
estimate differences in predicted means, medians, or many other
statistics.  By default, quartiles are used for continuous predictors
and all levels are used for categorical ones.  See DETAILS below.
There is also a \code{plot} method for plotting transformation estimates,
transformations for individual bootstrap re-samples, and pointwise
confidence limits for transformations.  Unless you already have a
\code{par(mfrow=)} in effect with more than one row or column, \code{plot} will
try to fit the plots on one page.  A \code{predict} method computes
predicted values on the original or transformed response scale, or a
matrix of transformed predictors.  There is a \code{Function} method
for producing a list of S-PLUS functions that perform the final fitted
transformations.  There is also a \code{print} method for \code{areg.boot}
objects.


When estimated means (or medians or other statistical parameters) are
requested for models fitted with \code{areg.boot} (by \code{summary.areg.boot}
or \code{predict.areg.boot}), the "smearing" estimator of Duan (1983) is
used.  Here we estimate the mean of the untransformed response by
computing the arithmetic mean of ginverse(lp + residuals), where
ginverse is the inverse of the nonparametric transformation of the
response (obtained by reverse linear interpolation), \code{lp} is the
linear predictor for an individual observation on the transformed
scale, and \code{residuals} is the entire vector of residuals estimated
from the fitted model, on the transformed scales (n residuals for n
original observations).  The \code{smearingEst} function computes the
general smearing estimate.  For efficiency \code{smearingEst} recognizes
that quantiles are transformation-preserving, i.e., when one wishes to
estimate a quantile of the untransformed distribution one just needs
to compute the inverse transformation of the transformed estimate
after the chosen quantile of the vector of residuals is added to it.
When the median is desired, the estimate is ginverse(lp +
median(residuals)).  See the last example for how \code{smearingEst} can be
used outside of \code{areg.boot}.

\code{Mean} is a generic function that returns an S function to compute
the estimate of the mean of a variable.  Its input is typically some
kind of model fit object.  Likewise, \code{Quantile} is a generic
quantile function-producing function.  \code{Mean.areg.boot} and
\code{Quantile.areg.boot} create functions of a vector of linear
predictors that transform them into the smearing estimates of the mean
or quantile of the response variable, respectively.
\code{Quantile.areg.boot} produces exactly the same value as
\code{predict.areg.boot} or \code{smearingEst}.  \code{Mean}
approximates the mapping of linear predictors to means over an evenly
spaced grid of by default 200 points.  Linear interpolation is used
between these points.  This approximate method is much faster than the
full smearing estimator once \code{Mean} creates the function.  These
functions are especially useful in \code{nomogram.Design} (see the
example on hypothetical data).  }

\usage{
transace(x, monotonic=NULL, categorical=NULL, binary=NULL, pl=TRUE)

areg.boot(x, y, data, weights, subset, na.action=na.delete, 
          B=100, method=c("avas", "ace"), evaluation=100, valrsq=TRUE, 
          probs=c(.25,.5,.75), \dots)

\method{print}{areg.boot}(x, \dots)

\method{plot}{areg.boot}(x, ylim, boot=TRUE, col.boot=2, lwd.boot=.15,
conf.int=.95, \dots)

smearingEst(transEst, inverseTrans, res,
            statistic=c('median','quantile','mean','fitted','lp'),
            q)

\method{summary}{areg.boot}(object, conf.int=.95, values, adj.to,
        statistic='median', q, \dots)

\method{print}{summary.areg.boot}(x, \dots)

\method{predict}{areg.boot}(object, newdata,
         statistic=c("lp", "median",
                     "quantile", "mean", "fitted", "terms"),
         q=NULL, \dots) 

\method{Function}{areg.boot}(object, type=c('list','individual'),
         ytype=c('transformed','inverse'),
         prefix='.', suffix='', frame=0, where=1, \dots)

Mean(object, \dots)

Quantile(object, \dots)

\method{Mean}{areg.boot}(object, evaluation=200, \dots)

\method{Quantile}{areg.boot}(object, q=.5, \dots)
}
\arguments{
\item{x}{
for \code{transace} a numeric matrix.  For \code{areg.boot} \code{x} may
be a numeric matrix or a formula.  For \code{print} or \code{plot}, an
object created by \code{areg.boot}.  For \code{print.summary.areg.boot},
an object created by \code{summary.areg.boot}.
}
\item{object}{
an object created by \code{areg.boot}, or a model fit object suitable
for \code{Mean} or \code{Quantile}.
}
\item{transEst}{
a vector of transformed values.  In log-normal regression these could
be predicted log(Y) for example.
}
\item{inverseTrans}{
a function specifying the inverse transformation needed to change
\code{transEst} to the original untransformed scale.  \code{inverseTrans} may
also be a 2-element list defining a mapping from the transformed
values to untransformed values.  Linear interpolation is used in
this case to obtain untransformed values.
}
\item{monotonic}{
}
\item{categorical}{
}
\item{binary}{
These are vectors of variable names specifying what to assume about
each column of \code{x} for \code{transace}.  Binary variables are not
transformed, of course.
}
\item{pl}{
set \code{pl=FALSE} to prevent \code{transace} from plotting each fitted transformation
}
\item{y}{
numeric vector representing the response variable.
Not used if \code{x} is a formula.
}
\item{data}{
data frame to use if \code{x} is a formula and variables are not already in
the search list
}
\item{weights}{
a numeric vector of observation weights.  By default, all observations
are weighted equally.
}
\item{subset}{
an expression to subset data if \code{x} is a formula
}
\item{na.action}{
a function specifying how to handle NAs.  Default is \code{na.delete} (in Hmisc).
}
\item{B}{
number of bootstrap samples (default=100)
}
\item{method}{
\code{"avas"} (the default) or \code{"ace"}
}
\item{evaluation}{
number of equally-spaced points at which to evaluate (and save) the
nonparametric transformations derived by \code{avas} or \code{ace}.  Default is
100.  For \code{Mean.areg.boot}, \code{evaluation} is the number of points at
which to evaluate exact smearing estimates, to approximate them using
linear interpolation (default is 200).
}
\item{valrsq}{
set to \code{FALSE} to more quickly do bootstrapping without validating \code{R^2}
}
\item{probs}{
vector probabilities denoting the quantiles of continuous predictors
to use in estimating effects of those predictors
}
\item{\dots}{
other arguments to pass to \code{avas} or \code{ace} (useful if \code{x} is not a
formula)
}
\item{res}{
a vector of residuals from the transformed model.  Not required when
\code{statistic="lp"} or \code{statistic="fitted"}.
}
\item{statistic}{
statistic to estimate with the smearing estimator.  For \code{smearingEst},
the default results in computation of the sample median of the model
residuals, then \code{smearingEst} adds the median residual and
back-transforms to get estimated median responses on the original
scale.  \code{statistic="lp"} causes predicted transformed responses to be
computed.  For \code{smearingEst}, the result (for \code{statistic="lp"}) is the
input argument \code{transEst}.  \code{statistic="fitted"} gives predicted
untransformed responses, i.e., ginverse(lp), where ginverse is the
inverse of the estimated response transformation, estimated by reverse
linear interpolation on the tabulated nonparametric response
transformation or by using an explicit analytic function.
\code{statistic="quantile"} generalizes \code{"median"} to any single quantile
\code{q} which must be specified.  \code{statistic="mean"} causes the population mean
response to be estimated.  For \code{predict.areg.boot},
\code{statistic="terms"} returns a matrix of transformed predictors.
\code{statistic} can also be any S-PLUS function that computes a single
value on a vector of values, such as \code{statistic=var}.  Note that in
this case the function name is not quoted.
}
\item{q}{
a single quantile of the original response scale to estimate, when
\code{statistic="quantile"}, or for \code{Quantile.areg.boot}.
}
\item{ylim}{
2-vector of y-axis limits
}
\item{boot}{
set to \code{FALSE} to not plot any bootstrapped transformations.  Set it to an
integer \code{k} to plot the first \code{k} bootstrap estimates.
}
\item{col.boot}{
color for bootstrapped transformations
}
\item{lwd.boot}{
line width for bootstrapped transformations
}
\item{conf.int}{
confidence level (0-1) for pointwise bootstrap confidence limits and
for estimated effects of predictors in \code{summary.areg.boot}.  The
latter assumes normality of the estimated effects.
}
\item{values}{
a list of vectors of settings of the predictors, for predictors for
which you want to override settings determined from \code{probs}.  The list
must have named components, with names corresponding to the
predictors.  Example: \code{values=list(x1=c(2,4,6,8), x2=c(-1,0,1))}
specifies that \code{summary} is to estimate the effect on \code{y} of changing
\code{x1} from 2 to 4, 2 to 6, 2 to 8, and separately, of changing \code{x2}
from -1 to 0 and -1 to 1.
}
\item{adj.to}{
a named vector of adjustment constants, for setting all other
predictors when examining the effect of a single predictor in
\code{summary}.  The more nonlinear is the transformation of \code{y} the more
the adjustment settings will matter.  Default values are the medians
of the values defined by \code{values} or \code{probs}.  You only need to name
the predictors for which you are overriding the default settings.
Example: \code{adj.to=c(x2=0,x5=10)} will set \code{x2} to 0 and \code{x5} to 10 when
assessing the impact of variation in the other predictors.
}
\item{newdata}{
a data frame or list containing the same number of values of all of
the predictors used in the fit.  For \code{factor} predictors the \code{levels}
attribute does not need to be in the same order as those used in the
original fit, and not all levels need to be represented.  If \code{newdata}
is omitted, you can still obtain linear predictors (on the transformed
response scale) and fitted values (on the original response scale),
but not \code{"terms"}. 
}
\item{type}{
specifies how \code{Function} is to return the series of functions that
define the transformations of all variables.  By default a list is
created, with the names of the list elements being the names of the
variables.  Specify \code{type="individual"} to have separate functions
created in the session frame (\code{frame=0}, the default) or in location
defined by \code{where} if \code{where} is specified.  For the latter method,
the names of the objects created are the names of the corresponding
variables, prefixed by \code{prefix} and with \code{suffix} appended to the end.
If any of \code{frame}, \code{where}, \code{prefix}, or \code{suffix} is specified, \code{type}
is automatically set to \code{"individual"}.
}
\item{ytype}{
By default the first function created by \code{Function} is the
y-transformation.  Specify \code{ytype="inverse"} to instead create the
inverse of the transformation, to be able to obtain originally scaled
y-values.
}
\item{prefix}{
character string defining the prefix for function names created when
\code{type="individual"}.  By default, the function specifying the
transformation for variable \code{x} will be named \code{.x}.
}
\item{suffix}{
character string defining the suffix for the function names
}
\item{frame}{
frame number in which to store functions (see \code{assign}).  The default
is frame 0, the session database, which disappears at the end of the
S-Plus session.
}
\item{where}{
location in which to store functions (see \code{assign}).  If \code{where} is
specified (e.g., \code{where=1} to store functions in search position one),
\code{frame} is ignored.  For R, the value of \code{where} is passed to \code{assign}
as the \code{pos} argument.
}}
\value{
\code{transace} returns a matrix like \code{x} but containing transformed
values.  This matrix has attributes \code{rsq} (vector of \code{R^2} with which
each variable can be predicted from the others) and \code{omitted} (row
numbers of \code{x} that were deleted due to NAs).


\code{areg.boot} returns a list of class \code{"areg.boot"} containing many
elements, including (if \code{valrsq} is \code{TRUE}) \code{rsquare.app} and
\code{rsquare.val}.  \code{summary.areg.boot} returns a list of class
\code{"summary.areg.boot"} containing a matrix of results for each
predictor and a vector of adjust-to settings.  It also contains the
call and a \code{label} for the statistic that was computed.  A \code{print} method for
these objects handles the printing.  \code{predict.areg.boot} returns a
vector unless \code{statistic="terms"}, in which case it returns a matrix. 
\code{Function.areg.boot} returns by default a list of functions whose
argument is one of the variables (on the original scale) and whose
returned values are the corresponding transformed values.  The names
of the list of functions correspond to the names of the original
variables.  When \code{type="individual"}, \code{Function.areg.boot} invisibly
returns the vector of names of the created function objects.
\code{Mean.areg.boot} and \code{Quantile.areg.boot} also return functions.


\code{smearingEst} returns a vector of estimates of distribution parameters
of class \code{"labelled"} so that \code{print.labelled} will print a label
documenting the estimate that was used (see \code{label}).  This label can
be retrieved for other purposes by using e.g. \code{label(obj)}, where
\code{obj} was the vector returned by \code{smearingEst}.
}
\details{
As \code{transace} only does one iteration over the predictors, it may not
find optimal transformations and it will be dependent on the order of
the predictors in \code{x}.


\code{ace} and \code{avas} standardize transformed variables to have mean zero
and variance one for each bootstrap sample, so if a predictor is not
important it will still consistently have a positive regression
coefficient.  Therefore using the bootstrap to estimate standard
errors of the additive least squares regression coefficients would not
help in drawing inferences about the importance of the predictors.  To
do this, \code{summary.areg.boot} computes estimates of, e.g., the
inter-quartile range effects of predictors in predicting the
response variable (after untransforming it).  As an example, at each
bootstrap repetition the estimated transformed value of one of the
predictors is computed at the lower quartile, median, and upper
quartile of the raw value of the predictor.  These transformed x
values are then multiplied by the least squares estimate of the
partial regression coefficient for that transformed predictor in
predicting transformed y.  Then these weighted transformed x values have the
weighted transformed x value corresponding to the lower quartile
subtracted from them, to estimate an x effect accounting for
nonlinearity.  The last difference computed is then the standardized
effect of raising x from its lowest to its highest quartile.  Before
computing differences, predicted values are back-transformed to be on
the original y scale in a way depending on \code{statistic} and \code{q}.
The sample standard deviation of these effects
(differences) is taken over the bootstrap samples, and this is used to
compute approximate confidence intervals for effects and approximate
P-values, both assuming normality.


\code{predict} does not re-insert NAs corresponding to observations that
were dropped before the fit, when \code{newdata} is omitted.


\code{statistic="fitted"} estimates the same quantity as \code{statistic="median"} if the
residuals on the transformed response have a symmetric distribution.
The two provide identical estimates when the sample median of the
residuals is exactly zero. The sample mean of the residuals is
constrained to be exactly zero although this does not simplify anything.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University School of Medicine
\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link[acepack]{avas}}, \code{\link[acepack]{ace}}, \code{\link[Design]{ols}}, \code{\link[Design]{validate}}, \code{\link[Design]{predab.resample}}, \code{\link{label}},
\code{\link[Design]{nomogram}}
}
\references{
Harrell FE, Lee KL, Mark DB (1996): Stat in Med 15:361--387.


Duan N (1983): Smearing estimate: A nonparametric retransformation
method.  JASA 78:605--610.


Wang N, Ruppert D (1995): Nonparametric estimation of the
transformation in the transform-both-sides regression model.  JASA
90:522--534. 


See \code{avas}, \code{ace} for primary references.
}
\examples{
# xtrans <- transace(cbind(age,sex,blood.pressure,race.code),
#                    binary='sex', monotonic='age',
#                    categorical='race.code')


# Generate random data from the model y = exp(x1 + epsilon/3) where
# x1 and epsilon are Gaussian(0,1)
set.seed(171)  # to be able to reproduce example
x1 <- rnorm(200)
x2 <- runif(200)  # a variable that is really unrelated to y
x3 <- factor(sample(c('cat','dog','cow'), 200,TRUE))  # also unrelated to y
y  <- exp(x1 + rnorm(200)/3)
# Fit the additive model with 40 bootstrap repetitions (B=40)
f  <- areg.boot(y ~ x1 + x2 + x3, B=40)
f
plot(f)
# Note that the fitted transformation of y is very nearly log(y)
# (the appropriate one), the transformation of x1 is nearly linear,
# and the transformations of x2 and x3 are essentially flat 
# (specifying monotone(x2) would have resulted in a smaller 
# confidence band for x2)


summary(f)


# use summary(f, values=list(x2=c(.2,.5,.8))) for example if you
# want to use nice round values for judging effects


# Plot Y hat vs. Y (this doesn't work if there were NAs)
plot(fitted(f), y)  # or: plot(predict(f,statistic='fitted'), y)


# Show fit of model by varying x1 on the x-axis and creating separate
# panels for x2 and x3.  For x2 using only a few discrete values
newdat <- expand.grid(x1=seq(-2,2,length=100),x2=c(.25,.75),
                      x3=c('cat','dog','cow'))
yhat <- predict(f, newdat, statistic='fitted')  
# statistic='mean' to get estimated mean rather than simple inverse trans.
xYplot(yhat ~ x1 | x2, groups=x3, type='l', data=newdat)


\dontrun{
# Another example, on hypothetical data
f <- areg.boot(response ~ I(age) + monotone(blood.pressure) + race)
# use I(response) to not transform the response variable
plot(f, conf.int=.9)
# Check distribution of residuals
plot(fitted(f), resid(f))
qqnorm(resid(f))
# Refit this model using ols so that we can draw a nomogram of it.
# The nomogram will show the linear predictor, median, mean.
# The last two are smearing estimators.
Function(f, type='individual')  # create transformation functions
f.ols <- ols(.response(response) ~ age + 
             .blood.pressure(blood.pressure) + .race(race))
# Note: This model is almost exactly the same as f but there
# will be very small differences due to interpolation of
# transformations
meanr <- Mean(f)      # create function of lp computing mean response
medr  <- Quantile(f)  # default quantile is .5
nomogram(f.ols, fun=list(Mean=meanr,Median=medr))


# Create S-PLUS functions that will do the transformations
# This is a table look-up with linear interpolation
g <- Function(f)
plot(blood.pressure, g$blood.pressure(blood.pressure))
# produces the central curve in the last plot done by plot(f)
}


# Another simulated example, where y has a log-normal distribution
# with mean x and variance 1.  Untransformed y thus has median
# exp(x) and mean exp(x + .5sigma^2) = exp(x + .5)
# First generate data from the model y = exp(x + epsilon),
# epsilon ~ Gaussian(0, 1)


set.seed(139)
n <- 1000
x <- rnorm(n)
y <- exp(x + rnorm(n))
f <- areg.boot(y ~ x, B=20)
plot(f)       # note log shape for y, linear for x.  Good!
xs <- c(-2, 0, 2)
d <- data.frame(x=xs)
predict(f, d, 'fitted')
predict(f, d, 'median')   # almost same; median residual=-.003
exp(xs)                   # population medians
predict(f, d, 'mean')
exp(xs + .5)              # population means


# Show how smearingEst works
res <- c(-1,0,1)          # define residuals
y <- 1:5
ytrans <- log(y)
ys <- seq(.1,15,length=50)
trans.approx <- list(x=log(ys), y=ys)
options(digits=4)
smearingEst(ytrans, exp, res, 'fitted')          # ignores res
smearingEst(ytrans, trans.approx, res, 'fitted') # ignores res 
smearingEst(ytrans, exp, res, 'median')          # median res=0
smearingEst(ytrans, exp, res+.1, 'median')       # median res=.1
smearingEst(ytrans, trans.approx, res, 'median')
smearingEst(ytrans, exp, res, 'mean')
mean(exp(ytrans[2] + res))                       # should equal 2nd # above
smearingEst(ytrans, trans.approx, res, 'mean')
smearingEst(ytrans, trans.approx, res, mean)
# Last argument can be any statistical function operating
# on a vector that returns a single value
}
\keyword{nonparametric}
\keyword{smooth}
\keyword{multivariate}
\keyword{nonlinear}
\keyword{regression}
\concept{bootstrap}
\concept{model validation}

\eof
\name{transcan}
\alias{transcan}
\alias{summary.transcan}
\alias{print.transcan}
\alias{plot.transcan}
\alias{impute.transcan}
\alias{predict.transcan}
\alias{Function}
\alias{Function.transcan}
\alias{fit.mult.impute}
\alias{Varcov}
\alias{Varcov.default}
\alias{Varcov.fit.mult.impute}
\alias{Varcov.glm}
\alias{Varcov.lm}
\alias{Varcov.multinom}
\alias{[.transcan}
\alias{invertTabulated}
\title{
Transformations/Imputations using Canonical Variates
}
\description{
\code{transcan} is a nonlinear additive transformation and imputation
function, and there are several functions for using and operating on
its results.  \code{transcan} automatically transforms continuous and
categorical variables to have maximum correlation with the best linear
combination of the other variables.  There is also an option to use a
substitute criterion - maximum correlation with the first principal
component of the other variables.  Continuous variables are expanded
as restricted cubic splines and categorical variables are expanded as
contrasts (e.g., dummy variables).  By default, the first canonical
variate is used to find optimum linear combinations of component
columns.  This function is similar to \code{ace} except that
transformations for continuous variables are fitted using restricted
cubic splines, monotonicity restrictions are not allowed, and NAs are
allowed.  When a variable has any NAs, transformed scores for that
variable are imputed using least squares multiple regression
incorporating optimum transformations, or NAs are optionally set to
constants.  Shrinkage can be used to safeguard against overfitting
when imputing.  Optionally, imputed values on the original scale are
also computed and returned.  For this purpose, recursive partitioning
or multinomial logistic models can
optionally be used to impute categorical variables, using what is
predicted to be the most probable category.

By default, \code{transcan} imputes NAs with "best guess" expected values
of transformed variables, back transformed to the original scale.
Values thus imputed are most like conditional medians assuming the
transformations make variables' distributions symmetric (imputed
values are similar to conditional modes for categorical variables).  By
instead specifying \code{n.impute}, \code{transcan} does approximate multiple imputation
from the distribution of each variable conditional on all other
variables.  This is done by sampling \code{n.impute} residuals from the
transformed variable, with replacement (a la bootstrapping), or by
default, using Rubin's approximate Bayesian bootstrap, where a sample
of size n with replacement is selected from the residuals on n
non-missing values of the target variable, and then a sample of size m
with replacement is chosen from this sample, where m is the number of
missing values needing imputation for the current multiple imputation 
repetition.  Neither of these bootstrap procedures
assumes normality or even symmetry of residuals.
For sometimes-missing categorical variables, optimal scores are
computed by adding the "best guess" predicted mean score to random
residuals off this score.  Then categories having scores closest to
these predicted scores are taken as the random multiple imputations
(\code{impcat="tree"} or \code{"rpart"} are not currently allowed with
\code{n.impute}).  The literature recommends using \code{n.impute=5} or greater.
\code{transcan} provides only an approximation to multiple imputation,
especially since it "freezes" the imputation model before drawing the
multiple imputations rather than using different estimates of
regression coefficients for each imputation.  For multiple imputation,
the \code{aregImpute} function provides a much better approximation to the
full Bayesian approach while still not requiring linearity assumptions.

When you specify \code{n.impute} to \code{transcan} you can use
\code{fit.mult.impute} to re-fit any model \code{n.impute} times based on
\code{n.impute} completed datasets (if there are any sometimes missing
variables not specified to \code{transcan}, some observations will still be
dropped from these fits).  After fitting \code{n.impute} models,
\code{fit.mult.impute} will return the fit object from the last imputation,
with \code{coefficients} replaced by the average of the \code{n.impute}
coefficient vectors and with a component \code{var} equal to the
imputation-corrected variance-covariance matrix.  \code{fit.mult.impute}
can also use the object created by the \code{mice} function in the MICE
library to draw the multiple imputations, as well as objects created
by \code{aregImpute}.

The \code{summary} method for \code{transcan} prints the function call,
R-squares achieved in transforming each variable, and for each variable
the coefficients of all other transformed variables that are used to
estimate the transformation of the initial variable.  If
\code{imputed=TRUE} was used in the call to transcan, also uses the
\code{describe} function to print a summary of imputed values.  If
\code{long=TRUE}, also prints all imputed values with observation
identifiers.  There is also a simple function \code{print.transcan}
which merely prints the transformation matrix and the function call.  It
has an optional argument \code{long}, which if set to \code{TRUE} causes
detailed parameters to be printed.  Instead of plotting while
\code{transcan()} is running, you can plot the final transformations
after the fact using \code{plot.transcan}, if the option
\code{trantab=TRUE} was specified to \code{transcan}.  If in addition
the option \code{imputed=TRUE} was specified to \code{transcan},
\code{plot.transcan} will show the location of imputed values (including
multiples) along the axes.

\code{impute} does imputations for a selected original data variable, on
the original scale (if \code{imputed=TRUE} was given to
\code{transcan}).  If you do not specify a variable to \code{impute}, it
will do imputations for all variables given to \code{transcan} which had
at least one missing value.  This assumes that the original variables
are accessible (i.e., they have been \code{attach}ed) and that you want
the imputed variables to have the same names as the original variables.
If \code{n.impute} was specified to \code{transcan} you must tell
\code{impute} which \code{imputation} to use.

\code{predict} computes predicted variables and imputed values from a
matrix of new data.  This matrix should have the same column variables
as the original matrix used with \code{transcan}, and in the same order
(unless a formula was used with \code{transcan}).

\code{Function} is a generic function generator.
\code{Function.transcan} creates S functions to transform variables using
transformations created by \code{transcan}.  These functions are useful
for getting predicted values with predictors set to values on the original
scale.

\code{Varcov} methods are defined here so that imputation-corrected
variance-covariance matrices are readily extracted from
\code{fit.mult.impute} objects, and so that \code{fit.mult.impute} can easily
compute traditional covariance matrices for individual completed
datasets.  Specific \code{Varcov} methods are defined for \code{lm},
\code{glm}, and \code{multinom} fits.

The subscript function preserves attributes.

The \code{invertTabulated} function does either inverse linear
interpolation or uses sampling to sample qualifying x-values having
y-values near the desired values.  The latter is used to get inverse
values having a reasonable distribution (e.g., no floor or ceiling
effects) when the transformation has a flat or nearly flat segment,
resulting in a many-to-one transformation in that region.  Sampling
weights are a combination of the frequency of occurrence of x-values
that are within \code{tolInverse} times the range of \code{y} and the squared
distance between the associated y-values and the target y-value (\code{aty}).
}
\usage{
transcan(x, method=c("canonical","pc"),
         categorical=NULL, asis=NULL, nk, imputed=FALSE, n.impute,
         boot.method=c('approximate bayesian', 'simple'),
         trantab=FALSE, transformed=FALSE, 
         impcat=c("score", "multinom", "rpart", "tree"),
         mincut=40, 
         inverse=c('linearInterp','sample'), tolInverse=.05,
         pr=TRUE, pl=TRUE, allpl=FALSE, show.na=TRUE, 
         imputed.actual=c('none','datadensity','hist','qq','ecdf'),
         iter.max=50, eps=.1, curtail=TRUE, 
         imp.con=FALSE, shrink=FALSE, init.cat="mode", 
         nres=if(boot.method=='simple')200 else 400,
         data, subset, na.action, treeinfo=FALSE, 
         rhsImp=c('mean','random'), details.impcat='', \dots)

\method{summary}{transcan}(object, long=FALSE, ...)

\method{print}{transcan}(x, long=FALSE, ...)

\method{plot}{transcan}(x, \dots)

\method{impute}{transcan}(x, var, imputation, name, where.in, data, 
       where.out=1, frame.out, list.out=FALSE, pr=TRUE, check=TRUE, ...)

fit.mult.impute(formula, fitter, xtrans, data, n.impute, fit.reps=FALSE,
                derived, pr=TRUE, subset, \dots)

\method{predict}{transcan}(object, newdata, iter.max=50, eps=0.01, curtail=TRUE, 
        type=c("transformed","original"),
        inverse, tolInverse, ...)

Function(object, \dots)

\method{Function}{transcan}(object, prefix=".", suffix="", where=1, ...)

invertTabulated(x, y, freq=rep(1,length(x)), 
                aty, name='value',
                inverse=c('linearInterp','sample'),
                tolInverse=0.05, rule=2)

Varcov(object, ...)

\method{Varcov}{default}(object, regcoef.only=FALSE, ...)

\method{Varcov}{lm}(object, ...)

\method{Varcov}{glm}(object, ...)

\method{Varcov}{multinom}(object, ...)

\method{Varcov}{fit.mult.impute}(object, ...)


}
\arguments{
\item{x}{
a matrix containing continuous variable values and codes for categorical
variables.  The matrix must have column names (\code{dimnames}).  If row
names are present, they are used in forming the \code{names} attribute
of imputed values if \code{imputed=TRUE}.  \code{x} may also be a formula, in which
case the model matrix is created automatically, using data in the calling
frame.  Advantages of using a formula are that \code{categorical} variables
can be determined automatically by a variable being a \code{factor}
variable, and variables with two unique levels are modeled \code{asis}.
Variables with 3 unique values are considered to be \code{categorical} if
a formula is specified.  For a formula you may also specify that a
variable is to remain untransformed by enclosing its name with the
identity function, e.g. \code{I(x3)}.  The user may add other variable names to the
\code{asis} and \code{categorical} vectors.  For \code{invertTabulated}, \code{x} is a
vector or a list with three components: the x vector, the
corresponding vector of transformed values, and the corresponding
vector of frequencies of the pair of original and transformed variables.
For \code{print}, \code{plot}, \code{impute}, and
\code{predict}, \code{x} is an object created by \code{transcan}.
}
\item{formula}{
any S model formula
}
\item{fitter}{
any S or Design modeling function (not in quotes) that computes a
vector of \code{coefficients} and for which \code{Varcov} will return a
variance-covariance matrix.  E.g., \code{fitter=lm, glm, ols}.  At present models
involving non-regression parameters (e.g., scale parameters in
parametric survival models) are not handled fully.
}
\item{xtrans}{
an object created by \code{transcan}, \code{aregImpute}, or \code{mice}
}
\item{method}{
use \code{method="canonical"} or any abbreviation thereof, to use canonical
variates (the default).  
\code{method="pc"} transforms a variable instead so as to maximize
the correlation with the first principal component of the other
variables.
}
\item{categorical}{
a character vector of names of variables in \code{x} which are categorical,
for which the ordering of re-scored values is not necessarily preserved.
If \code{categorical} is omitted, it is assumed that all variables are
continuous (or binary).  Set \code{categorical="*"} to treat all variables
as categorical.
}
\item{asis}{
a character vector of names of variables that are not to be transformed.
For these variables, the guts of \code{lm.fit.qr} is used to impute missing values.
You may want to treat binary variables \code{asis} (this is automatic if
using a formula).  If \code{imputed=TRUE}, you
may want to use \code{"categorical"} for binary variables if you want
to force imputed values to be one of the original data values.
Set \code{asis="*"} to treat all variables \code{asis}.
}
\item{nk}{
number of knots to use in expanding each continuous variable (not listed
in \code{asis}) in a restricted cubic spline function.  Default is 3 (yielding
2 parameters for a variable) if \code{n < 30}, 4 if \code{30 <= n < 100}, and 5 if
\code{n >= 100} (4 parameters).
}
\item{imputed}{
Set to \code{TRUE} to return a list containing imputed values on the original
scale.
If the transformation for a variable is non-monotonic, imputed
values are not unique.  \code{transcan} uses the \code{approx} function,
which returns the highest value of the variable with the transformed
score equalling the imputed score. \code{imputed=TRUE} also causes original-scale imputed values to be shown as tick
marks on the top margin of each graph
when \code{show.na=TRUE} (for the final iteration only).
For categorical predictors, these imputed values are \code{jitter}ed so
that their frequencies can be visualized.  When \code{n.impute} is used,
each NA will have \code{n.impute} tick marks.
}
\item{n.impute}{
number of multiple imputations.  If omitted, single predicted expected
value imputation is used.  \code{n.impute=5} is frequently recommended.
}
\item{boot.method}{
default is to use the approximate Bayesian bootstrap (sample with
replacement from sample with replacement of the vector of residuals).
You can also specify \code{boot.method="simple"} to use the usual
bootstrap one-stage sampling with replacement.
}
\item{trantab}{
Set to \code{TRUE} to add an attribute \code{trantab} to the returned matrix.  This
contains a vector of lists each with components \code{x} and \code{y} containing
the unique values and corresponding transformed values for the
columns of \code{x}.  This is set up to be used easily with the \code{approx}
function.  You must specify \code{trantab=TRUE} if you want to later use the
\code{predict.transcan} function with \code{type="original"}.
}
\item{transformed}{
set to \code{TRUE} to cause \code{transcan} to return an object \code{transformed}
containing the matrix of transformed variables
}
\item{impcat}{
This argument tells how to impute categorical variables on the original
scale. 
The default is \code{impcat="score"} to impute the category
whose canonical variate score is closest to the predicted score.
Use \code{impcat="tree"} to impute categorical variables using the
\code{tree()} function, using the values of all other transformed
predictors.  \code{impcat="rpart"} will use \code{rpart}.  A better but somewhat
slower approach is to use \code{impcat="multinom"} to fit a multinomial
logistic model to the categorical variable, at the last iteration of
the \code{transcan} algorithm.  This uses the \code{multinom} function in the
\code{nnet} library of the \code{MASS} package (which is assumed to have been
installed by the user) to fit a polytomous logistic model to the
current working transformations of all the other variables (using
conditional mean imputation for missing predictors).  Multiple
imputations are made by drawing multinomial values from the vector of
predicted probabilities of category membership for the missing
categorical values.
}
\item{mincut}{
If \code{imputed=TRUE}, there are categorical variables, and \code{impcat="tree"},
\code{mincut} specifies the lowest node size that will be allowed to be
split by \code{tree}.  The default is 40.
}
\item{inverse}{
By default, imputed values are back-solved on the original scale using
inverse linear interpolation on the fitted tabulated transformed values.
This will cause distorted distributions of imputed values (e.g., floor
and ceiling effects) when the estimated transformation has a flat or
nearly flat section.  To instead use the \code{invertTabulated} function
(see above) with the \code{"sample"} option, specify \code{inverse="sample"}.
}
\item{tolInverse}{
the multiplier of the range of transformed values, weighted by \code{freq}
and by the distance measure, for determining the set of x 
values having y values within a tolerance of the value of \code{aty} in
\code{invertTabulated}.  For \code{predict.transcan}, \code{inverse} and
\code{tolInverse} are obtained from options that were specified to
\code{transcan} by default.  Otherwise, if not specified by the user, these
default to the defaults used to \code{invertTabulated}.
}
\item{pr}{
For \code{transcan}, set to \code{FALSE} to suppress printing r-squares
and shrinkage factors.  For \code{impute.transcan} set to \code{FALSE}
to suppress messages concerning the number of NAs imputed, or for
\code{fit.mult.impute} set to \code{FALSE} to suppress printing variance
inflation factors accounting for imputation, rate of missing
information, and degrees of freedom.
}
\item{pl}{
Set to \code{FALSE} to suppress plotting the final transformations with 
distribution of scores for imputed values (if \code{show.na=TRUE}).
}
\item{allpl}{
Set to \code{TRUE} to plot transformations for intermediate iterations.
}
\item{show.na}{
Set to \code{FALSE} to suppress the distribution of scores assigned to
missing values (as tick marks on the right margin of each graph).
See also \code{imputed}.
}
\item{imputed.actual}{
The default is \code{"none"} to suppress plotting of actual vs. imputed
values for all variables having any NAs.   Other choices are
\code{"datadensity"} to use \code{datadensity} to make a single plot, \code{"hist"}
to make a series of back-to-back histograms, \code{"qq"} to make a series
of q-q plots, or \code{"ecdf"} to make a series of empirical cdfs.  For
\code{imputed.actual="datadensity"} for example you get 
a rug plot of the non-missing values for the variable with beneath it
a rug plot of the imputed values.
When \code{imputed.actual} is not \code{"none"}, \code{imputed} is automatically set
to \code{TRUE}.
}
\item{iter.max}{
maximum number of iterations to perform for \code{transcan} or \code{predict}.
For \code{predict}, only one iteration is used if there
are no NAs in the data or if \code{imp.con} was used.
}
\item{eps}{
convergence criterion for \code{transcan} and \code{predict}.  \code{eps} is the
maximum change in transformed values from one iteration to the next. 
If for a given iteration all new transformations
of variables differ by less than \code{eps} (with or without negating the
transformation to allow for "flipping") from the transformations in
the previous iteration, one more iteration is done for \code{transcan}.  
During this
last iteration, individual transformations are not updated but
coefficients of transformations are.  This improves stability of
coefficients of canonical variates on the right-hand-side.
\code{eps} is ignored when \code{rhsImp="random"}.
}
\item{curtail}{
for \code{transcan}, causes imputed values on the transformed scale to
be truncated so that their ranges are within the ranges of 
non-imputed transformed values.
For \code{predict}, \code{curtail} defaults to \code{TRUE} to truncate predicted transformed
values to their ranges in the original fit (\code{xt}).
}
\item{imp.con}{
for \code{transcan}, set to \code{TRUE} to impute NAs on the original scales with
constants (medians or most frequent category codes).  Set to a vector
of constants to instead always use these constants for imputation.
These imputed values are ignored when fitting the current working
transformation for a single variable.
}
\item{shrink}{
default is \code{FALSE} to use ordinary least squares or canonical variate estimates.
For the purposes of imputing NAs, you may want to set \code{shrink=TRUE} to avoid
overfitting when developing a prediction equation to predict each variable
from all the others (see details below).
}
\item{init.cat}{
method for initializing scorings of categorical variables.  Default is
\code{"mode"} to use a dummy variable set to 1 if the value is the most
frequent value (this is the default).  
Use \code{"random"} to use a random 0-1 variable.  Set
to \code{"asis"} to use the original integer codes as starting scores.
}
\item{nres}{
number of residuals to store if \code{n.impute} is specified.  If the
dataset has fewer than \code{nres} observations, all residuals are saved.
Otherwise a random sample of the residuals of length \code{nres} without
replacement is saved.  The default for \code{nres} is higher if
\code{boot.method="approximate bayesian"}.
}
\item{data}{
}
\item{subset}{an integer or logical vector specifying the subset of
  observations to fit}
\item{na.action}{
These may be used if \code{x} is a formula.  The default \code{na.action} is
\code{na.retain} (defined by \code{transcan}) which keeps all observations with
any \code{NA}s.
For \code{impute.transcan}, \code{data} is a data frame to use as the source of
variables to be imputed, rather than using \code{where.in}.  For
\code{fit.mult.impute}, \code{data} is mandatory and is a data frame containing
the data to be used in fitting the model but before imputations
are applied.  Variables omitted from \code{data} are assumed to be
available from frame 1 and do not need to be imputed.
}
\item{treeinfo}{
Set to \code{TRUE} to get additional information printed when \code{impcat="tree"},
such as the predicted probabilities of category membership.
}
\item{rhsImp}{
Set to \code{"random"} to use random draw imputation when a sometimes
missing variable is moved to be a predictor of other sometimes missing
variables.  Default is \code{rhsImp="mean"}, which uses conditional mean
imputation on the transformed scale.  Residuals used are residuals
from the transformed scale.  When \code{"random"} is used, \code{transcan} runs
5 iterations and ignores \code{eps}.
}
\item{details.impcat}{set to a character scalar that is the name of a
	category variable to include in the resulting \code{transcan} object
	an element \code{details.impcat} containing details of how the
	categorical variable was multiply imputed.}
\item{...}{
arguments passed to \code{scat1d} or to the \code{fitter} function (for
\code{fit.mult.impute})
}
\item{long}{
for \code{summary}, set to \code{TRUE} to print all imputed values.
For \code{print}, set to \code{TRUE} to print details of transformations/imputations.
}
\item{var}{
For \code{impute}, is a variable that was originally a column in \code{x}, for
which imputed values are to be filled in. \code{imputed=TRUE} must have been
used in \code{transcan}.  Omit \code{var} to impute all variables, creating new
variables in \code{search} position \code{where.out}.
}
\item{imputation}{
specifies which of the multiple imputations to use for filling in NAs
}
\item{name}{
name of variable to impute, for \code{impute()}.  Default is character
string version of the second argument (\code{var}) in the call to
\code{impute}. For \code{invertTabulated}, is the name of variable being
transformed (used only for warning messages).
}
\item{where.in}{
location in \code{search} list to find variables that need to be imputed, when
all variables are to be imputed automatically by \code{impute.transcan}
(i.e., when no input variable name is specified).
Default is first \code{search} position that contains the first variable to
be imputed.
}
\item{where.out}{
location in the \code{search} list for storing variables with missing values
set to imputed values, for \code{impute.transcan} when all variables with
missing values are being imputed automatically.
}
\item{frame.out}{
Instead of specifying \code{where.out} you can specify an S frame
number into which individual new imputed variables will be written.
For example, \code{frame.out=1} is useful for putting new variables into a
temporary local frame when \code{impute} is called within another function
(see \code{fit.mult.impute}).  See \code{assign} for details about frames.
}
\item{list.out}{
If \code{var} is not specified, you can set \code{list.out=TRUE} to have
\code{impute.transcan} return a list containing variables with needed
values imputed.  This list will contain a single imputation.
}
\item{check}{
set to \code{FALSE} to suppress certain warning messages
}
\item{newdata}{
a new data matrix for which to compute transformed variables.
Categorical variables must use the same integer codes as were used
in the call to \code{transcan}.  If a formula was originally specified to
\code{transcan} (instead of a data matrix), \code{newdata} is optional and if
given must be a data frame; a model
frame is generated automatically from the previous formula.  The
\code{na.action} is handled automatically, and the levels for factor variables
must be the same and in the same order as were used in the original
variables specified in the formula given to \code{transcan}.
}
\item{fit.reps}{
set to \code{TRUE} to save all fit objects from the fit for each imputation in
\code{fit.mult.impute}.  Then the object returned will have a component
\code{fits} which is a list whose \code{i}th element is the \code{i}th fit object.
}
\item{derived}{
an expression containing S expressions for computing derived
variables that are used in the model formula.  This is useful when
multiple imputations are done for component variables but the actual
model uses combinations of these (e.g., ratios or other derivations).
For a single derived variable you can specify for example
\code{derived=expression(ratio <- weight/height)}.  For multiple derived
variables use the form \code{derived=expression(\{ratio <- weight/height;
product <- weight*height\})} or put the expression on separate input
lines.   To monitor the multiply-imputed derived
variables you can add to the \code{expression} a command such as
\code{print(describe(ratio))}.  See the example below.
}
\item{type}{
By default, the matrix of transformed variables is returned, with imputed
values on the transformed scale.  If you had specified \code{trantab=TRUE} to
\code{transcan}, specifying \code{type="original"} does the table look-ups with
linear interpolation to return the input matrix \code{x} but with imputed
values on the original scale inserted for NAs.  For categorical variables,
the method used here is to select 
the category code having a corresponding scaled value closest to the
predicted transformed value.  This corresponds to the default \code{impcat};
a problem in getting predicted
values for \code{tree} objects prevented using \code{tree} for this.  Note:
imputed values thus returned when \code{type="original"} are single
expected value imputations even if \code{n.impute} is given.
}
\item{object}{an object created by  \code{transcan}, or an object to be
  converted to S function code, typically a model fit object of some sort}
\item{prefix}{
}
\item{suffix}{
When creating separate S functions for each variable in \code{x}, the name
of the new function will be \code{prefix} placed in front of the variable name,
and \code{suffix} placed in back of the name.  The default is to use names
of the form \code{.varname}, where \code{varname} is the variable name.
}
\item{where}{
position in \code{search} list at which to store new functions (for \code{Function}).
Default is position 1 in the search list.  See the \code{assign} function for more
documentation on the \code{where} argument.
}
\item{y}{
a vector corresponding to \code{x} for \code{invertTabulated}, if its first
argument \code{x} is not a list
}
\item{freq}{
a vector of frequencies corresponding to cross-classified \code{x} and \code{y}
if \code{x} is not a list.  Default is a vector of ones.
}
\item{aty}{
vector of transformed values at which inverses are desired
}
\item{rule}{
see \code{approx}.  \code{transcan} assumes \code{rule} is always \code{2}
}
\item{regcoef.only}{set to \code{TRUE} to make \code{Varcov.default}
  delete positions in the covariance matrix for any non-regression
  coefficients (e.g., log scale parameter from \code{psm} or \code{survreg})}
}
\value{
For \code{transcan}, a list of class \code{transcan} with elements
\code{call} (with the function call), \code{iter} (number of
iterations done) and \code{rsq} and \code{rsq.adj} containing the R-squares and
adjusted R-squares achieved in predicting each variable from all the
others.  It also has elements \code{categorical}, \code{asis}, \code{coef},
\code{xcoef}, \code{parms}, \code{fillin}, \code{ranges}, \code{scale}, and \code{formula}
containing respectively the values supplied for \code{categorical} and
\code{asis}, the within-variable coefficients used to compute the first
canonical variate, the (possibly shrunk) across-variables coefficients
of the first canonical variate that predicts each variable in turn,
the parameters of the transformation (knots for splines, contrast
matrix for categorical variables), the initial estimates for missing
values (NA if variable never missing), the matrix of ranges of the
transformed variables (min and max in first and second row), a vector
of scales used to determine convergence for a transformation, the
formula (if \code{x} was a formula), and optionally a vector of shrinkage
factors used for predicting each variable from the others.  For
\code{"asis"} variables, the scale is the average absolute difference about
the median.  For other variables it is unity, since canonical
variables are standardized.  For \code{xcoef}, row \code{i} has the coefficients
to predict transformed variable \code{i}, with the column for the
coefficient of variable \code{i} set to NA.  If \code{imputed=TRUE} was given, an
optional element \code{imputed} also appears.  This is a list with the
vector of imputed values (on the original scale) for each variable
containing NAs.  Matrices rather than vectors are returned if
\code{n.impute} is given.  If \code{trantab=TRUE}, the \code{trantab} element also
appears, as described above.  If \code{n.impute > 0}, \code{transcan} also returns
a list \code{residuals} that can be used for future multiple imputation.


\code{impute} returns a vector (the same
length as \code{var}) of class \code{"impute"} with NAs imputed.  \code{predict}
returns a matrix with the same number of columns or variables as were
in \code{x}.


\code{fit.mult.impute} returns a fit object that is a modification of the
fit object created by fitting the completed dataset for the final
imputation.  The \code{var} matrix in the fit object has the
imputation-corrected variance-covariance matrix.  \code{coefficients} is
the average (over imputations) of the coefficient vectors,
\code{variance.inflation.impute} is a vector containing the ratios of
the diagonals of the between-imputation variance matrix to the diagonals
of the average apparent (within-imputation) variance matrix.
\code{missingInfo} is Rubin's "rate of missing information" and
\code{dfmi} is Rubin's degrees of freedom for a t-statistic for testing
a single parameter.  The last two objects are vectors corresponding to
the diagonal of the variance matrix.
}
\section{Side Effects}{
prints, plots, and \code{impute.transcan} creates new variables.
}
\details{
The starting approximation to the transformation for each variable
is taken to be the original coding of the variable.  The initial
approximation for each missing value is taken to be the median of
the non-missing values for the variable (for continuous ones) or
the most frequent category (for categorical ones).  Instead, if \code{imp.con} is
a vector, its values are used for imputing NAs.  When using each
variable as a dependent variable, NAs on that variable cause all
observations to be temporarily deleted.  Once a new working transformation
is found for the variable, along with a model to predict that transformation
from all the other variables, that latter model is used to impute
NAs in the selected dependent variable if \code{imp.con} is not specified.  
When that variable is used
to predict a new dependent variable, the current working imputed values
are inserted.  Transformations are updated after each variable becomes
a dependent variable, so the order of variables on \code{x} could conceivably
make a difference in the final estimates.  For obtaining out-of-sample
predictions/transformations, \code{predict} uses the same iterative
procedure as \code{transcan} for imputation, with the same starting
values for fill-ins as were used by \code{transcan}.  It also (by default)
uses a conservative approach of curtailing transformed variables to
be within the range of the original ones.
Even when \code{method="pc"} is specified, canonical variables are used
for imputing missing values.


Note that fitted transformations, when evaluated at imputed variable
values (on the original scale), will not precisely match the transformed
imputed values returned in \code{xt}.  This is because \code{transcan} uses an
approximate method based on linear interpolation to back-solve for
imputed values on the original scale.


Shrinkage uses the method of Van Houwelingen and Le Cessie (1990) (similar to 
Copas, 1983).  The shrinkage factor is \code{[1-(1-R2)(n-1)/(n-k-1)]/R2}, where
\code{R2} is the apparent R-squared for predicting the variable, \code{n} is the number
of non-missing values, and \code{k} is the effective number of degrees of freedom
(aside from intercepts).  A heuristic estimate is used for \code{k}:
\code{A - 1 + sum(max(0,Bi-1))/m + m}, where 
\code{A} is the number of d.f. required
to represent the variable being predicted, the \code{Bi} are the number of
columns required to represent all the other variables, and \code{m} is the
number of all other variables.  Division by \code{m} is done because the
transformations for the other variables are fixed at their current
transformations the last time they were being predicted.  The \code{+ m} term
comes from the number of coefficients estimated on the right hand side,
whether by least squares or canonical variates.  If a shrinkage factor
is negative, it is set to 0.  The shrinkage factor is the ratio of
the adjusted R-squared to the ordinary R-squared.
The adjusted R-squared is \code{1 - (1 - R2)(n-1)/(n-k-1)}, which is also set to
zero if it is negative.  If \code{shrink=FALSE} and the adjusted R-squares are much 
smaller than
the ordinary R-squares, you may want to run \code{transcan} with \code{shrink=TRUE}.


Canonical variates are scaled to have variance of 1.0, by multiplying canonical
coefficients from \code{cancor} by \code{sqrt(n-1)}.


When specifying a non-Design library fitting function to
\code{fit.mult.impute} (e.g., \code{lm}, \code{glm}), running the result of
\code{fit.mult.impute} through that fit's \code{summary} method will not use the
imputation-adjusted variances.  You may obtain the new variances using
\code{fit$var} or \code{Varcov(fit)}.  


When you specify a Design function to \code{fit.mult.impute} (e.g.,
\code{lrm, ols, cph, psm, bj}), automatically computed transformation
parameters (e.g., knot locations for \code{rcs}) that are estimated for the
first imputation are used for all other imputations.  This ensures
that knot locations will not vary, which would change the meaning of
the regression coefficients.


Warning: even though \code{fit.mult.impute} takes imputation into account
when estimating variances of regression coefficients, it does not take
into account the variation that results from estimation of the shapes
and regression coefficients of the customized imputation equations.
Specifying \code{shrink=TRUE} solves a small part of this problem.  To fully
account for all sources of variation you should consider putting the
\code{transcan} invocation inside a bootstrap or loop, if execution time
allows.  Better still, use \code{aregImpute} or one of the libraries such
as MICE that uses real Bayesian posterior realizations to multiply
impute missing values correctly.


It is strongly recommended that you use the Hmisc \code{naclus} function to
determine if there is a good basis for imputation.  \code{naclus} will tell
you, for example, if systolic blood pressure is missing whenever
diastolic blood pressure is missing.  If the only variable that is
well correlated with diastolic bp is systolic bp, there is no basis
for imputing diastolic bp in this case.


At present, \code{predict} does not work with multiple imputation.


When calling \code{fit.mult.impute} with \code{glm} as the \code{fitter} argument, if
you need to pass a \code{family} argument to \code{glm} do it by quoting the
family, e.g., \code{family="binomial"}.


You should be able to use a variable in the formula given to
\code{fit.mult.impute} as a numeric variable in the regression model even
though it was a factor variable in the invocation of \code{transcan}.  Use
for example \code{fit.mult.impute(y ~ codes(x), lrm, trans)} (thanks to
Trevor Thompson \email{trevor@hp5.eushc.org}).
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
\email{f.harrell@vanderbilt.edu}
}
\references{
Kuhfeld, Warren F: The PRINQUAL Procedure.  SAS/STAT User's Guide, Fourth
Edition, Volume 2, pp. 1265--1323, 1990.

Van Houwelingen JC, Le Cessie S: Predictive value of statistical models.
Statistics in Medicine 9:1303--1325, 1990.

Copas JB: Regression, prediction and shrinkage. JRSS B 45:311--354, 1983.

He X, Shen L: Linear regression after spline transformation.
Biometrika 84:474--481, 1997.

Little RJA, Rubin DB: Statistical Analysis with Missing Data.  New
York: Wiley, 1987.

Rubin DB, Schenker N: Multiple imputation in health-care databases: An
overview and some applications.  Stat in Med 10:585--598, 1991.

Faris PD, Ghali WA, et al: Multiple imputation versus data enhancement
for dealing with missing data in observational health care outcome
analyses.  J Clin Epidem 55:184--191, 2002.
}
\seealso{
\code{\link{aregImpute}}, \code{\link{impute}}, \code{\link{naclus}}, \code{\link{naplot}},
\code{\link[acepack]{ace}}, \code{\link[acepack]{avas}}, \code{\link{cancor}}, \code{\link{prcomp}}, \code{\link{rcspline.eval}}, 
\code{\link{lsfit}}, \code{\link{approx}}, \code{\link{datadensity}}, \code{\link[mice]{mice}}
}
\examples{
\dontrun{
x <- cbind(age, disease, blood.pressure, pH)  
#cbind will convert factor object `disease' to integer
par(mfrow=c(2,2))
x.trans <- transcan(x, categorical="disease", asis="pH",
                    transformed=TRUE, imputed=TRUE)
summary(x.trans)  #Summary distribution of imputed values, and R-squares
f <- lm(y ~ x.trans$transformed)   #use transformed values in a regression
#Now replace NAs in original variables with imputed values, if not
#using transformations
age            <- impute(x.trans, age)
disease        <- impute(x.trans, disease)
blood.pressure <- impute(x.trans, blood.pressure)
pH             <- impute(x.trans, pH)
#Do impute(x.trans) to impute all variables, storing new variables under
#the old names
summary(pH)       #uses summary.impute to tell about imputations
                  #and summary.default to tell about pH overall
# Get transformed and imputed values on some new data frame xnew
newx.trans     <- predict(x.trans, xnew)
w              <- predict(x.trans, xnew, type="original")
age            <- w[,"age"]            #inserts imputed values
blood.pressure <- w[,"blood.pressure"]
Function(x.trans)  #creates .age, .disease, .blood.pressure, .pH()
#Repeat first fit using a formula
x.trans <- transcan(~ age + disease + blood.pressure + I(pH), 
                    imputed=TRUE)
age <- impute(x.trans, age)
predict(x.trans, expand.grid(age=50, disease="pneumonia",
        blood.pressure=60:260, pH=7.4))
z <- transcan(~ age + factor(disease.code),  # disease.code categorical
              transformed=TRUE, trantab=TRUE, imputed=TRUE, pl=FALSE)
plot(z$transformed)
}


# Multiple imputation and estimation of variances and covariances of
# regression coefficient estimates accounting for imputation
set.seed(1)
x1 <- factor(sample(c('a','b','c'),100,TRUE))
x2 <- (x1=='b') + 3*(x1=='c') + rnorm(100)
y  <- x2 + 1*(x1=='c') + rnorm(100)
x1[1:20] <- NA
x2[18:23] <- NA
d <- data.frame(x1,x2,y)
n <- naclus(d)
plot(n); naplot(n)  # Show patterns of NAs
f  <- transcan(~y + x1 + x2, n.impute=10, shrink=FALSE, data=d)
options(digits=3)
summary(f)


f  <- transcan(~y + x1 + x2, n.impute=10, shrink=TRUE, data=d)
summary(f)


h <- fit.mult.impute(y ~ x1 + x2, lm, f, data=d)
# Add ,fit.reps=TRUE to save all fit objects in h, then do something like:
# for(i in 1:length(h$fits)) print(summary(h$fits[[i]]))


diag(Varcov(h))


h.complete <- lm(y ~ x1 + x2, na.action=na.omit)
h.complete
diag(Varcov(h.complete))


# Note: had Design's ols function been used in place of lm, any
# function run on h (anova, summary, etc.) would have automatically
# used imputation-corrected variances and covariances


# Example demonstrating how using the multinomial logistic model
# to impute a categorical variable results in a frequency
# distribution of imputed values that matches the distribution
# of non-missing values of the categorical variable


\dontrun{
set.seed(11)
x1 <- factor(sample(letters[1:4], 1000,TRUE))
x1[1:200] <- NA
table(x1)/sum(table(x1))
x2 <- runif(1000)
z  <- transcan(~ x1 + I(x2), n.impute=20, impcat='multinom')
table(z$imputed$x1)/sum(table(z$imputed$x1))
}


# Example where multiple imputations are for basic variables and
# modeling is done on variables derived from these


set.seed(137)
n <- 400
x1 <- runif(n)
x2 <- runif(n)
y  <- x1*x2 + x1/(1+x2) + rnorm(n)/3
x1[1:5] <- NA
d <- data.frame(x1,x2,y)
w <- transcan(~ x1 + x2 + y, n.impute=5, data=d)
# Add ,show.imputed.actual for graphical diagnostics
\dontrun{
g <- fit.mult.impute(y ~ product + ratio, ols, w,
                     data=data.frame(x1,x2,y),
                     derived=expression({
                       product <- x1*x2
                       ratio   <- x1/(1+x2)
                       print(cbind(x1,x2,x1*x2,product)[1:6,])}))
}


# Here's a method for creating a permanent data frame containing
# one set of imputed values for each variable specified to transcan
# that had at least one NA, and also containing all the variables
# in an original data frame.  The following is based on the fact
# that the default output location for impute.transcan is
# given by where.out=1 (search position 1)


\dontrun{
xt <- transcan(~. , data=mine,
               imputed=TRUE, shrink=TRUE, n.impute=10, trantab=TRUE)
attach(mine, pos=1, use.names=FALSE)
impute(xt, imputation=1) # use first imputation
# omit imputation= if using single imputation
detach(1, 'mine2')
}


# Example of using invertTabulated outside transcan
x    <- c(1,2,3,4,5,6,7,8,9,10)
y    <- c(1,2,3,4,5,5,5,5,9,10)
freq <- c(1,1,1,1,1,2,3,4,1,1)
# x=5,6,7,8 with prob. .1 .2 .3 .4 when y=5
# Within a tolerance of .05*(10-1) all y's match exactly
# so the distance measure does not play a role
set.seed(1)      # so can reproduce
for(inverse in c('linearInterp','sample'))
 print(table(invertTabulated(x, y, freq, rep(5,1000), inverse=inverse)))


# Test inverse='sample' when the estimated transformation is
# flat on the right.  First show default imputations
set.seed(3)
x <- rnorm(1000)
y <- pmin(x, 0)
x[1:500] <- NA
for(inverse in c('linearInterp','sample')) {
par(mfrow=c(2,2))
  w <- transcan(~ x + y, imputed.actual='hist',
                inverse=inverse, curtail=FALSE,
                data=data.frame(x,y))
  if(inverse=='sample') next
# cat('Click mouse on graph to proceed\n')
# locator(1)
}
}
\keyword{smooth}
\keyword{regression}
\keyword{multivariate}
\keyword{methods}
\keyword{models}
\concept{bootstrap}
% Converted by Sd2Rd version 1.21.

\eof
\name{translate}
\alias{translate}
\title{
Translate Vector or Matrix of Text Strings
}
\description{
Uses the UNIX tr command to translate any character in \code{old} in
\code{text} to the corresponding character in \code{new}.  If \code{multichar=TRUE}
or \code{old} and \code{new} have more than one element, or each have one element
but they have different numbers of characters,
uses the UNIX \code{sed} command to translate the series of characters in
\code{old} to the series in \code{new} when these characters occur in \code{text}.
If \code{old} or \code{new} contain a backslash, you sometimes have to quadruple
it to make the UNIX command work. If they contain a forward slash,
precede it by two backslashes.  The Microsoft Windows version of
\code{translate} invokes the \code{sedit()} function and does not allow
\code{multichar=FALSE}, i.e., it does not support the UNIX \code{tr} function.
The R version of \code{translate} invokes the builtin \code{chartr} function if
\code{multichar=FALSE}. 
}
\usage{
translate(text, old, new, multichar=FALSE)
}
\arguments{
\item{text}{
scalar, vector, or matrix of character strings to translate.
}
\item{old}{
vector of old characters
}
\item{new}{
corresponding vector of new characters
}
\item{multichar}{See above.}
}
\value{
an object like text but with characters translated
}
\seealso{
unix, grep
}
\details{
At present, \code{multichar=FALSE}, which requires the UNIX \code{tr} program, is not
implemented under MS Windows.
}
\examples{
translate(c("ABC","DEF"),"ABCDEFG", "abcdefg")
translate("23.12","[.]","\\\\\\cdot ") # change . to \\cdot
translate(c("dog","cat","tiger"),c("dog","cat"),c("DOG","CAT"))
# S-Plus gives  [1] "DOG"   "CAT"   "tiger" - check discrepancy
translate(c("dog","cat2","snake"),c("dog","cat"),"animal")
# S-Plus gives  [1] "animal"  "animal2" "snake" 
}
\keyword{character}
% Converted by Sd2Rd version 1.21.

\eof
\name{units}
\alias{units}
\alias{units.default}
\alias{units<-}
\title{
Units Attribute of a Vector  
}
\description{
Sets or retrieves the \code{"units"} attribute of an object.
\code{units.default} replaces the builtin
version, which only works for time series objects.  If the variable is
also given a \code{label}, subsetting (using \code{[.labelled}) will
retain the \code{"units"} attribute.  For S-Plus 6 which uses version 4
of the S language, the latter does not work.
}
\usage{
units(x, ...)
\method{units}{default}(x, none='', \dots)
units(x) <- value
}
\arguments{
\item{x}{any object}
\item{\dots}{ignored}
\item{value}{the units of the object, or ""}
\item{none}{value to which to set result if no appropriate attribute is
  found}
}
\value{
the units attribute of x, if any; otherwise, the \code{units} attribute of
the \code{tspar} attribute of \code{x} if any; otherwise the value \code{none}
}
\seealso{\code{\link{label}}}
\examples{
fail.time <- c(10,20)
units(fail.time) <- "Day"
describe(fail.time)
label(fail.time) <- 'Failure Time'
fail.time
\dontrun{
f <- cph(Surv(fail.time, event) ~ xx)
plot(xx,xx2,xlab=paste(label(xx),", ",units(xx),"s",sep=""))
}
}
\keyword{utilities}
\keyword{interface}
% Converted by Sd2Rd version 1.21.

\eof
\name{upData}
\alias{cleanup.import}
\alias{upData}
\alias{exportDataStripped}
\alias{csv.get}
\title{
Update a Data Frame or Cleanup a Data Frame after Importing
}
\description{
\code{cleanup.import} will correct errors and shrink
the size of data frames created by the S-Plus \code{File \dots Import}
dialog or by other methods such as \code{scan} and \code{read.table}.  By
default, double precision numeric variables are changed to single
precision (S-Plus only) or to integer when they contain no fractional
components. 
Infinite values or values greater than 1e20 in absolute value are set
to NA.  This solves problems of importing Excel spreadsheets that
contain occasional character values for numeric columns, as S-Plus
converts these to \code{Inf} without warning.  There is also an option to
convert variable names to lower case and to add labels to variables.
The latter can be made easier by importing a CNTLOUT dataset created
by SAS PROC CONTENTS and using the \code{sasdict} option as shown in the
by SAS PROC FORMAT and using the \code{sasdict} option as shown in the
example below.  \code{cleanup.import} can also transform character or
factor variables to dates.

\code{upData} is a function facilitating the updating of a data frame
without attaching it in search position one.  New variables can be
added, old variables can be modified, variables can be removed or renamed, and
\code{"labels"} and \code{"units"} attributes can be provided.  Various checks
are made for errors and inconsistencies, with warnings issued to help
the user.  Levels of factor variables
can be replaced, especially using the \code{list} notation of the standard
\code{merge.levels} function.  Unless \code{force.single} is set to \code{FALSE},
\code{upData} also converts double precision vectors to single precision
(if not under R), or to integer if no fractional values are present in
a vector.

Both \code{cleanup.import} and \code{upData} will fix a problem with
data frames created under S-Plus before version 5 that are used in S-Plus 5 or
later.  The problem was caused by use of the \code{label} function
to set a variable's class to \code{"labelled"}.  These classes are
removed as the S version 4 language does not support multiple
inheritance.  Failure to run data frames through one of the two
functions when these conditions apply will result in simple numeric
variables being set to \code{factor} in some cases.  Extraneous \code{"AsIs"}
classes are also removed.

For S-Plus, a function \code{exportDataStripped} is provided that allows
exporting of data to other systems 
by removing attributes \code{label, imputed, format, units}, and
\code{comment}.  It calls \code{exportData} after stripping these
attributes.  Otherwise \code{exportData} will fail.

\code{csv.get} reads comma-separated text data files, allowing optional
translation to lower case for variable names after making them valid S
names.  Original possibly non-legal names are taken to be variable
labels.  Character or factor variables containing dates can be converted
to date variables.  \code{cleanup.import} is invoked to finish the job.
}
\usage{
cleanup.import(obj, labels, lowernames=FALSE, 
               force.single=TRUE, force.numeric=TRUE, rmnames=TRUE,
               big=1e20, sasdict, pr, datevars=NULL, dateformat='\%d\%b\%Y')

upData(object, \dots, 
       rename, drop, labels, units, levels,
       force.single=TRUE, lowernames=FALSE, moveUnits=FALSE)

exportDataStripped(data, \dots)

csv.get(file, lowernames=FALSE, datevars=NULL, dateformat='\%d\%b\%Y', \dots)
}
\arguments{
\item{obj}{a data frame or list}
\item{object}{a data frame or list}
\item{data}{a data frame}
\item{force.single}{
By default, double precision variables are converted to single precision
(in S-Plus only) unless \code{force.single=FALSE}.
\code{force.single=TRUE} will also convert vectors having only integer
values to have a storage mode of integer, in R or S-Plus.
}
\item{force.numeric}{
Sometimes importing will cause a numeric variable to be
changed to a factor vector.  By default, \code{cleanup.import} will check
each factor variable to see if the levels contain only numeric values
and \code{""}.  In that case, the variable will be converted to numeric,
with \code{""} converted to NA.  Set \code{force.numeric=FALSE} to prevent
this behavior. 
}
\item{rmnames}{
set to \code{FALSE} to not have \code{cleanup.import} remove \code{names} or \code{.Names}
attributes from variables
}
\item{labels}{
a character vector the same length as the number of variables in
\code{obj}.  These character values are taken to be variable labels in the
same order of variables in \code{obj}.
For \code{upData}, \code{labels} is a named list or named vector with variables
in no specific order.
}
\item{lowernames}{
set this to \code{TRUE} to change variable names to lower case.
\code{upData} does this before applying any other changes, so variable
names given inside arguments to \code{upData} need to be lower case if
\code{lowernames==TRUE}. 
}
\item{big}{
a value such that values larger than this in absolute value are set to
missing by \code{cleanup.import}
}
\item{sasdict}{
the name of a data frame containing a raw imported SAS PROC CONTENTS
CNTLOUT= dataset.  This is used to define variable names and to add
attributes to the new data frame specifying the original SAS dataset
name and label.
}
\item{pr}{
set to \code{TRUE} or \code{FALSE} to force or prevent printing of the current
variable number being processed.  By default, such messages are printed if the
product of the number of variables and number of observations in \code{obj}
exceeds 500,000.
}
\item{datevars}{character vector of names (after \code{lowernames} is
  applied) of variables to consider as a factor or character vector
  containing dates in a format matching \code{dateformat}}
\item{dateformat}{for \code{cleanup.import} is the input format (see
  \code{\link{strptime}})}
\item{\dots}{
for \code{upData}, one or more expressions of the form
\code{variable=expression}, to derive new variables or change old ones.
For \code{exportDataStripped}, optional arguments that are passed to
\code{exportData}.  For \code{csv.get}, arguments to pass to
\code{read.csv}.
}
\item{rename}{
list or named vector specifying old and new names for variables.  Variables are
renamed before any other operations are done.  For example, to rename
variables \code{age} and \code{sex} to respectively \code{Age} and
\code{gender}, specify \code{rename=list(age="Age", sex="gender")} or
\code{rename=c(age=\dots)}. 
}
\item{drop}{
a vector of variable names to remove from the data frame
}
\item{units}{
a named vector or list defining \code{"units"} attributes of variables, in no
specific order
}
\item{levels}{
a named list defining \code{"levels"} attributes for factor variables, in
no specific order.  The values in this list may be character vectors
redefining \code{levels} (in order) or another list (see
\code{merge.levels} if using S-Plus).
}
\item{moveUnits}{
  set to \code{TRUE} to look for units of measurements in variable
  labels and move them to a \code{"units"} attribute.  If an expression
  in a label is enclosed in parentheses or brackets it is assumed to be
  units if \code{moveUnits=TRUE}.
}
\item{file}{a file name to import}
}
\value{a new data frame}
\author{
Frank Harrell, Vanderbilt University
}
\seealso{
\code{\link{sas.get}}, \code{\link{data.frame}}, \code{\link{describe}},
\code{\link{label}}, \code{\link{read.csv}}, \code{\link{strptime}},
\code{\link{POSIXct}}
}
\examples{
\dontrun{
dat <- read.table('myfile.asc')
dat <- cleanup.import(dat)
}
dat <- data.frame(a=(1:3)/7, y=c('a','b1','b2'), z=1:3)
dat2 <- upData(dat, x=x^2, x=x-5, m=x/10, 
               rename=c(a='x'), drop='z',
               labels=c(x='X', y='test'),
               levels=list(y=list(a='a',b=c('b1','b2'))))
dat2
describe(dat2)
dat <- dat2    # copy to original name and delete dat2 if OK
rm(dat2)

# If you import a SAS dataset created by PROC CONTENTS CNTLOUT=x.datadict,
# the LABELs from this dataset can be added to the data.  Let's also
# convert names to lower case for the main data file
\dontrun{
mydata2 <- cleanup.import(mydata2, lowernames=TRUE, sasdict=datadict)
}
}
\keyword{data}
\keyword{manip}



\eof
\name{varclus}
\alias{varclus}
\alias{print.varclus}
\alias{plot.varclus}
\alias{naclus}
\alias{naplot}
\alias{combine.levels}
\alias{plotMultSim}
\alias{na.pattern}
\title{
Variable Clustering
}
\description{
Does a hierarchical cluster analysis on variables, using the Hoeffding
D statistic, squared Pearson or Spearman correlations, or proportion
of observations for which two variables are both positive as similarity
measures.  Variable clustering is used for assessing collinearity,
redundancy, and for separating variables into clusters that can be
scored as a single variable, thus resulting in data reduction.  For
computing any of the three similarity measures, pairwise deletion of
NAs is done.  The clustering is done by \code{hclust()}.  A small function
\code{naclus} is also provided which depicts similarities in which
observations are missing for variables in a data frame.  The
similarity measure is the fraction of \code{NAs} in common between any two
variables.  The diagonals of this \code{sim} matrix are the fraction of NAs
in each variable by itself.  \code{naclus} also computes \code{na.per.obs}, the
number of missing variables in each observation, and \code{mean.na}, a
vector whose ith element is the mean number of missing variables other
than variable i, for observations in which variable i is missing.  The
\code{naplot} function makes several plots (see the \code{which} argument).

So as to not generate too many dummy variables for multi-valued
character or categorical predictors, \code{varclus} will automatically
combine infrequent cells of such variables using an auxiliary
function \code{combine.levels} that is defined here.

\code{plotMultSim} plots multiple similarity matrices, with the similarity
measure being on the x-axis of each subplot.

\code{na.pattern} prints a frequency table of all combinations of
missingness for multiple variables.  If there are 3 variables, a
frequency table entry labeled \code{110} corresponds to the number of
observations for which the first and second variables were missing but
the third variable was not missing.
}
\usage{
varclus(x, similarity=c("spearman","pearson","hoeffding","bothpos","ccbothpos"),
        type=c("data.matrix","similarity.matrix"), 
        method=if(.R.)"complete" else "compact",
        data, subset, na.action, minlev=0.05)
\method{print}{varclus}(x, ...)
\method{plot}{varclus}(x, ylab, abbrev=FALSE, legend.=FALSE, loc, maxlen, labels, \dots)

naclus(df, method)
naplot(obj, which=c('all','na per var','na per obs','mean na',
                    'na per var vs mean na'), \dots)

combine.levels(x, minlev=.05)

plotMultSim(s, x=1:dim(s)[3],
            slim=range(pretty(c(0,max(s,na.rm=TRUE)))),
            slimds=FALSE,
            add=FALSE, lty=par('lty'), col=par('col'),
            lwd=par('lwd'), vname=NULL, h=.5, w=.75, u=.05,
            labelx=TRUE, xspace=.35)

na.pattern(x)
}
\arguments{
\item{x}{
a formula,
a numeric matrix of predictors, or a similarity matrix.  If \code{x} is
a formula, \code{model.matrix} is used to convert it to a design matrix.
If the formula excludes an intercept (e.g., \code{~ a + b -1}),
the first categorical (\code{factor}) variable in the formula will have
dummy variables generated for all levels instead of omitting one for
the first level. For \code{combine.levels}, \code{x} is a character, category,
or factor vector (or other vector that is converted to factor).  For
\code{plot} and \code{print}, \code{x} is an object created by
\code{varclus}.  For \code{na.pattern}, \code{x} is a list, data frame,
or numeric matrix.

For \code{plotMultSim}, \code{x} is a numeric vector specifying the ordered
unique values on the x-axis, corresponding to the third dimension of
\code{s}.
}
\item{df}{a data frame}
\item{s}{
an array of similarity matrices.  The third dimension of this array
corresponds to different computations of similarities.  The first two
dimensions come from a single similarity matrix.  This is useful for
displaying similarity matrices computed by \code{varclus}, for example.  A
use for this might be to show pairwise similarities of variables
across time in a longitudinal study (see the example below).  If
\code{vname} is not given, \code{s} must have \code{dimnames}.
}
\item{similarity}{
the default is to use squared Spearman correlation coefficients, which
will detect monotonic but nonlinear relationships.  You can also
specify linear correlation or Hoeffding's (1948) D statistic, which
has the advantage of being sensitive to many types
of dependence, including highly non-monotonic relationships.  For
binary data, or data to be made binary, \code{similarity="bothpos"} uses as
a similarity measure the proportion of observations for which two
variables are both positive.  \code{similarity="ccbothpos"} uses a
chance-corrected measure which is the proportion of observations for
which both variables are positive minus the product of the two
marginal proportions.  This difference is expected to be zero under
independence.  For diagonals, \code{"ccbothpos"} still uses the proportion
of positives for the single variable.  So \code{"ccbothpos"} is not really
a similarity measure, and clustering is not done.  This measure is
useful for plotting with \code{plotMultSim} (see the last example).
}
\item{type}{
if \code{x} is not a formula, it may be a data matrix or a similarity matrix.
By default, it is assumed to be a data matrix.
}
\item{method}{
see \code{hclust}.  The default, for both \code{varclus} and \code{naclus}, is
\code{"compact"} (for \R it is \code{"complete"}).
}
\item{data}{
}
\item{subset}{
}
\item{na.action}{
These may be specified if \code{x} is a formula.  The default \code{na.action} is
\code{na.retain}, defined by \code{varclus}.  This causes all observations to
be kept in the model frame, with later pairwise deletion of \code{NA}s.
}
\item{ylab}{
y-axis label.  Default is constructed on the basis of \code{similarity}.
}
\item{abbrev}{
set to \code{TRUE} to abbreviate variable names for plotting.  Is set to \code{TRUE}
automatically if \code{legend.=TRUE}.
}
\item{legend.}{
set to \code{TRUE} to plot a legend defining the abbreviations
}
\item{loc}{
a list with elements \code{x} and \code{y} defining coordinates of the
upper left corner of the legend.  Default is \code{locator(1)}.
}
\item{maxlen}{
if a legend is plotted describing abbreviations, original labels
longer than \code{maxlen} characters are truncated at \code{maxlen}.
}
\item{labels}{
a vector of character strings containing labels corresponding to
columns in the similarity matrix, if the column names of that matrix are
not to be used
}
\item{...}{
passed to \code{plclust} (or to \code{dotchart} or \code{dotchart2} for \code{naplot}).
}
\item{obj}{an object created by \code{naclus}}
\item{which}{
defaults to \code{"all"} meaning to have \code{naplot} make 4 separate
plots.  To 
make only one of the plots, use \code{which="na per var"} (dot chart of
fraction of NAs for each variable), \code{"na per obs"} (dot chart showing
frequency distribution of number of variables having NAs in an
observation), \code{"mean na"} (dot chart showing mean number of other
variables missing when the indicated variable is missing), or 
\code{"na per var vs mean na"}, a scatterplot showing on the x-axis the
fraction of NAs in the variable and on the y-axis the mean number of
other variables that are NA when the indicated variable is NA.
}
\item{minlev}{
the minimum proportion of observations in a cell before that cell is
combined with one or more cells.  If more than one cell has fewer than
minlev*n observations, all such cells are combined into a new cell
labeled \code{"OTHER"}.  Otherwise, the lowest frequency cell is combined
with the next lowest frequency cell, and the level name is the
combination of the two old level names.
}
\item{slim}{
2-vector specifying the range of similarity values for scaling the
y-axes.  By default this is the observed range over all of \code{s}.
}
\item{slimds}{set \code{slimds} to \code{TRUE} to scale diagonals and
off-diagonals separately}
\item{add}{
set to \code{TRUE} to add similarities to an existing plot (usually
specifying \code{lty} or \code{col})
}
\item{lty}{
}
\item{col}{
}
\item{lwd}{
line type, color, or line thickness for \code{plotMultSim}
}
\item{vname}{
optional vector of variable names, in order, used in \code{s}
}
\item{h}{
relative height for subplot
}
\item{w}{
relative width for subplot
}
\item{u}{
relative extra height and width to leave unused inside the subplot.
Also used as the space between y-axis tick mark labels and graph border.
}
\item{labelx}{
  set to \code{FALSE} to suppress drawing of labels in the x direction
}
\item{xspace}{
  amount of space, on a scale of 1:\code{n} where \code{n} is the number
  of variables, to set aside for y-axis labels
}
}
\value{
for \code{varclus} or \code{naclus}, a list of class \code{varclus} with elements
\code{call} (containing the calling statement), \code{sim} (similarity matrix),
\code{n} (sample size used if \code{x} was not a correlation matrix already -
\code{n} is a matrix), \code{hclust}, the object created by \code{hclust},
\code{similarity}, and \code{method}.  For \code{plot}, returns the object created by
\code{plclust}.  \code{naclus} also returns the two vectors listed under
description, and \code{naplot} returns an invisible vector that is the
frequency table of the number of missing variables per observation.
\code{plotMultSim} invisibly returns the limits of similarities used in
constructing the y-axes of each subplot.  For \code{similarity="ccbothpos"}
the \code{hclust} object is \code{NULL}.

\code{na.pattern} creates an integer vector of frequencies.
}
\details{
\code{options(contrasts= c("contr.treatment", "contr.poly"))} is issued 
temporarily by \code{varclus} to make sure that ordinary dummy variables
are generated for \code{factor} variables.  If a categorical or character
variable has no level containing at least a fraction \code{minlev} of the
data, that variable is omitted from consideration and a warning is
printed.
}
\author{
Frank Harrell
\cr
Department of Biostatistics, Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\section{Side Effects}{
plots
}
\references{
Sarle, WS: The VARCLUS Procedure.  SAS/STAT User's Guide, 4th Edition,
1990.  Cary NC: SAS Institute, Inc.


Hoeffding W. (1948): A non-parametric test of independence.  Ann Math Stat
19:546--57.
}
\seealso{
\code{\link{hclust}}, \code{\link{plclust}}, \code{\link{hoeffd}}, \code{\link{rcorr}}, \code{\link{cor}}, \code{\link{model.matrix}},
\code{\link{locator}}, \code{\link{na.pattern}}
}
\examples{
set.seed(1)
x1 <- rnorm(200)
x2 <- rnorm(200)
x3 <- x1 + x2 + rnorm(200)
x4 <- x2 + rnorm(200)
x <- cbind(x1,x2,x3,x4)
v <- varclus(x, similarity="spear")  # spearman is the default anyway
v    # invokes print.varclus
print(round(v$sim,2))
plot(v)


# plot(varclus(~ age + sys.bp + dias.bp + country - 1), abbrev=TRUE)
# the -1 causes k dummies to be generated for k countries
# plot(varclus(~ age + factor(disease.code) - 1))
#


df <- data.frame(a=c(1,2,3),b=c(1,2,3),c=c(1,2,NA),d=c(1,NA,3),
                 e=c(1,NA,3),f=c(NA,NA,NA),g=c(NA,2,3),h=c(NA,NA,3))
par(mfrow=c(2,2))
for(m in if(.R.)c("ward","complete","median") else 
                c("compact","connected","average")) {
  plot(naclus(df, method=m))
  title(m)
}
naplot(naclus(df))
n <- naclus(df)
plot(n); naplot(n)
na.pattern(df)      # builtin function


x <- c(1, rep(2,11), rep(3,9))
combine.levels(x)
x <- c(1, 2, rep(3,20))
combine.levels(x)


# plotMultSim example: Plot proportion of observations
# for which two variables are both positive (diagonals
# show the proportion of observations for which the
# one variable is positive).  Chance-correct the
# off-diagonals by subtracting the product of the
# marginal proportions.  On each subplot the x-axis
# shows month (0, 4, 8, 12) and there is a separate
# curve for females and males
d <- data.frame(sex=sample(c('female','male'),1000,TRUE),
                month=sample(c(0,4,8,12),1000,TRUE),
                x1=sample(0:1,1000,TRUE),
                x2=sample(0:1,1000,TRUE),
                x3=sample(0:1,1000,TRUE))
s <- array(NA, c(3,3,4))
opar <- par(mar=c(0,0,4.1,0))  # waste less space
for(sx in c('female','male')) {
  for(i in 1:4) {
    mon <- (i-1)*4
    s[,,i] <- varclus(~x1 + x2 + x3, sim='ccbothpos', data=d,
                      subset=month==mon & sex==sx)$sim
    }
  plotMultSim(s, c(0,4,8,12), vname=c('x1','x2','x3'),
              add=sx=='male', slimds=TRUE,
              lty=1+(sx=='male'))
  # slimds=TRUE causes separate  scaling for diagonals and
  # off-diagonals
}
par(opar)
}
\keyword{cluster}
\keyword{multivariate}
\keyword{category}
\keyword{manip}



\eof
\name{wtd.mean}
\alias{wtd.mean}
\alias{wtd.var}
\alias{wtd.quantile}
\alias{wtd.ecdf}
\alias{wtd.table}
\alias{wtd.rank}
\alias{wtd.loess.noiter}
\alias{num.denom.setup}
\title{
Weighted Statistical Estimates
}
\description{
These functions compute various weighted versions of standard
estimators.  In most cases the \code{weights} vector is a vector the same
length as \code{x}, containing frequency counts that in effect expand \code{x}
by these counts.  \code{weights} can also be sampling weights, in which
case setting \code{normwt} to \code{TRUE} will often be appropriate.  This results in
making \code{weights} sum to the length of the non-missing elements in
\code{x}.  \code{normwt=TRUE} thus reflects the fact that the true sample size is
the length of the \code{x} vector and not the sum of the original values of
\code{weights} (which would be appropriate had \code{normwt=FALSE}).  When \code{weights}
is all ones, the estimates are all identical to unweighted estimates
(unless one of the non-default quantile estimation options is
specified to \code{wtd.quantile}).  When missing data have already been
deleted for \code{x}, \code{weights}, and (in the case of \code{wtd.loess.noiter}) \code{y},
specifying \code{na.rm=FALSE} will save computation time.  Omitting the
\code{weights} argument or specifying \code{NULL} or a zero-length vector will
result in the usual unweighted estimates.

\code{wtd.mean}, \code{wtd.var}, and \code{wtd.quantile} compute
weighted means, variances, and quantiles, respectively.  \code{wtd.ecdf}
computes a weighted empirical distribution function.  \code{wtd.table}
computes a weighted frequency table (although only one stratification
variable is supported at present).  \code{wtd.rank} computes weighted
ranks, using midranks for ties.  This can be used to obtain Wilcoxon
tests and rank correlation coefficients.  \code{wtd.loess.noiter} is a
weighted version of \code{loess.smooth} when no iterations for outlier
rejection are desired. This results in especially good smoothing when
\code{y} is binary.

\code{num.denom.setup} is a utility function that allows one to deal with
observations containing numbers of events and numbers of trials, by
outputting two observations when the number of events and non-events
(trials - events) exceed zero.  A vector of subscripts is generated
that will do the proper duplications of observations, and a new binary
variable \code{y} is created along with usual cell frequencies (\code{weights})
for each of the \code{y=0}, \code{y=1} cells per observation.
}
\usage{
wtd.mean(x, weights=NULL, normwt="ignored", na.rm=TRUE)
wtd.var(x, weights=NULL, normwt=FALSE, na.rm=TRUE)
wtd.quantile(x, weights=NULL, probs=c(0, .25, .5, .75, 1), 
             type=c('quantile','(i-1)/(n-1)','i/(n+1)','i/n'), 
             normwt=FALSE, na.rm=TRUE)
wtd.ecdf(x, weights=NULL, 
         type=c('i/n','(i-1)/(n-1)','i/(n+1)'), 
         normwt=FALSE, na.rm=TRUE)
wtd.table(x, weights=NULL, type=c('list','table'), 
          normwt=FALSE, na.rm=TRUE)
wtd.rank(x, weights=NULL, normwt=FALSE, na.rm=TRUE)
wtd.loess.noiter(x, y, weights=rep(1,n), robust=rep(1,n), 
                 span=2/3, degree=1, cell=.13333, 
                 type=c('all','ordered all','evaluate'), 
                 evaluation=100, na.rm=TRUE)
num.denom.setup(num, denom)
}
\arguments{
\item{x}{
a numeric vector (may be a character or \code{category} or \code{factor} vector
for \code{wtd.table})
}
\item{num}{
vector of numerator frequencies
}
\item{denom}{
vector of denominators (numbers of trials)
}
\item{weights}{
a numeric vector of weights
}
\item{normwt}{
specify \code{normwt=TRUE} to make \code{weights} sum to \code{length(x)} after deletion
of NAs
}
\item{na.rm}{
set to \code{FALSE} to suppress checking for NAs
}
\item{probs}{
a vector of quantiles to compute.  Default is 0 (min), .25, .5, .75, 1
(max).
}
\item{type}{
For \code{wtd.quantile}, \code{type} defaults to \code{"quantile"} to use the same
interpolated order statistic method as \code{quantile}.  Set \code{type} to
\code{"(i-1)/(n-1)"}, \code{"i/(n+1)"}, or \code{"i/n"} to use the inverse of the
empirical distribution function, using, respectively, (wt - 1)/T,
wt/(T+1), or wt/T, where wt is the cumulative weight and T is the
total weight (usually total sample size).  These three values of
\code{type} are the possibilities for \code{wtd.ecdf}.  For \code{wtd.table} the
default \code{type} is \code{"list"}, meaning that the function is to return a
list containing two vectors: \code{x} is the sorted unique values of \code{x}
and \code{sum.of.weights} is the sum of weights for that \code{x}.  This is the
default so that you don't have to convert the \code{names} attribute of the
result that can be obtained with \code{type="table"} to a numeric variable
when \code{x} was originally numeric.  \code{type="table"} for \code{wtd.table}
results in an object that is the same structure as those returned from
\code{table}.  For \code{wtd.loess.noiter} the default \code{type} is \code{"all"},
indicating that the function is to return a list containing all the
original values of \code{x} (including duplicates and without sorting) and
the smoothed \code{y} values corresponding to them.  Set
\code{type="ordered all"} to sort by \code{x}, and \code{type="evaluate"} to evaluate the smooth
only at \code{evaluation} equally spaced points between the observed limits
of \code{x}.
}
\item{y}{a numeric vector the same length as \code{x}}
\item{robust, span, degree, cell, evaluation}{
see \code{loess.smooth}.  The default is linear (\code{degree=1}) and 100 points
at which to evaluate (if \code{type="evaluate"}).
}}
\value{
\code{wtd.mean} and \code{wtd.var} return scalars.  \code{wtd.quantile} returns a
vector the same length as \code{probs}.  \code{wtd.ecdf} returns a list whose
elements \code{x} and \code{ecdf} correspond to unique sorted values of \code{x}.
If the first CDF estimate is greater than zero, a point (min(x),0) is
placed at the beginning of the estimates.
See above for \code{wtd.table}.  \code{wtd.rank} returns a vector the same
length as \code{x} (after removal of NAs, depending on \code{na.rm}).  See above
for \code{wtd.loess.noiter}.
}
\details{
The functions correctly combine weights of observations having
duplicate values of \code{x} before computing estimates.


\code{wtd.rank} does not handle NAs as elegantly as \code{rank} if \code{weights} is
specified.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University School of Medicine
\cr
f.harrell@vanderbilt.edu
}
\references{
Research Triangle Institute (1995): SUDAAN User's Manual, Release
6.40, pp. 8--16 to 8--17.
}
\seealso{
\code{\link{mean}}, \code{\link{var}}, \code{\link{quantile}}, \code{\link{table}}, \code{\link{rank}}, \code{\link{loess.smooth}}, \code{\link{lowess}},
\code{\link{plsmo}}, \code{\link{ecdf}}, \code{\link{somers2}}, \code{\link{describe}}
}
\examples{
set.seed(1)
x <- runif(500)
wts <- sample(1:6, 500, TRUE)
std.dev <- sqrt(wtd.var(x, wts))
wtd.quantile(x, wts)
death <- sample(0:1, 500, TRUE)
plot(wtd.loess.noiter(x, death, wts, type='evaluate'))
describe(~x, weights=wts)
# describe uses wtd.mean, wtd.quantile, wtd.table
xg <- cut2(x,g=4)
table(xg)
wtd.table(xg, wts, type='table')

# Here is a method for getting stratified weighted means
y <- runif(500)
g <- function(y) wtd.mean(y[,1],y[,2])
summarize(cbind(y, wts), llist(xg), g, stat.name='y')


# Restructure data to generate a dichotomous response variable
# from records containing numbers of events and numbers of trials
num   <- c(10,NA,20,0,15)   # data are 10/12 NA/999 20/20 0/25 15/35
denom <- c(12,999,20,25,35)
w     <- num.denom.setup(num, denom)
w
# attach(my.data.frame[w$subs,])
}
\keyword{nonparametric}
\keyword{category}
\keyword{distribution}
\keyword{robust}
\keyword{loess}
\keyword{smooth}
\keyword{manip}
\concept{weighted sampling}
\concept{grouping}
\concept{weights}

\eof
\name{xYplot}
\alias{xYplot}
\alias{panel.xYplot}
\alias{prepanel.xYplot}
\alias{Dotplot}
\alias{panel.Dotplot}
\alias{prepanel.Dotplot}
\alias{Cbind}
\alias{[.Cbind}
\alias{setTrellis}
\alias{numericScale}
\title{xyplot and dotplot with Matrix Variables to Plot Error Bars and Bands}
\description{
A utility function \code{Cbind} returns the first argument as a vector and
combines all other arguments into a matrix stored as an attribute called
\code{"other"}.  The arguments can be named (e.g.,
\code{Cbind(pressure=y,ylow,yhigh)}) or a \code{label} attribute may be pre-attached
to the first argument. In either case, the name or label of the first
argument is stored as an attribute \code{"label"} of the object returned by
\code{Cbind}.  Storing other vectors as a matrix attribute facilitates plotting
error bars, etc., as \code{trellis} really wants the x- and y-variables to be
vectors, not matrices. If a single argument is given to \code{Cbind} and that
argument is a matrix with column dimnames, the first column is taken as the
main vector and remaining columns are taken as \code{"other"}. A subscript
method for \code{Cbind} objects subscripts the \code{other} matrix along
with the main \code{y} vector.

The \code{xYplot} function is a substitute for \code{xyplot} that allows for
simulated multi-column \code{y}. It uses by default the \code{panel.xYplot} and
\code{prepanel.xYplot} functions to do the actual work. The \code{method} argument
passed to \code{panel.xYplot} from \code{xYplot} allows you to make error bars, the
upper-only or lower-only portions of error bars, alternating lower-only and
upper-only bars, bands, or filled bands.  \code{panel.xYplot} decides how to
alternate upper and lower bars according to whether the median \code{y} value of
the current main data line is above the median \code{y} for all \code{groups} of
lines or not.  If the median is above the overall median, only the upper
bar is drawn. For \code{bands} (but not 'filled bands'), any number of other
columns of \code{y} will be drawn as lines having the same thickness, color, and
type as the main data line.  If plotting bars, bands, or filled bands and
only one additional column is specified for the response variable, that
column is taken as the half width of a precision interval for \code{y}, and the
lower and upper values are computed automatically as \code{y} plus or minus the
value of the additional column variable.


When a \code{groups} variable is present, \code{panel.xYplot} will create a function
in frame 0 called \code{Key} that when invoked will draw a key describing the
\code{groups} labels, point symbols, and colors. By default, the key is outside
the graph.  If \code{Key(locator(1))} is specified, the key will appear so that
its upper left corner is at the coordinates of the mouse click.  For
R/Lattice the first two arguments of \code{Key} (\code{x} and \code{y}) are fractions
of the page, measured from the lower left corner, and the default
placement is at \code{x=0, y=1}.


When \code{method="quantiles"} is specified, \code{xYplot} automatically groups the
\code{x} variable into intervals containing a target of \code{nx} observations each,
and within each \code{x} group computes three quantiles of \code{y} and plots these
as three lines. The mean \code{x} within each \code{x} group is taken as the
\code{x}-coordinate. This will make a useful empirical display for large
datasets in which scatter diagrams are too busy to see patterns of central
tendency and variability.  You can also specify a general function of a
data vector that returns a matrix of statistics for the \code{method} argument.
Arguments can be passed to that function via a list \code{methodArgs}.  The
statistic in the first column should be the measure of central tendency.
Examples of useful \code{method} functions are those listed under the help file
for \code{summary.formula} such as \code{smean.cl.normal}.


\code{Dotplot} is a substitute for \code{dotplot} allowing for a matrix x-variable,
automatic superpositioning when \code{groups} is present, and creation of a
\code{Key} function.  When the x-variable (created by \code{Cbind} to simulate a
matrix) contains a total of 3 columns, the first column specifies where the
dot is positioned, and the last 2 columns specify starting and ending
points for intervals.  The intervals are shown using line type, width, and
color from the trellis \code{plot.line} list. By default, you will usually see a
darker line segment for the low and high values, with the dotted reference
line elsewhere. A good choice of the \code{pch} argument for such plots is \code{3}
(plus sign) if you want to emphasize the interval more than the point
estimate.  When the x-variable contains a total of 5 columns, the 2nd and
5th columns are treated as the 2nd and 3rd are treated above, and the 3rd
and 4th columns define an inner line segment that will have twice the
thickness of the outer segments. In addition, tick marks separate the outer
and inner segments.  This type of display (an example of which appeared in
\emph{The Elements of Graphing Data} by Cleveland) is very suitable for
displaying two confidence levels (e.g., 0.9 and 0.99) or the 0.05, 0.25,
0.75, 0.95 sample quantiles, for example.  For this display, the central
point displays well with a default circle symbol.


\code{setTrellis} sets nice defaults for Trellis graphics, assuming that the
graphics device has already been opened if using postscript, etc. By
default, it sets panel strips to blank and reference dot lines to thickness
1 instead of the Trellis default of 2.


\code{numericScale} is a utility function that facilitates using \code{xYplot} to
plot variables that are not considered to be numeric but which can readily
be converted to numeric using \code{as.numeric()}.  A good example of this is
\code{timeDate} variables in S-Plus 5 and 6. \code{numericScale} converts the
variable into an ordinary numeric variable.  If it is a \code{timeDate}
variable, two attributes are added to the resulting variable:
\code{scales.major} and \code{scales.minor}. These are each lists with elements \code{at}
to specify a vector of numeric values for tick marks, and a corresponding
character vector \code{labels} with formatted values (e.g., using time or date
formats).  When you use such a variable with \code{xYplot} and do not specify a
corresponding \code{scales} element, tick marks and scale labeling are taken
from \code{scales.major}.  The \code{at} element for \code{scales.minor} is used by
\code{panel.xYplot} to add minor tick marks. \code{numericScale} by default will keep
the name of the input variable as a \code{label} attribute for the new numeric
variable.
}
\usage{
Cbind(\dots)

xYplot(formula, data = sys.frame(sys.parent()), groups,
       subset, xlab=NULL, ylab=NULL, ylim=NULL,
       panel=panel.xYplot, prepanel=prepanel.xYplot, scales=NULL,
       minor.ticks=NULL, \dots)

panel.xYplot(x, y, subscripts, groups=NULL, 
             type=if(is.function(method) || method=='quantiles') 
               'b' else 'p',
             method=c("bars", "bands", "upper bars", "lower bars", 
                      "alt bars", "quantiles", "filled bands"), 
             methodArgs=NULL, label.curves=TRUE, abline,
             probs=c(.5,.25,.75), nx,
             cap=0.015, lty.bar=1, 
             lwd=plot.line$lwd, lty=plot.line$lty, pch=plot.symbol$pch, 
             cex=plot.symbol$cex, font=plot.symbol$font, col=NULL, 
             lwd.bands=NULL, lty.bands=NULL, col.bands=NULL, 
             minor.ticks=NULL, col.fill=NULL, \dots)

prepanel.xYplot(x, y, \dots)

Dotplot(formula, data = sys.frame(sys.parent()), groups, subset, 
        xlab = NULL, ylab = NULL, ylim = NULL,
        panel=panel.Dotplot, prepanel=prepanel.Dotplot,
        scales=NULL, \dots)

prepanel.Dotplot(x, y, \dots)

panel.Dotplot(x, y, groups = NULL,
              pch  = dot.symbol$pch, 
              col  = dot.symbol$col, cex = dot.symbol$cex, 
              font = dot.symbol$font, abline, \dots)

setTrellis(strip.blank=TRUE, lty.dot.line=2, lwd.dot.line=1)

numericScale(x, label=NULL, skip.weekends=FALSE, \dots)
}
\arguments{
\item{\dots}{
for \code{Cbind} \code{\dots} is any number of additional numeric
vectors. Unless you are using \code{Dotplot} (which allows for either 2
or 4 "other" variables) or \code{xYplot} with \code{method="bands"},
vectors after the first two are ignored.  If drawing bars and only one
extra variable is given in \code{\dots}, upper and lower values are
computed as described above. If the second argument to \code{Cbind} is a
matrix, that matrix is stored in the \code{"other"} attribute and
arguments after the second are ignored.

Also can be other arguments to pass to \code{labcurve} or \code{Key},
or extra arguments sent from \code{numericScale} to \code{axis.time}.
}
\item{formula}{
a \code{trellis} formula consistent with \code{xyplot} or \code{dotplot} 
}
\item{x}{
\code{x}-axis variable.  For \code{numericScale} \code{x} is any vector
such that \code{as.numeric(x)} returns a numeric vector suitable for x- or
y-coordinates.
}
\item{y}{
  a vector, or an object created by \code{Cbind} for \code{xYplot}.
\code{y} represents the main variable to plot, i.e., the variable used to
draw the main lines. For \code{Dotplot} the first argument to
\code{Cbind} will be the main \code{x}-axis variable.  
}
\item{data}{}
\item{subset}{}
\item{ylim}{}
\item{subscripts}{}
\item{groups}{}
\item{type}{}
\item{scales}{}
\item{panel}{}
\item{prepanel}{}
\item{xlab}{}
\item{ylab}{
  see \code{trellis.args}.  \code{xlab} and \code{ylab} get default values from
  \code{"label"} attributes.
}
\item{method}{
defaults to \code{"bars"} to draw error-bar type plots.  See meaning of other
values above.  \code{method} can be a function.  Specifying \code{method=quantile},
\code{methodArgs=list(probs=c(.5,.25,.75))} is the same as specifying
\code{method="quantiles"} without specifying \code{probs}.
}
\item{methodArgs}{
a list containing optional arguments to be passed to the function specified
in \code{method}
}
\item{label.curves}{
set to \code{FALSE} to suppress invocation of \code{labcurve} to label primary curves
where they are most separated or to draw a legend in an empty spot on the
panel.  You can also set \code{label.curves} to a list of options to pass to
\code{labcurve}.  These options can also be passed as \code{\dots} to \code{xYplot}. See the
examples below.
}
\item{abline}{
a list of arguments to pass to \code{panel.abline} for each panel, e.g.
\code{list(a=0, b=1, col=3)} to draw the line of identity using color 3.
}
\item{probs}{
a vector of three quantiles with the quantile corresponding to the central
line listed first. By default \code{probs=c(.5, .25, .75)}. You can also specify
\code{probs} through \code{methodArgs=list(probs=\dots)}.
}
\item{nx}{
number of target observations for each \code{x} group (see \code{cut2} \code{m} argument).
\code{nx} defaults to the minimum of 40 and the number of points in the current
stratum divided by 4. Set \code{nx=FALSE} or \code{nx=0} if \code{x} is already discrete and
requires no grouping.
}
\item{cap}{
the half-width of horizontal end pieces for error bars, as a fraction of
the length of the \code{x}-axis
}
\item{lty.bar}{
line type for bars
}
\item{lwd, lty, pch, cex, font, col}{
see \code{trellis.args}.  These are vectors when \code{groups} is present, and the
order of their elements corresponds to the different \code{groups}, regardless
of how many bands or bars are drawn. If you don't specify \code{lty.bands}, for
example, all band lines within each group will have the same \code{lty}.
}
\item{lty.bands, lwd.bands, col.bands}{
used to allow \code{lty}, \code{lwd}, \code{col} to vary across the different band lines
for different \code{groups}. These parameters are vectors or lists whose
elements correspond to the added band lines (i.e., they ignore the central
line, whose line characteristics are defined by \code{lty}, \code{lwd}, \code{col}). For
example, suppose that 4 lines are drawn in addition to the central line.
Specifying \code{lwd.bands=1:4} will cause line widths of 1:4 to be used for
every group, regardless of the value of \code{lwd}.  To vary characteristics
over the \code{groups} use e.g. \code{lwd.bands=list(rep(1,4), rep(2,4))} or
\code{list(c(1,2,1,2), c(3,4,3,4))}.
}
\item{minor.ticks}{
a list with elements \code{at} and \code{labels} specifying positions and labels for
minor tick marks to be used on the x-axis of each panel, if any. This is
intended for \code{timeDate} variables.
}
\item{col.fill}{
used to override default colors used for the bands in
\code{method='filled bands'}. This is a vector when \code{groups} is present, and the order of the
elements corresponds to the different \code{groups}, regardless of how many
bands are drawn.  The default colors for \code{'filled bands'} are pastel colors
matching the default colors \code{superpose.line$col} (\code{plot.line$col}).
}
\item{strip.blank}{
set to \code{FALSE} to not make the panel strip backgrounds blank 
}
\item{lty.dot.line}{
line type for dot plot reference lines (default = 2 for dotted; use 1 for
solid)
}
\item{lwd.dot.line}{
line thickness for reference lines for dot plots (default = 1) 
}
\item{label}{
a scalar character string to be used as a variable label after
\code{numericScale} converts the 
variable to numeric form 
}
\item{skip.weekends}{
see \code{axis.time}
}
}
\value{
\code{Cbind} returns a matrix with attributes.  Other functions return standard
\code{trellis} results.
}
\section{Side Effects}{
plots, and \code{panel.xYplot} creates the \code{Key} function in the session frame.
}
\details{
Unlike \code{xyplot}, \code{xYplot} senses the presence of a \code{groups} variable and
automatically invokes \code{panel.superpose} instead of \code{panel.xyplot}. The same
is true for \code{Dotplot} vs. \code{dotplot}.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
f.harrell@vanderbilt.edu
\cr
Madeline Bauer
\cr
Department of Infectious Diseases
\cr
University of Southern California School of Medicine
\cr
mbauer@usc.edu
}
\seealso{
\code{\link[lattice]{xyplot}}, \code{\link[lattice]{panel.xyplot}}, \code{\link{summarize}}, \code{\link{label}}, \code{\link{labcurve}},
\code{\link{errbar}}, \code{\link[lattice]{dotplot}}, 
\code{\link{reShape}}, \code{\link{setps}}, \code{\link{cut2}}, \code{\link[lattice]{panel.abline}}
}
\examples{
# Plot 6 smooth functions.  Superpose 3, panel 2.
# Label curves with p=1,2,3 where most separated 
d <- expand.grid(x=seq(0,2*pi,length=150), p=1:3, shift=c(0,pi)) 
xYplot(sin(x+shift)^p ~ x | shift, groups=p, data=d, type='l') 
# Use a key instead, use 3 line widths instead of 3 colors 
# Put key in most empty portion of each panel
xYplot(sin(x+shift)^p ~ x | shift, groups=p, data=d, 
       type='l', keys='lines', lwd=1:3, col=1) 
# Instead of implicitly using labcurve(), put a 
# single key outside of panels (for S-Plus) or at
# lower left corner (for R)
xYplot(sin(x+shift)^p ~ x | shift, groups=p, data=d, 
       type='l', label.curves=FALSE, lwd=1:3, col=1, lty=1:3) 
Key()       # S-Plus
Key(0,.1)   # R


# Show the median and quartiles of height given age, stratified 
# by sex and race.  Draws 2 sets (male, female) of 3 lines per panel.
# xYplot(height ~ age | race, groups=sex, method='quantiles')


# Examples of plotting raw data
dfr <- expand.grid(month=1:12, continent=c('Europe','USA'), 
                   sex=c('female','male'))
set.seed(1)
dfr <- upData(dfr,
              y=month/10 + 1*(sex=='female') + 2*(continent=='Europe') + 
                runif(48,-.15,.15),
              lower=y - runif(48,.05,.15),
              upper=y + runif(48,.05,.15))


xYplot(Cbind(y,lower,upper) ~ month,subset=sex=='male' & continent=='USA',
       data=dfr)
xYplot(Cbind(y,lower,upper) ~ month|continent, subset=sex=='male',data=dfr)
xYplot(Cbind(y,lower,upper) ~ month|continent, groups=sex, data=dfr); Key() 
# add ,label.curves=FALSE to suppress use of labcurve to label curves where
# farthest apart


xYplot(Cbind(y,lower,upper) ~ month,groups=sex,
                              subset=continent=='Europe', data=dfr) 
xYplot(Cbind(y,lower,upper) ~ month,groups=sex, type='b',
                              subset=continent=='Europe', keys='lines',
                              data=dfr)
# keys='lines' causes labcurve to draw a legend where the panel is most empty


xYplot(Cbind(y,lower,upper) ~ month,groups=sex, type='b', data=dfr,
                              subset=continent=='Europe',method='bands') 
xYplot(Cbind(y,lower,upper) ~ month,groups=sex, type='b', data=dfr,
                              subset=continent=='Europe',method='upper')


label(dfr$y) <- 'Quality of Life Score'   
# label is in Hmisc library = attr(y,'label') <- 'Quality\dots'; will be
# y-axis label 
# can also specify Cbind('Quality of Life Score'=y,lower,upper) 
xYplot(Cbind(y,lower,upper) ~ month, groups=sex,
       subset=continent=='Europe', method='alt bars',
        offset=if(.R.)unit(.1,'inches') else .4, type='b', data=dfr)   
# offset passed to labcurve to label .4 y units away from curve
# for R (using grid/lattice), offset is specified using the grid
# unit function, e.g., offset=unit(.4,'native') or
# offset=unit(.1,'inches') or unit(.05,'npc')


# The following example uses the summarize function in Hmisc to 
# compute the median and outer quartiles.  The outer quartiles are 
# displayed using "error bars"
set.seed(111)
dfr <- expand.grid(month=1:12, year=c(1997,1998), reps=1:100)
month <- dfr$month; year <- dfr$year
y <- abs(month-6.5) + 2*runif(length(month)) + year-1997
s <- summarize(y, llist(month,year), smedian.hilow, conf.int=.5) 
xYplot(Cbind(y,Lower,Upper) ~ month, groups=year, data=s, 
       keys='lines', method='alt', type='b')
# Can also do:
s <- summarize(y, llist(month,year), quantile, probs=c(.5,.25,.75),
               stat.name=c('y','Q1','Q3')) 
xYplot(Cbind(y, Q1, Q3) ~ month, groups=year, data=s, 
       type='b', keys='lines') 
# Or:
xYplot(y ~ month, groups=year, keys='lines', nx=FALSE, method='quantile',
       type='b') 
# nx=FALSE means to treat month as a discrete variable


# To display means and bootstrapped nonparametric confidence intervals 
# use:
s <- summarize(y, llist(month,year), smean.cl.boot) 
s
xYplot(Cbind(y, Lower, Upper) ~ month | year, data=s, type='b')
# Can also use Y <- cbind(y, Lower, Upper); xYplot(Cbind(Y) ~ ...) 
# Or:
xYplot(y ~ month | year, nx=FALSE, method=smean.cl.boot, type='b')


# This example uses the summarize function in Hmisc to 
# compute the median and outer quartiles.  The outer quartiles are 
# displayed using "filled bands"


s <- summarize(y, llist(month,year), smedian.hilow, conf.int=.5) 


# filled bands: default fill = pastel colors matching solid colors
# in superpose.line (this works differently in R)
xYplot ( Cbind ( y, Lower, Upper ) ~ month, groups=year, 
     method="filled bands" , data=s, type="l")


# note colors based on levels of selected subgroups, not first two colors
xYplot ( Cbind ( y, Lower, Upper ) ~ month, groups=year, 
     method="filled bands" , data=s, type="l",
     subset=(year == 1998 | year == 2000), label.curves=FALSE )


# filled bands using black lines with selected solid colors for fill
xYplot ( Cbind ( y, Lower, Upper ) ~ month, groups=year, 
     method="filled bands" , data=s, label.curves=FALSE,
     type="l", col=1, col.fill = 2:3)
Key(.35,1,col = 2:3) #use fill colors in key


# A good way to check for stable variance of residuals from ols 
# xYplot(resid(fit) ~ fitted(fit), method=smean.sdl) 
# smean.sdl is defined with summary.formula in Hmisc


# Plot y vs. a timeDate variable x
# xYplot(y ~ numericScale(x, label='Label for X') | country) 
# For this example could omit label= and specify 
#    y ~ numericScale(x) | country, xlab='Label for X'


# Here is an example of using xYplot with several options
# to change various Trellis parameters,
# xYplot(y ~ x | z, groups=v, pch=c('1','2','3'),
#        layout=c(3,1),     # 3 panels side by side
#        ylab='Y Label', xlab='X Label',
#        main=list('Main Title', cex=1.5),
#        par.strip.text=list(cex=1.2),
#        strip=function(\dots) strip.default(\dots, style=1),
#        scales=list(alternating=FALSE))


#
# Dotplot examples
#


s <- summarize(y, llist(month,year), smedian.hilow, conf.int=.5) 


setTrellis()            # blank conditioning panel backgrounds 
Dotplot(month ~ Cbind(y, Lower, Upper) | year, data=s) 
# or Cbind(\dots), groups=year, data=s


# Display a 5-number (5-quantile) summary (2 intervals, dot=median) 
# Note that summarize produces a matrix for y, and Cbind(y) trusts the 
# first column to be the point estimate (here the median) 
s <- summarize(y, llist(month,year), quantile,
               probs=c(.5,.05,.25,.75,.95), type='matrix') 
Dotplot(month ~ Cbind(y) | year, data=s) 
# Use factor(year) to make actual years appear in conditioning title strips


# Dotplot(z ~ x | g1*g2)                 
# 2-way conditioning 
# Dotplot(z ~ x | g1, groups=g2); Key()  
# Key defines symbols for g2


# If the data are organized so that the mean, lower, and upper 
# confidence limits are in separate records, the Hmisc reShape 
# function is useful for assembling these 3 values as 3 variables 
# in a single observation, e.g., assuming type has values such as 
# c('Mean','Lower','Upper'):
# a <- reShape(y, id=month, colvar=type) 
# This will make a matrix with 3 columns named Mean Lower Upper 
# and with 1/3 as many rows as the original data 
}
\keyword{hplot}
\concept{trellis}
\concept{lattice}

\eof
\name{xy.group}
\alias{xy.group}
\title{
Mean x vs. function of y in groups of x
}
\description{
Compute mean x vs. a function of y (e.g. median) by quantile
groups of x or by x grouped to have a given average number of
observations.  Deletes NAs in x and y before doing computations.
}
\usage{
xy.group(x, y, m=150, g, fun=mean, result="list")
}
\arguments{
\item{x}{
a vector, may contain NAs
}
\item{y}{
a vector of same length as x, may contain NAs
}
\item{m}{
number of observations per group
}
\item{g}{
number of quantile groups
}
\item{fun}{
function of y such as median or mean (the default)
}
\item{result}{
"list" (the default), or "matrix"
}}
\value{
if result="list", a list with components x and y suitable for plotting.
if result="matrix", matrix with rows corresponding to x-groups and columns named
n, x, and y.
}
\seealso{
\code{\link{cut2}}, \code{\link{tapply}}
}
\examples{
# plot(xy.group(x, y, g=10))	#Plot mean y by deciles of x
# xy.group(x, y, m=100, result="matrix")	#Print table, 100 obs/group
}
\keyword{category}
\keyword{nonparametric}
\concept{grouping}
\concept{stratification}
\concept{aggregation}
\concept{discretization}


\eof
\name{sas.get}
\alias{sas.get}
\alias{is.special.miss}
\alias{[.special.miss}
\alias{print.special.miss}
\alias{format.special.miss}
\alias{sas.codes}
\alias{code.levels}
\title{Convert a SAS Dataset to an S Data Frame}
\description{
Converts a SAS dataset into an S data frame.  
You may choose to extract only a subset of variables 
or a subset of observations in the SAS dataset.
The function will automatically convert PROC FORMAT-coded
variables to factor objects.  The original SAS codes are stored in an
attribute called \code{sas.codes} and these may be added back to the
\code{levels} of a \code{factor} variable using the \code{code.levels}
function. 
Information about special missing values may be captured in an attribute
of each variable having special missing values.  This attribute is
called \code{special.miss}, and such variables are given class \code{special.miss}.
There are \code{print}, \code{[]}, \code{format}, and \code{is.special.miss}
methods for such variables.
The \code{chron} function is used to set up date, time, and date-time
variables.
If using S-Plus 5 or 6 or later, the \code{timeDate} function is used instead.
If a date variable represents a partial date (.5 added if
month missing, .25 added if day missing, .75 if both), an attribute
\code{partial.date} is added to the variable, and the variable also becomes
a class \code{imputed} variable.
The \code{describe} function uses information about partial dates and
special missing values.
There is an option to automatically \code{PKUNZIP} compressed
SAS datasets.

\code{sas.get} works by composing and running a SAS job that
creates various ASCII files that are read and analyzed
by \code{sas.get}.  You can also run the SAS \code{sas_get} macro,
which writes the ASCII files for downloading, in a separate
step or on another computer, and then tell \code{sas.get} (through the
\code{sasout} argument) to access these files instead of running SAS.
}
\usage{
sas.get(library, member, variables=character(0), ifs=character(0),
     format.library=library, id,
     dates.=c("sas","yymmdd","yearfrac","yearfrac2"),
     keep.log=TRUE, log.file="_temp_.log", macro=sas.get.macro,
     data.frame.out=existsFunction("data.frame"), clean.up=!.R., quiet=FALSE,
     temp=tempfile("SaS"), formats=TRUE, 
     recode=formats, special.miss=FALSE, sasprog="sas",
     as.is=.5, check.unique.id=TRUE, force.single=FALSE, where,
     uncompress=FALSE)

is.special.miss(x, code)

x[\dots]

\method{print}{special.miss}(x, ...)

\method{format}{special.miss}(x, ...)

sas.codes(object)

code.levels(object)
}
\arguments{
\item{library}{
character string naming the directory in which the dataset is kept.
The default is \code{library="."}, indicating that the current
directory is to be used.
}
\item{member}{
character string giving the second part of the two part SAS dataset name.  
(The first part is irrelevant here - it is mapped to the directory name.)
}
\item{x}{
a variable that may have been created by \code{sas.get} with \code{special.miss=T}
or with \code{recode} in effect.
}
\item{variables}{
vector of character strings naming the variables in the SAS dataset.  
The resulting data frame will contain only those variables from the
SAS dataset.  
To get all of the variables (the default), an empty string may be given.
It is a fatal error if any one of the variables is not
in the SAS dataset.  If you have retrieved a subset of the variables
in the SAS dataset and wish to retrieve the same list of variables
from another dataset, you can program the value of \code{variables} - see
one of the last examples.
}
\item{ifs}{
a vector of character strings, each containing one SAS "subsetting if"
statement.  
These will be used to extract a subset of the observations in the SAS dataset.
}
\item{format.library}{
The directory containing the file \bold{formats.sc2}, which contains
the definitions of the user defined formats used in this dataset.
By default, we look for the formats in the same directory as the data.
The user defined formats must be available (so SAS can read the data).
}
\item{formats}{
Set \code{formats} to \code{FALSE} to keep \code{sas.get} from telling the SAS macro to 
retrieve value label formats from \code{format.library}.  When you do not
specify \code{formats} or \code{recode}, \code{sas.get} will set \code{formats} to \code{T} if a
SAS format catalog (\code{.sct} or \code{.sc2}) file exists in \code{format.library}.
\code{sas.get} stores SAS PROC FORMAT VALUE definitions
as the \code{formats} attribute of the returned
object (see below). A format is used if it is referred to by one or more 
variables
in the dataset, if it contains no ranges of values (i.e., it identifies
value labels for single values), and if it is a character format
or a numeric format that is not used just to label missing values.
To fetch the values and labels for variable \code{x} in the dataset \code{d} you
could type:
\cr
    f <- attr(d\$x, "format")
\cr
    formats <- attr(d, "formats")
\cr
    formats\$f\$values; formats\$f\$labels
}
\item{recode}{
This parameter defaults to \code{T} if \code{formats} is \code{T}.  If it is
\code{T}, variables that have an appropriate format (see above) are
recoded as \code{factor} objects, which map the values
to the value labels for the format.  Alternatively, set \code{recode} to
1 to use labels of the form value:label, e.g. 1:good 2:better 3:best.
Set \code{recode} to 2 to use labels such as good(1) better(2) best(3).
Since \code{sas.codes} and \code{code.levels} add flexibility, the usual choice
for \code{recode} is \code{T}.
}
\item{special.miss}{
For numeric variables, any missing values are stored as NA in S.
You can recover special missing values by setting \code{special.miss} to
\code{T}.  This will cause the \code{special.miss} attribute and the
\code{special.miss} class to be added
to each variable that has at least one special missing value.  
Suppose that variable  \code{y} was .E in observation 3 and .G
in observation 544.  The \code{special.miss} attribute for \code{y} then has the
value
\cr
    list(codes=c("E","G"),obs=c(3,544))
\cr
To fetch this information for variable \code{y} you would say for example
\cr
    s <- attr(y, "special.miss")
\cr
    s\$codes; s\$obs
\cr
or use \code{is.special.miss(x)} or the \code{print.special.miss} method, which
will replace \code{NA} values for the variable with \code{E} or \code{G} if they
correspond to special missing values.
The describe
function uses this information in printing a data summary.  
}
\item{id}{
The name of the variable to be used as the row names of the S dataset.
The id variable becomes the \code{row.names} attribute of a data frame, but
the id variable is still retained as a variable in the data frame.
You can also specify a vector of variable names as the \code{id}
parameter.  After fetching the data from SAS, all these variables will be
converted to character format and concatenated (with a space as a separator)
to form a (hopefully) unique ID variable.
}
\item{dates.}{specifies the format for storing SAS dates in the
  resulting data frame}
\item{as.is}{
SAS character variables are converted to S factor
objects if \code{as.is=FALSE} or if \code{as.is} is a number between 0 and 1 inclusive and
the number of unique values of the variable is less than
the number of observations (\code{n}) times \code{as.is}.  The default for \code{as.is} is .5,
so character variables are converted to factors only if they have fewer
than \code{n/2} unique values.  The primary purpose of this is to keep unique
identification variables as character values in the data frame instead
of using more space to store both the integer factor codes and the
factor labels.
}
\item{check.unique.id}{
If \code{id} is specified, the row names are checked for
uniqueness if \code{check.unique.id=T}.  If any are duplicated, a warning
is printed.  Note that if a data frame is being created with duplicate
row names, statements such as \code{my.data.frame["B23",]} will retrieve
only the first row with a row name of \code{"B23"}.
}
\item{force.single}{
By default, SAS numeric variables having \code{LENGTH}s > 4 are stored as
S double precision numerics, which allow for the same precision as
a SAS \code{LENGTH} 8 variable.  Set \code{force.single=T} to store every
numeric variable in single precision (7 digits of precision).
This option is useful when the creator of the SAS dataset has
failed to use a \code{LENGTH} statement.
R does not have single precision,
so no attempt is made to convert to single if running R.
}
\item{keep.log}{
logical flag: if \code{FALSE}, delete the SAS log file upon completion.
}
\item{log.file}{
the name of the SAS log file.
}
\item{macro}{
the name of an S object in the current search path that contains the text of
the SAS macro called by S. The S object is a character vector that
can be edited using, for example, \code{sas.get.macro <- editor(sas.get.macro)}.
}
\item{data.frame.out}{
  set to \code{FALSE} to make the result a list instead of a data frame}
\item{clean.up}{
logical flag: if \code{TRUE}, remove all temporary files when finished.  You
may want to keep these while debugging the SAS macro.  Not needed for \R.
}
\item{quiet}{logical flag: if \code{FALSE}, print the contents of the
  SAS log file if there has been an error.
}
\item{temp}{
  the prefix to use for the temporary files.  Two characters
  will be added to this; the resulting name
  must fit on your file system.
}
\item{sasprog}{
the name of the system command to invoke SAS
}
\item{uncompress}{
set to \code{FALSE} by default.  Set it
to \code{T} to automatically invoke the DOS \code{PKUNZIP} command
if \code{member.zip} exists,
to uncompress the SAS dataset before
proceeding.  This assumes you have the file permissions to allow
uncompressing in place.  If the file is already uncompressed, this
option is ignored.
}
\item{where}{
by default, a list or data frame which contains all the variables
is returned.  If you specify \code{where}, each individual variable
is placed into a separate object (whose name is the name
of the variable) using the \code{assign} function with the
\code{where} argument.  For example, you can put each variable
in its own file in a directory, which in some cases may
save memory over attaching a data frame.
}
\item{code}{
a special missing value code (A through Z or underscore) to check against.
If \code{code} is omitted, \code{is.special.miss} will return a \code{T} for each
observation that has any special missing value.
}
\item{object}{a variable in a data frame created by \code{sas.get}}
\item{\dots}{ignored}
}
\value{
A data frame resembling the SAS dataset.  If \code{id}
was specified, that column of the data frame will be used
as the row names of the data frame.  Each variable in the data frame
or vector in the list will have the attributes \code{label} and \code{format}
containing SAS labels and formats.  Underscores in formats are
converted to periods.  Formats for character variables have \code{\$} placed
in front of their names.
If \code{formats} is \code{T} and there are any 
appropriate format definitions in \code{format.library}, the returned
object will have attribute \code{formats} containing lists named the
same as the format names (with periods substituted for underscores and
character formats prefixed by \$).
Each of these lists has a vector called \code{values} and one called
\code{labels} with the PROC FORMAT; VALUE \dots   definitions.
}
\section{Side Effects}{
if a SAS error occurs the SAS log file will be
printed under the control of the \code{pager} function.
}
\details{
If you specify \code{special.miss=T} and there are no special missing
values in the SAS dataset, the SAS step will fail.

For variables having a \code{PROC FORMAT VALUE}
format with some of the levels undefined, \code{sas.get} will interpret those
values as \code{NA} if you are using \code{recode}.


If you leave the \code{sasprog} argument at its default value of
\code{"sas"}, be sure that the SAS executable is in the \code{PATH}
specified in your \code{autoexec.bat} file.  Also make sure that
you invoke S so that your current project directory is known
to be the current working directory.  This is best done by creating
a shortcut in Windows95, for which the command to execute will be
something like \code{drive:\\spluswin\\cmd\\splus.exe HOME=.} and the
program is flagged to start in \code{drive:\\myproject} for example.
In this way, you will be able to examine the SAS log file easily
since it will be placed in \code{drive:\\myproject} by default.

SAS will create \code{SASWORK} and \code{SASUSER} directories in what it thinks
are the current working directories.  To specify where SAS should
put these instead, edit the \code{config.sas} file or specify a
\code{sasprog} argument of the following form:
\code{sasprog="\\sas\\sas.exe -saswork c:\\saswork -sasuser c:\\sasuser"}.

When \code{sas.get} needs to run SAS it is run in iconized form.

The SAS macro \code{sas_get} uses record lengths of up to 4096 in two
places.  If you are exporting records that are very long (because of
a large number of variables and/or long character variables), you
may want to edit these \code{LRECL}s to quadruple them, for example.
}
\note{
If \code{sasout} is not given, you
must be able to run SAS on your system.  


If you are reading time or
date-time variables, you will need to execute the command \code{library(chron)}
to print those variables or the data frame if the \code{timeDate} function
is not available.
}
\section{BACKGROUND}{
The references cited below explain the structure of SAS datasets and how
they are stored.
See \emph{SAS Language} 
for a discussion of the "subsetting if" statement.
}
\author{
Terry Therneau, Mayo Clinic
\cr
Frank Harrell, University of Virginia
\cr
Bill Dunlap, University of Washington and Insightful Corp.
\cr
Michael W. Kattan, Baylor University
}
\references{
SAS Institute Inc. (1990).
\emph{SAS Language: Reference, Version 6.}
First Edition.
SAS Institute Inc., Cary, North Carolina.


SAS Institute Inc. (1988).
SAS Technical Report P-176,
\emph{Using the SAS System, Release 6.03, under UNIX Operating Systems and Derivatives.  }
SAS Institute Inc., Cary, North Carolina.


SAS Institute Inc. (1985).
\emph{SAS Introductory Guide.}
Third Edition.
SAS Institute Inc., Cary, North Carolina.
}
\seealso{
\code{\link{data.frame}}, \code{\link{describe}},
\code{\link{label}}, \code{\link{upData}}
}
\examples{
\dontrun{
mice <- sas.get("saslib", mem="mice", var=c("dose", "strain", "ld50"))
plot(mice$dose, mice$ld50)

nude.mice <- sas.get(lib=unix("echo $HOME/saslib"), mem="mice",
	ifs="if strain='nude'")

nude.mice.dl <- sas.get(lib=unix("echo $HOME/saslib"), mem="mice",
	var=c("dose", "ld50"), ifs="if strain='nude'")

# Get a dataset from current directory, recode PROC FORMAT; VALUE \dots 
# variables into factors with labels of the form "good(1)" "better(2)",
# get special missing values, recode missing codes .D and .R into new
# factor levels "Don't know" and "Refused to answer" for variable q1
d <- sas.get(mem="mydata", recode=2, special.miss=TRUE)
attach(d)
nl <- length(levels(q1))
lev <- c(levels(q1), "Don't know", "Refused")
q1.new <- as.integer(q1)
q1.new[is.special.miss(q1,"D")] <- nl+1
q1.new[is.special.miss(q1,"R")] <- nl+2
q1.new <- factor(q1.new, 1:(nl+2), lev)
# Note: would like to use factor() in place of as.integer \dots but
# factor in this case adds "NA" as a category level

d <- sas.get(mem="mydata")
sas.codes(d$x)    # for PROC FORMATted variables returns original data codes
d$x <- code.levels(d$x)   # or attach(d); x <- code.levels(x)
# This makes levels such as "good" "better" "best" into e.g.
# "1:good" "2:better" "3:best", if the original SAS values were 1,2,3

# For the following example, suppose that SAS is run on a
# different machine from the one on which S is run.
# The sas_get macro is used to create files needed by
# sas.get.  To make a text file containing the sas_get macro
# run the following S command, for example:
#   cat(sas.get.macro, file='/sasmacro/sas_get.sas', sep='\n')

# Here is the SAS job.  This job assumes that you put
# sas_get.sas in an autocall macro library.


#  libname db '/my/sasdata/area';
#  \%sas_get(db.mydata, dict, data, formats, specmiss,
#           formats=1, specmiss=1)


# Substitute whatever file names you may want.
# Next the 4 files are moved to the S machine (using
# ASCII file transfer mode) and the following S
# program is run:


mydata <- sas.get(sasout=c('dict','data','formats','specmiss'),
                  id='idvar')


# If PKZIP is run after \%sas_get, e.g. "PKZIP port dict data formats"
# (assuming that specmiss was not used here), use


mydata <- sas.get(sasout='a:port', id='idvar')


# which will run PKUNZIP port to unzip a:port.zip, creating the
# dict, data, and formats files which are generated (and later
# deleted) by sas.get


# Retrieve the same variables from another dataset (or an update of
# the original dataset)
mydata2 <- sas.get('mydata2', var=names(mydata))
# This only works if none of the original SAS variable names contained _

# Code from Don MacQueen to generate SAS dataset to test import of
# date, time, date-time variables
# data ssd.test;
#     d1='3mar2002'd ;
#     dt1='3mar2002 9:31:02'dt;
#     t1='11:13:45't;
#     output;
#
#     d1='3jun2002'd ;
#     dt1='3jun2002 9:42:07'dt;
#     t1='11:14:13't;
#     output;
#     format d1 mmddyy10. dt1 datetime. t1 time.;
# run;
}
}
\keyword{interface}
\keyword{manip}
% Converted by Sd2Rd version 1.21.







\eof
