\name{Design-internal}
\title{Internal Design functions}
\alias{addOffset4ModelFrame}
\alias{as.character.Surv}
\alias{axisf}
\alias{bj.fit2}
\alias{coxphFit}
\alias{cox.zph}
\alias{des.args}
\alias{[.Design}
\alias{DesignAssign}
\alias{Design.levels}
\alias{formula.Design}
\alias{getOldDesign}
\alias{gparms}
\alias{is.na.Surv}
\alias{is.Surv}
\alias{lm.pfit}
\alias{lrm.fit.strat}
\alias{Math.Surv}
\alias{oldDesignFit2R}
\alias{ols.influence}
\alias{Ops.Surv}
\alias{print.summary.survreg2}
\alias{set.atr}
\alias{summary.glmD}
\alias{Summary.Surv}
\alias{Surv}
\alias{[.Surv}
\alias{survfit.cph.null}
\alias{survreg.auxinfo}
\alias{survreg.fit2}
\alias{val.probg}
\alias{value.chk}
\alias{.R.}
\alias{.SV4.}
\alias{.newSurvival.}
\description{Internal Design functions.}
\details{These are not to be called by the user or are undocumented.}
\keyword{internal}

\eof
\name{Design.Misc}
\alias{Design.Misc}
\alias{Varcov.cph}
\alias{Varcov.glmD}
\alias{Varcov.glsD}
\alias{Varcov.lrm}
\alias{Varcov.ols}
\alias{Varcov.psm}
\alias{oos.loglik}
\alias{oos.loglik.ols}
\alias{oos.loglik.lrm}
\alias{oos.loglik.cph}
\alias{oos.loglik.psm}
\alias{oos.loglik.glmD}
\alias{num.intercepts}
\alias{Getlim}
\alias{Getlimi}
\alias{related.predictors}
\alias{interactions.containing}
\alias{param.order}
\alias{Penalty.matrix}
\alias{Penalty.setup}
\alias{lrtest}
\alias{univarLR}
\alias{Newlabels}
\alias{Newlevels}
\alias{Newlabels.Design}
\alias{Newlevels.Design}
\alias{DesignFit}
\alias{print.Design}
\alias{residuals.Design}
\alias{print.lrtest}
\title{Miscellaneous Design Attributes and Utility Functions}
\description{
These functions are used internally by \code{anova.Design},
\code{fastbw}, etc., to retrieve various attributes of a design.  These
functions allow some fitting functions not in the \code{Design} series
(e.g., \code{lm}, \code{glm}) to be used with \code{anova.Design},
\code{fastbw}, and similar functions. 


For \code{Varcov}, there are these functions: \code{Varcov.default},
\code{Varcov.lm}, \code{Varcov.glm}.  The \code{oos.loglik} function for
each type of model implemented computes the -2 log likelihood for
out-of-sample data (i.e., data not necessarily used to fit the model)
evaluated at the parameter estimates from a model fit.  Vectors for the
model's linear predictors and response variable must be given.
\code{oos.loglik} is used primarily by \code{bootcov}.


The \code{Getlim} function retrieves distribution summaries
from the fit or from a \code{datadist} object.  It handles getting summaries
from both sources to fill in characteristics for variables that were not
defined during the model fit.  \code{Getlimi} returns the summary
for an individual model variable.  


The \code{related.predictors} function
returns a list containing variable numbers that are directly or
indirectly related to each predictor.  The \code{interactions.containing}
function returns indexes of interaction effects containing a given
predictor.  The \code{param.order} function returns a vector of logical
indicators for whether parameters are associated with certain types of
effects (nonlinear, interaction, nonlinear interaction).


The \code{Penalty.matrix} function builds a default penalty matrix for
non-intercept term(s) for use in penalized maximum likelihood
estimation.  The \code{Penalty.setup} function takes a constant or list
describing penalty factors for each type of term in the model and
generates the proper vector of penalty multipliers for the current model.


The \code{lrtest} function does likelihood ratio tests for
two nested models, from fits that have \code{stats} components with
\code{"Model L.R."} values.  For models such as \code{psm, survreg, ols, lm} which have
scale parameters, it is assumed that the scale parameter for the smaller model
is fixed at the estimate from the larger model (see the example).

\code{univarLR} takes a multivariable model fit object from
\code{Design} and re-fits a sequence of models containing one predictor
at a time.  It prints a table of likelihood ratio \eqn{\chi^2}{chi^2} statistics
from these fits.

The \code{Newlabels} function is used to override the variable labels in a
fit object.  Likewise, \code{Newlevels} can be used to create a new fit object
with levels of categorical predictors changed.  These two functions are
 especially useful when constructing nomograms.


\code{DesignFit} is used to convert a fit from non-Design functions (e.g.,
\code{glm}) that were invoked with Design in effect to Design functions so
that \code{anova.Design} will be called by \code{anova()}, etc.  So that the
original fit's \code{residuals} and \code{print} methods, if they exist, will be
called, there are functions \code{print.Design} and \code{residuals.Design} to
dispatch them.  These two functions are not needed in versions of
S-Plus prior to 5.x (i.e., non-SV4).
}
\usage{
\method{Varcov}{cph}(object, regcoef.only=FALSE, \dots)
\method{Varcov}{glmD}(object, regcoef.only=FALSE, \dots)
\method{Varcov}{glsD}(object, regcoef.only=FALSE, \dots)
\method{Varcov}{lrm}(object, regcoef.only=FALSE, \dots)
\method{Varcov}{ols}(object, regcoef.only=FALSE, \dots)
\method{Varcov}{psm}(object, regcoef.only=FALSE, \dots)

oos.loglik(fit, \dots)

\method{oos.loglik}{ols}(fit, lp, y, \dots)
\method{oos.loglik}{lrm}(fit, lp, y, \dots)
\method{oos.loglik}{cph}(fit, lp, y, \dots)
\method{oos.loglik}{psm}(fit, lp, y, \dots)
\method{oos.loglik}{glmD}(fit, lp, y, \dots)

num.intercepts(fit)

Getlim(at, allow.null=FALSE, need.all=TRUE)
Getlimi(name, Limval, need.all=TRUE)

related.predictors(at, type=c("all","direct"))
interactions.containing(at, pred)
param.order(at, term.order)

Penalty.matrix(at, X)
Penalty.setup(at, penalty)

lrtest(fit1, fit2)
\method{print}{lrtest}(x, \dots)

univarLR(fit)

Newlabels(fit, \dots)
Newlevels(fit, \dots)
\method{Newlabels}{Design}(fit, labels, \dots)
\method{Newlevels}{Design}(fit, levels, \dots)

DesignFit(fit)  # fit from glm, lm, etc.,then use anova etc. on result
}
\arguments{
\item{fit}{result of a fitting function}
\item{object}{result of a fitting function}
\item{at}{
\code{Design} element of a fit
}
\item{pred}{
index of a predictor variable (main effect)
}
\item{fit1}{
}
\item{fit2}{
fit objects from \code{lrm,ols,psm,cph} etc.  It doesn't matter which
fit object is the sub-model.
}
\item{regcoef.only}{
for fits such as parametric survival models which have a final row and
column of the covariance matrix for a non-regression parameter such
as a log(scale) parameter, setting \code{regcoef.only=TRUE} causes only the first
\code{p} rows and columns of the covariance matrix to be returned, where
\code{p} is the length of \code{object$coef}.
}
\item{lp}{
linear predictor vector for \code{oos.loglik}.  For proportional odds
ordinal logistic models, this should have used the first intercept
only.  If \code{lp} and \code{y} are omitted, the -2 log likelihood for the
original fit is returned.
}
\item{y}{
values of a new vector of responses passed to \code{oos.loglik}.
}
\item{name}{
the name of a variable in the model
}
\item{Limval}{
an object returned by \code{Getlim}
}
\item{allow.null}{
prevents \code{Getlim} from issuing an error message if no limits are found
in the fit or in the object pointed to by \code{options(datadist=)}
}
\item{need.all}{
set to \code{FALSE} to prevent \code{Getlim} or \code{Getlimi} from issuing an error message
if data for a variable are not found
}
\item{type}{
set to \code{"direct"} to return lists of indexes of directly related
factors only (those in interactions with the predictor)
}
\item{term.order}{
1 for all parameters, 2 for all parameters associated with either nonlinear
or interaction effects, 3 for nonlinear effects (main or interaction),
4 for interaction effects, 5 for nonlinear interaction effects.
}
\item{X}{
a design matrix, not including columns for intercepts
}
\item{penalty}{
a vector or list specifying penalty multipliers for types of model terms
}
\item{x}{a result of \code{lrtest}}
\item{labels}{
a character vector specifying new labels for variables in a fit.
To give new labels for all variables, you can specify \code{labels} of the
form \code{labels=c("Age in Years","Cholesterol")}, where the list of new labels is
assumed to be the length of all main effect-type variables in the fit and
in their original order in the model formula.  You may specify a named
vector to give new labels in random order or for a subset of the 
variables, e.g., \code{labels=c(age="Age in Years",chol="Cholesterol")}.
}
\item{levels}{
a list of named vectors specifying new level labels for categorical
predictors.  This will override \code{parms} as well as \code{datadist} information
(if available) that were stored with the fit.  
}
\item{\dots}{other arguments; for \code{Varcov} the first argument is
  the fit object}
}
\value{
\code{Varcov} returns a variance-covariance matrix, and \code{num.intercepts}
returns an integer with the number of intercepts in the model.
\code{oos.loglik} returns a scalar -2 log likelihood value.
\code{Getlim} returns a list with components \code{limits} and \code{values}, either
stored in \code{fit} or retrieved from the object created by \code{datadist} and
pointed to in \code{options(datadist=)}.
\code{related.predictors} returns a list of vectors, and \code{interactions.containing}
returns a vector.  \code{param.order} returns a logical vector corresponding
to non-strata terms in the model.
\code{Penalty.matrix} returns a symmetric matrix with dimension equal to the
number of slopes in the model.  For all but categorical predictor main
effect elements, the matrix is diagonal with values equal to the variances
of the columns of \code{X}.  For segments corresponding to \code{c-1} dummy variables
for \code{c}-category predictors, the function puts a \code{c-1} x \code{c-1} sub-matrix in
\code{Penalty.matrix} that is constructed so that a quadratic form with 
\code{Penalty.matrix} in the middle computes the sum of squared differences
in parameter values about the mean, including a portion for the reference
cell in which the parameter is by definition zero.
\code{Newlabels} returns a new fit object with the labels adjusted.
\code{DesignFit} returns the original object but with \code{oldClass} of
\code{"Design"} and with a new attribute \code{"fitFunction"} containing the
original vector of classes.
}
\seealso{
\code{\link{Design}}, \code{\link{fastbw}}, \code{\link{anova.Design}}, \code{\link{summary.lm}}, \code{\link{summary.glm}}, \code{\link{datadist}}, \code{\link{vif}}, \code{\link{bootcov}}
}
\examples{
\dontrun{
f <- psm(S ~ x1 + x2 + sex + race, dist='gau')
g <- psm(S ~ x1 + sex + race, dist='gau', 
         fixed=list(scale=exp(f$parms)))
lrtest(f, g)


g <- Newlabels(f, c(x2='Label for x2'))
g <- Newlevels(g, list(sex=c('Male','Female'),race=c('B','W')))
nomogram(g)
}
}
\keyword{models}
\keyword{methods}
% Converted by Sd2Rd version 1.21.







\eof
\name{Design}
\alias{Design}
\title{
Design Methods and Generic Functions
}
\description{
This is a series of special transformation functions (\code{asis}, \code{pol},
\code{lsp}, \code{rcs}, \code{catg}, \code{scored}, \code{strat}, \code{matrx}), fitting functions
(e.g.,  \code{lrm},\code{cph}, \code{psm}, or \code{ols}), and generic analysis functions
(\code{anova.Design}, \code{summary.Design}, \code{predict.Design}, 
\code{plot.Design}, \code{survplot},
\code{fastbw}, \code{validate}, \code{calibrate}, \code{specs.Design},
\code{which.influence}, \code{latex.Design}, \code{nomogram.Design}, \code{datadist}, \code{gendata}) 
that help automate many
analysis steps, e.g. fitting restricted interactions and multiple
stratification variables, analysis of variance (with tests of linearity
of each factor and pooled tests), plotting effects of variables in the
model, estimating and graphing effects of variables that appear non-linearly in the
model using e.g. inter-quartile-range hazard ratios, bootstrapping
model fits, and constructing nomograms for obtaining predictions manually. 
Behind the scenes is the \code{Design} function,
called by a modified version of \code{model.frame.default} to
store extra attributes. \code{Design()} is not intended to be
called by users.  
\code{Design} causes detailed design attributes
and descriptions of the distribution of predictors to be stored 
in an attribute of the \code{terms} component called \code{Design}.
In addition to \code{model.frame.default} being replaced by a modified
version, \code{[.} and \code{[.factor} are replaced by versions which carry
along the \code{label} attribute of a variable.  In this way, when an
\code{na.action} function is called to subset out NAs, labels are still
defined for variables in the model.
}
\synopsis{
Design(mf, allow.offset=TRUE, intercept=1)
}
\usage{
Design(mf, allow.offset=TRUE, intercept=1)
# not to be called by the user; called by fitting routines
# dist <- datadist(x1,x2,sex,age,race,bp)   
# or dist <- datadist(my.data.frame)
# Can omit call to datadist if not using summary.Design, plot.Design, 
# survplot.Design, or if all variable settings are given to them
# options(datadist="dist")
# f <- fitting.function(formula = y ~ rcs(x1,4) + rcs(x2,5) + x1\%ia\%x2 +
#                       rcs(x1,4)\%ia\%rcs(x2,5) +
#                       strat(sex)*age + strat(race)*bp)
# See Design.trans for rcs, strat, etc.
# \%ia\% is restricted interaction - not doubly nonlinear
# for x1 by x2 this uses the simple product only, but pools x1*x2
# effect with nonlinear function for overall tests
# specs(f)
# anova(f)
# summary(f)
# fastbw(f)
# pred <- predict(f, newdata=expand.grid(x1=1:10,x2=3,sex="male",
#                 age=50,race="black"))
# pred <- predict(f, newdata=gendata(f, x1=1:10, x2=3, sex="male"))
# This leaves unspecified variables set to reference values from datadist
# pred.combos <- gendata(f, nobs=10)   # Use X-windows to edit predictor settings
# predict(f, newdata=pred.combos)
# plot(f, x1=NA)
# latex(f)
# nomogram(f)
}
\arguments{
  \item{mf}{a model frame}
  \item{allow.offset}{set to \code{TRUE} if model fitter allows an
	offset term}
  \item{intercept}{1 if an ordinary intercept is present, 0 otherwise}
}
\value{
  a data frame augmented with additional information about the
  predictors and model formulation
  }
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{Design.trans}}, \code{\link{Design.Misc}}, \code{\link{cph}}, \code{\link{lrm}}, \code{\link{ols}}, \code{\link{specs.Design}}, \code{\link{anova.Design}},
\code{\link{summary.Design}}, \code{\link{predict.Design}}, \code{\link{gendata}}, \code{\link{plot.Design}}, \code{\link{fastbw}}, 
\code{\link{validate}}, \code{\link{calibrate}}, \code{\link{which.influence}},
\code{\link[Hmisc]{latex}}, \code{\link{latex.Design}}, \code{\link{model.frame.default}}, \code{\link{datadist}}, \code{\link[Hmisc]{describe}},
\code{\link{nomogram}}, \code{\link{vif}}, \code{\link[Hmisc]{dataRep}}
}
\examples{
\dontrun{
library(Design, first=TRUE)  # omit first for R
dist <- datadist(data=2)     # can omit if not using summary, plot, survplot,
                             # or if specify all variable values to them. Can
                             # also  defer.  data=2: get distribution summaries
                             # for all variables in search position 2
                             # run datadist once, for all candidate variables
dist <- datadist(age,race,bp,sex,height)   # alternative
options(datadist="dist")
f <- cph(Surv(d.time, death) ~ rcs(age,4)*strat(race) +
         bp*strat(sex)+lsp(height,60),x=TRUE,y=TRUE)
anova(f)
anova(f,age,height)          # Joint test of 2 vars
fastbw(f)
summary(f, sex="female")     # Adjust sex to "female" when testing
                             # interacting factor bp
plot(f, age=NA, height=NA)   # 3-D plot
plot(f, age=10:70, height=60)
latex(f)                     # LaTeX representation of fit


f <- lm(y ~ x)               # Can use with any fitting function that
                             # calls model.frame.default, e.g. lm, glm
specs.Design(f)              # Use .Design since class(f)="lm"
anova(f)                     # Works since Varcov(f) (=Varcov.lm(f)) works
fastbw(f)
options(datadist=NULL)
f <- ols(y ~ x1*x2)          # Saves enough information to do fastbw, anova
anova(f)                     # Will not do plot.Design since distributions
fastbw(f)                    # of predictors not saved
plot(f, x1=seq(100,300,by=.5), x2=.5) 
                             # all values defined - don't need datadist
dist <- datadist(x1,x2)      # Equivalent to datadist(f)
options(datadist="dist")
plot(f, x1=NA, x2=.5)        # Now you can do plot, summary
nomogram(f, interact=list(x2=c(.2,.7)))
}
}
\keyword{models}
\keyword{regression}
\keyword{survival}
\keyword{math}
\keyword{manip}
\keyword{methods}
\concept{logistic regression model}

\eof
\name{Design.trans}
\alias{Design.trans}
\alias{asis}
\alias{pol}
\alias{lsp}
\alias{rcs}
\alias{catg}
\alias{scored}
\alias{strat}
\alias{matrx}
\alias{\%ia\%}
\title{
Design Special Transformation Functions
}
\description{
This is a series of functions (\code{asis}, \code{pol}, \code{lsp}, \code{rcs}, \code{catg},
\code{scored}, \code{strat}, \code{matrx}, and \code{\%ia\%}) that set up special attributes 
(such as
knots and nonlinear term indicators) that are carried through to fits
(using for example \code{lrm},\code{cph}, \code{ols}, \code{psm}). \code{anova.Design}, \code{summary.Design},
\code{plot.Design}, \code{survplot}, \code{fastbw}, \code{validate}, \code{specs}, 
\code{which.influence}, \code{nomogram.Design} and \code{latex.Design} use these
attributes to automate certain analyses (e.g., automatic tests of linearity
for each predictor are done by \code{anova.Design}). Many of the functions
are called implicitly.  Some S functions such as \code{ns} derive data-dependent
transformations that are not "remembered" when predicted values are
later computed, so the predictions will be incorrect. The functions listed
here solve that problem. 


\code{asis} is the identity transformation, \code{pol} is an ordinary (non-orthogonal) polynomial, \code{rcs} is
a linear tail-restricted cubic spline function (natural spline, for which the
\code{rcspline.eval} function generates the design matrix),
\code{catg} is for a categorical
variable, \code{scored} is for an ordered categorical
variable, \code{strat} is for a stratification factor
in a Cox model, \code{matrx} is for a matrix predictor, and \code{\%ia\%} represents
restricted interactions in which products involving nonlinear effects on both
variables are not included in the model.  \code{asis, catg, scored, matrx} are seldom invoked
explicitly by the user (only to specify \code{label} or \code{name}, usually).

In the list below, functions \code{asis} through \code{strat} can have
arguments \code{x, parms, label, name} except that \code{parms} does not
apply to \code{asis, matrx, strat}.
}
\synopsis{
asis(\dots)
matrx(\dots)
pol(\dots)
lsp(\dots)
rcs(\dots)
catg(\dots)
scored(\dots)
strat(\dots)
\%ia\%(x1, x2)
}
\usage{
asis(x, parms, label, name)
matrx(x, label, name)
pol(x, parms, label, name)
lsp(x, parms, label, name)
rcs(x, parms, label, name)
catg(x, parms, label, name)
scored(x, parms, label, name)
strat(x, label, name)
\%ia\%(x1, x2)
}
\arguments{
\item{x}{
a predictor variable (or a function of one).  If you specify e.g.
\code{pol(pmin(age,10),3)}, a cubic polynomial will be fitted in \code{pmin(age,10)}
(\code{pmin} is the S vector element--by--element minimum function).
The predictor will be labeled \code{age} in the output, and plots will have
\code{age} in its original units on the axes. If you use a function such as
\code{pmin}, the predictor is taken as the first argument, and other arguments
must be defined in the frame in effect when predicted values, etc., are
computed.
}
\item{parms}{
parameters of transformation (e.g. number or location of knots).
For \code{pol} the argument is the order of the polynomial,
e.g. \code{2} for quadratic (the usual default). For \code{lsp} it is a
vector of knot locations (\code{lsp} will not estimate knot locations).
For \code{rcs} it is the
number of knots (if scalar), or vector of knot locations (if \code{>2} elements).
The default number is the \code{nknots} system option if \code{parms} is not given.
If the number of knots is given,
locations are computed for that number of knots.
For \code{catg}, \code{parms} is the
category labels (not needed if variable is an S category or factor variable). If
omitted, \code{catg} will use \code{unique(x)}, or \code{levels(x)} if \code{x} is a \code{category}
or a \code{factor}.
For \code{scored}, \code{parms} is a
vector of unique values of variable (uses \code{unique(x)} by default).
This is not needed if \code{x} is an S \code{ordered} variable.
For \code{strat}, \code{parms} is the category labels (not needed if variable is an S category variable). If
omitted, will use \code{unique(x)}, or \code{levels(x)} if \code{x} is
\code{category} or \code{factor}.
\code{parms} is not used for \code{matrx}.
}
\item{label}{
label of predictor for plotting (default = \code{"label"} attribute or variable
name)
}
\item{name}{
Name to use for predictor in model. Default is name of argument to
function
}
\item{x1}{}
\item{x2}{two continuous variables for which to form a
  non-doubly-nonlinear interaction}
\item{\dots}{a variety of things}
}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link[Hmisc]{rcspline.eval}}, \code{\link[Hmisc]{rcspline.restate}}, \code{\link{Design}}, \code{\link{cph}}, \code{\link{lrm}}, \code{\link{ols}}, \code{\link{datadist}}
}
\examples{
\dontrun{
options(knots=4, poly.degree=2)
country <- factor(country.codes)
blood.pressure <- cbind(sbp=systolic.bp, dbp=diastolic.bp)
fit <- lrm(Y ~ sqrt(x1)*rcs(x2) + rcs(x3,c(5,10,15)) + 
       lsp(x4,c(10,20)) + country + blood.pressure + poly(age,2))
# sqrt(x1) is an implicit asis variable, but limits of x1, not sqrt(x1)
#       are used for later plotting and effect estimation
# x2 fitted with restricted cubic spline with 4 default knots
# x3 fitted with r.c.s. with 3 specified knots
# x4 fitted with linear spline with 2 specified knots
# country is an implied catg variable
# blood.pressure is an implied matrx variable
# since poly is not a Design function (pol is), it creates a
#       matrx type variable with no automatic linearity testing
#       or plotting
f1 <- lrm(y ~ rcs(x1) + rcs(x2) + rcs(x1) \%ia\% rcs(x2))
# \%ia\% restricts interactions. Here it removes terms nonlinear in
# both x1 and x2
f2 <- lrm(y ~ rcs(x1) + rcs(x2) + x1 \%ia\% rcs(x2))
# interaction linear in x1
f3 <- lrm(y ~ rcs(x1) + rcs(x2) + x1 \%ia\% x2)
# simple product interaction (doubly linear)
# Use x1 \%ia\% x2 instead of x1:x2 because x1 \%ia\% x2 triggers
# anova to pool x1*x2 term into x1 terms to test total effect
# of x1
}
}
\keyword{models}
\keyword{regression}
\keyword{math}
\keyword{manip}
\keyword{methods}
\keyword{survival}
\keyword{smooth}
\concept{logistic regression model}
\concept{transformation}

\eof
\name{Function}
\alias{Function.Design}
\alias{Function.cph}
\alias{sascode}
\title{
Compose an S Function to Compute X beta from a Fit
}
\description{
\code{Function} is a class of functions for creating other S functions.
\code{Function.Design} is the method for creating S functions to compute
X beta, based on a model fitted with \code{Design} in effect.  
Like \code{latex.Design}, \code{Function.Design} simplifies restricted cubic
spline functions and factors out terms in second-order interactions.
\code{Function.Design} will not work for models that have third-order
interactions involving restricted cubic splines.
\code{Function.cph} is a particular method for handling fits from \code{cph}, for
which an intercept (the negative of the centering constant) is added to 
the model.  \code{sascode} is a function that takes an S function such
as one created by \code{Function} and does most of the editing
to turn the function definition into
a fragment of SAS code for computing X beta from the fitted model, along
with assignment statements that initialize predictors to reference values.
}
\usage{
\method{Function}{Design}(object, intercept=NULL, digits=max(8,
.Options$digits), \dots)
\method{Function}{cph}(object, intercept=-object$center, \dots)

# Use result as fun(predictor1=value1, predictor2=value2, \dots)

sascode(object, file='', append=FALSE)
}
\arguments{
\item{object}{
a fit created with \code{Design} in effect
}
\item{intercept}{
an intercept value to use (not allowed to be specified to \code{Function.cph}).
The intercept is usually retrieved from the regression coefficients
automatically.
}
\item{digits}{
number of significant digits to use for coefficients and knot locations
}
\item{file}{
name of a file in which to write the SAS code.  Default is to write to
standard output.
}
\item{append}{
set to \code{TRUE} to have \code{sascode} append code to an existing file named
\code{file}.
}
\item{\dots}{arguments to pass to \code{Function.Design} from
  \code{Function.cph}}
}
\value{
\code{Function} returns an S-Plus function that can be invoked in any
usual context.  The function has one argument per predictor variable,
and the default values of the predictors are set to \code{adjust-to} values
(see \code{datadist}).  Multiple predicted X beta values may be calculated
by specifying vectors as arguments to the created function.
All non-scalar argument values must have the same length.
}
\author{
Frank Harrell\cr
Department of Biostatistics\cr
Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{latex.Design}}, \code{\link[Hmisc]{Function.transcan}}, \code{\link{predict.Design}}, \code{\link{Design}}, \code{\link{Design.trans}}
}
\examples{
set.seed(1331)
x1 <- exp(rnorm(100))
x2 <- factor(sample(c('a','b'),100,rep=TRUE))
dd <- datadist(x1, x2)
options(datadist='dd')
y  <- log(x1)^2+log(x1)*(x2=='b')+rnorm(100)/4
f  <- ols(y ~ pol(log(x1),2)*x2)
f$coef
g  <- Function(f, digits=5)
g
sascode(g)
g()
g(x1=c(2,3), x2='b')   #could omit x2 since b is default category
predict(f, expand.grid(x1=c(2,3),x2='b'))
g8 <- Function(f)   # default is 8 sig. digits
g8(x1=c(2,3), x2='b')
options(datadist=NULL)


\dontrun{
# Make self-contained functions for computing survival probabilities
# using a log-normal regression
f <- psm(Surv(d.time, death) ~ rcs(age,4)*sex, dist='gaussian')
g <- Function(f)
surv <- Survival(f)
# Compute 2 and 5-year survival estimates for 50 year old male
surv(c(2,5), g(age=50, sex='male'))
}
}
\keyword{regression}
\keyword{methods}
\keyword{interface}
\keyword{models}
\keyword{survival}
\keyword{math}
\concept{logistic regression model}

\eof
\name{Overview}
\alias{Overview}
\alias{Design.Overview}
\title{
	Overview of Design Library
}
\description{
Design does regression modeling,
testing, estimation, validation, graphics,
prediction, and typesetting by storing enhanced model
design attributes in the fit.

Design is a collection of about 180 functions that assist and
streamline modeling, especially for biostatistical and epidemiologic
applications.  It also contains new functions for binary and ordinal
logistic regression models and the Buckley-James multiple regression
model for right-censored responses, and implements penalized maximum
likelihood estimation for logistic and ordinary linear models.  Design
works with almost any regression model, but it was especially written
to work with logistic regression, Cox regression, accelerated failure
time models, ordinary linear models, and the Buckley-James model.
You should install the Hmisc library before using
Design, as a few of Design's options use Hmisc functions, and Hmisc
has several functions useful for data analysis (especially data
reduction and imputation).
}

\section{Statistical Methods Implemented}{
\itemize{
\item Ordinary linear regression models
\item Binary and ordinal logistic models (proportional odds
  and continuation ratio models)
\item Cox model
\item Parametric survival models in the accelerated failure
  time class
\item Buckley-James least-squares linear regression model
  with possibly right-censored responses
\item Bootstrap model validation to obtain unbiased
  estimates of model performance without requiring a
  separate validation sample
\item Automatic Wald tests of all effects in the model that
  are not parameterization-dependent (e.g., tests of
  nonlinearity of main effects when the variable does
  not interact with other variables, tests of
  nonlinearity of interaction effects, tests for
  whether a predictor is important, either as a main
  effect or as an effect modifier)
\item Graphical depictions of model estimates (effect
  plots, odds/hazard ratio plots, nomograms that
  allow model predictions to be obtained manually even
  when there are nonlinear effects and interactions
  in the model)
\item Various smoothed residual plots, including some new
  residual plots for verifying ordinal logistic model
  assumptions
\item Composing S functions to evaluate the linear
  predictor (\eqn{X\hat{\beta}}{X*beta hat}), hazard function, survival
  function, quantile functions analytically from the
  fitted model
\item Typesetting of fitted model using LaTeX
\item Robust covariance matrix estimation (Huber or
  bootstrap)
\item Cubic regression splines with linear tail restrictions (natural splines)
\item Tensor splines
\item Interactions restricted to not be doubly nonlinear
\item Penalized maximum likelihood estimation for ordinary
  linear regression and logistic regression models.
  Different parts of the model may be penalized by
  different amounts, e.g., you may want to penalize
  interaction or nonlinear effects more than main
  effects or linear effects
\item Estimation of hazard or odds ratios in presence of
  nonlinearity and interaction
\item Sensitivity analysis for an unmeasured binary confounder in a
  binary logistic model
\item Multiple imputation of repeated measures data with
  non-random dropout using propensity score matching (experimental, not yet
  functional)
}
}

\section{Motivation}{
Design was motivated by the following needs:
\itemize{
\item need to automatically print interesting Wald tests that can be
constructed from the design
  \itemize{
  \item tests of linearity with respect to each predictor
  \item tests of linearity of interactions
  \item pooled interaction tests (e.g., all interactions involving race)
  \item pooled tests of effects with higher order effects
    \itemize{
    \item test of main effect not meaningful when effect in interaction
    \item pooled test of main effect + interaction effect is meaningful
    \item test of 2nd-order interaction + any 3rd-order interaction containing
    those factors is meaningful
  }
  }

\item need to store transformation parameters with the fit
\itemize{
  \item example: knot locations for spline functions
  \item these are "remembered" when getting predictions, unlike standard
  S or \R
  \item  for categorical predictors, save levels so that same dummy variables
    will be generated for predictions; check that all levels in out-of-data
    predictions were present when model was fitted
  }

\item need for uniform re-insertion of observations deleted because of NAs
  when using \code{predict} without \code{newdata} or when using
  \code{resid}

\item need to easily plot the regression effect of any predictor
  \itemize{
  \item example: age is represented by a linear spline with knots at 40 and 60y;
             plot effect of age on log odds of disease, adjusting
             interacting factors to easily specified constants
  \item  vary 2 predictors: plot x1 on x-axis, separate curves for discrete
             x2 or 3d perspective plot for continuous x2
  \item if predictor is represented as a function in the model, plots
    should be with respect to the original variable:\cr
             \code{f <- lrm(y ~ log(cholesterol)+age)} \cr
             \code{plot(f, cholesterol=NA)   # cholesterol on x-axis, default range}
}

\item need to store summary of distribution of predictors with the fit
  \itemize{
  \item plotting limits (default: 10th smallest, 10th largest values or \%-tiles)
  \item effect limits   (default: .25 and .75 quantiles for continuous vars.)
  \item  adjustment values for other predictors (default: median for continuous
    predictors, most frequent level for categorical ones)
  \item discrete numeric predictors: list of possible values;
    example: x=0,1,2,3,5 -> by default don't plot prediction at x=4
  \item values are on the inner-most variable, e.g. cholesterol, not log(chol.)
  \item allows estimation/plotting long after original dataset has been deleted
  \item  for Cox models, underlying survival also stored with fit, so original
  data not needed to obtain predicted survival curves
  }

\item need to automatically print estimates of effects in presence of
non-linearity and interaction
  \itemize{
  \item example: age is quadratic, interacting with sex;
             default effect is inter-quartile-range hazard ratio (for
             Cox model), for sex=reference level
  \item user-controlled effects: \code{summary(fit, age=c(30,50),
	sex="female")} -> odds ratios for logistic model, relative survival time
                for accelerated failure time survival models
  \item effects for all variables (e.g. odds ratios) may be plotted with
    multiple-confidence-level bars
}

\item need for prettier and more concise effect names in printouts,
especially for expanded nonlinear terms and interaction terms
 \itemize{
  \item use inner-most variable name to identify predictors
  \item e.g. for \code{pmin(x^2-3,10)} refer to factor with legal S-name
  \code{x}
  }

\item need to recognize that an intercept is not always a simple
  concept
  \itemize{  
  \item some models (e.g., Cox) have no intercept
  \item some models (e.g., ordinal logistic) have multiple intercepts
}

\item need for automatic high-quality printing of fitted mathematical
  model (with dummy variables defined, regression spline terms
  simplified, interactions "factored").  Focus is on regression splines
  instead of nonparametric smoothers or smoothing splines, so that
  explicit formulas for fit may be obtained for use outside S.
  Design can also compose S functions to evaluate \eqn{X\beta}{X*Beta} from
  the fitted model analytically, as well as compose SAS code to
  do this.

\item need for automatic drawing of nomogram to represent the fitted model

\item need for automatic bootstrap validation of a fitted model, with
  only one S command (with respect to calibration and discrimination)

\item need for robust (Huber sandwich) estimator of covariance matrix,
  and be able to do all other analysis (e.g., plots, C.L.) using the
  adjusted covariances

\item need for robust (bootstrap) estimator of covariance matrix, easily
  used in other analyses without change

\item need for Huber sandwich and bootstrap covariance matrices adjusted
  for cluster sampling

\item need for routine reporting of how many observations were deleted
  by missing values on each predictor (see \code{na.delete} in Hmisc)

\item need for optional reporting of descriptive statistics for Y stratified
  by missing status of each X (see \code{na.detail.response})

\item need for pretty, annotated survival curves, using the same commands
  for parametric and Cox models

\item need for ordinal logistic model (proportional odds model, continuation
  ratio model)
}}

\details{
To make use of automatic typesetting features you must
have LaTeX or one of its variants installed.\cr

Some aspects of Design (e.g., \code{latex}) will not work correctly if
\code{options(contrasts=)} other than \code{c("contr.treatment",
  "contr.poly")} are used.

Design relies on a wealth of survival analysis
functions written by Terry Therneau of Mayo Clinic.
Front-ends have been written for several of
Therneau's functions, and other functions have been
slightly modified.
}

\section{Fitting Functions Compatible with Design}{
Design will work with a wide variety of fitting
functions, but it is meant especially for the
following:
\tabular{lll}{
\bold{Function} \tab \bold{Purpose} \tab  \bold{Related S}\cr
                \tab                \tab  \bold{Functions}\cr
\bold{\code{ols}}         \tab Ordinary least squares linear model     \tab \code{lm}\cr
\bold{\code{lrm}}         \tab Binary and ordinal logistic regression  \tab \code{glm}\cr
            \tab model                                   \tab \code{cr.setup}\cr
\bold{\code{psm}}         \tab Accelerated failure time parametric     \tab \code{survreg}\cr
            \tab survival model                          \tab \cr
\bold{\code{cph}}         \tab Cox proportional hazards regression     \tab \code{coxph}\cr
\bold{\code{bj}}          \tab Buckley-James censored least squares    \tab \code{survreg}\cr
            \tab linear model                            \tab \cr
\bold{\code{glmD}}        \tab Version of glm for use with Design \tab \cr
\bold{\code{glsD}}        \tab Version of gls for use with Design \tab \cr
}
}

\section{Methods in Design}{
The following generic functions work with fits with Design in effect:
\tabular{lll}{
\bold{Function}           \tab  \bold{Purpose} \tab \bold{Related}\cr
                          \tab                 \tab \bold{Functions}\cr
\bold{\code{print}}       \tab Print parameters and statistics of fit \tab \cr
\bold{\code{coef}}        \tab Fitted regression coefficients  \tab \cr
\bold{\code{formula}}     \tab Formula used in the fit \tab \cr
\bold{\code{specs}}       \tab Detailed specifications of fit \tab \cr
\bold{\code{robcov}}      \tab Robust covariance matrix estimates \tab \cr
\bold{\code{bootcov}}     \tab Bootstrap covariance matrix estimates \tab \cr
\bold{\code{summary}}     \tab Summary of effects of predictors \tab \cr
\bold{\code{plot.summary}} \tab Plot continuously shaded confidence \tab \cr
                          \tab bars for results of summary  \tab \cr
\bold{\code{anova}}       \tab Wald tests of most meaningful hypotheses \tab \cr
\bold{\code{contrast}}    \tab General contrasts, C.L., tests           \tab \cr
\bold{\code{plot.anova}}  \tab Depict results of anova graphically      \tab \code{dotchart}     \cr
\bold{\code{plot}}        \tab Plot effects of predictors \tab \cr
\bold{\code{gendata}}     \tab Generate data frame with predictor       \tab \code{expand.grid} \cr
                          \tab combinations (optionally interactively) \tab \cr
\bold{\code{predict}}     \tab Obtain predicted values or design matrix \tab \cr
\bold{\code{fastbw}}      \tab Fast backward step-down variable            \tab \code{step} \cr
                          \tab selection \tab \cr
\bold{\code{residuals}}   \tab Residuals, influence statistics from fit \tab \cr
(or \bold{\code{resid}})  \tab                         \tab \cr
\bold{\code{which.influence}} 
                          \tab Which observations are overly               \tab \code{residuals} \cr
                          \tab influential \tab \cr
\bold{\code{sensuc}}      \tab Sensitivity of one binary predictor in \tab \cr
                          \tab lrm and cph models to an unmeasured \tab \cr
                          \tab binary confounder \tab \cr
\bold{\code{latex}}       \tab LaTeX representation of fitted              \tab \cr
                          \tab model or \code{anova} or \code{summary} table \tab \cr
\bold{\code{Function}}    \tab S function analytic representation          \tab \code{Function.transcan} \cr
                          \tab of a fitted regression model (\eqn{X\beta}{X*Beta}) \tab \cr
\bold{\code{hazard}}      \tab S function analytic representation          \tab \code{rcspline.restate} \cr
            \tab of a fitted hazard function (for \code{psm}) \tab \cr
\bold{\code{Survival}}    \tab S function analytic representation of \tab \cr
                          \tab fitted survival function (for \code{psm,cph}) \tab \cr
\bold{\code{Quantile}}    \tab S function analytic representation of \tab \cr
                          \tab fitted function for quantiles of \tab \cr
                          \tab survival time (for \code{psm, cph}) \tab \cr
\bold{\code{nomogram}}    \tab Draws a nomogram for the fitted model       \tab \code{latex, plot} \cr
\bold{\code{survest}}     \tab Estimate survival probabilities             \tab \code{survfit} \cr
                          \tab  (for \code{psm, cph}) \tab \cr
\bold{\code{survplot}}    \tab Plot survival curves (for \code{psm, cph})  \tab \code{plot.survfit} \cr
\bold{\code{validate}}    \tab Validate indexes of model fit using         \tab \code{val.prob} \cr
                          \tab resampling \tab \cr
\bold{\code{calibrate}}   \tab Estimate calibration curve for model \tab \cr
                          \tab using resampling \tab \cr
\bold{\code{vif}}         \tab Variance inflation factors for a fit \tab \cr
\bold{\code{naresid}}     \tab Bring elements corresponding to missing  \tab \cr
                          \tab data back into predictions and residuals \tab \cr
\bold{\code{naprint}}     \tab Print summary of missing values \tab \cr
\bold{\code{pentrace}}    \tab Find optimum penalty for penalized MLE \tab \cr
\bold{\code{effective.df}}
                          \tab Print effective d.f. for each type of  \tab \cr
                          \tab variable in model, for penalized fit or  \tab \cr
                          \tab pentrace result \tab \cr
\bold{\code{rm.impute}}   \tab Impute repeated measures data with     \tab \code{transcan}, \cr
                          \tab non-random dropout \tab \code{fit.mult.impute} \cr
                          \tab \emph{experimental, non-functional} \tab
  }
}

\section{Background for Examples}{
The following programs demonstrate how the pieces of
the Design package work together.  A (usually)
one-time call to the function \code{datadist} requires a
pass at the entire data frame to store distribution
summaries for potential predictor variables.  These
summaries contain (by default) the .25 and .75
quantiles of continuous variables (for estimating
effects such as odds ratios), the 10th smallest and
10th largest values (or .1 and .9 quantiles for small
\eqn{n}) for plotting ranges for estimated curves, and the
total range.  For discrete numeric variables (those
having \eqn{\leq 10}{<=10} unique values), the list of unique values
is also stored.  Such summaries are used by the
\code{summary.Design, plot.Design}, and \code{nomogram.Design}
functions.  You may save time and defer running
\code{datadist}.  In that case, the distribution summary
is not stored with the fit object, but it can be
gathered before running \code{summary} or \code{plot}.

\code{d <- datadist(my.data.frame) # or datadist(x1,x2)}\cr
\code{options(datadist="d")        # omit this or use options(datadist=NULL)}\cr
\code{                             # if datadist has not been run yet}\cr
\code{f <- ols(y ~ x1 * x2)}\cr
\code{anova(f)}\cr
\code{fastbw(f)}\cr
\code{predict(f, newdata)}

In the \bold{Examples} section there are three detailed examples using a
fitting function 
designed to be used with Design, \code{lrm} (logistic
regression model).  In \bold{Detailed Example 1} we
create 3 predictor variables and two binary responses
on 500 subjects.  For the first binary response, \code{dz},
the true model involves only \code{sex} and \code{age}, and there is
a nonlinear interaction between the two because the log
odds is a truncated linear relationship in \code{age} for
females and a quadratic function for males.  For the
second binary outcome, \code{dz.bp}, the true population model
also involves systolic blood pressure (\code{sys.bp}) through
a truncated linear relationship.  First, nonparametric
estimation of relationships is done using the Hmisc
library's \code{plsmo} function which uses \code{lowess} with outlier
detection turned off for binary responses.  Then
parametric modeling is done using restricted cubic
splines.  This modeling does not assume that we know
the true transformations for \code{age} or \code{sys.bp} but that
these transformations are smooth (which is not actually
the case in the population).

For \bold{Detailed Example 2}, suppose that a
categorical variable \code{treat} has values \code{"a", "b"}, and
\code{"c"}, an ordinal variable \code{num.diseases} has values
0,1,2,3,4, and that there are two continuous variables,
\code{age} and \code{cholesterol}.  \code{age} is fitted with a restricted
cubic spline, while \code{cholesterol} is transformed using
the transformation \code{log(cholesterol - 10)}.  Cholesterol
is missing on three subjects, and we impute these using
the overall median cholesterol.  We wish to allow for
interaction between \code{treat} and \code{cholesterol}.  The
following S program will fit a logistic model,
test all effects in the design, estimate effects, and
plot estimated transformations. The fit for
\code{num.diseases} really considers the variable to be a
5-level categorical variable. The only difference is
that a 3 d.f. test of linearity is done to assess
whether the variable can be re-modeled "asis".  Here
we also show statements to attach the Design library
and store predictor characteristics from datadist.

\bold{Detailed Example 3} shows some of the survival
analysis capabilities of Design related to the Cox
proportional hazards model.  We simulate data for 2000
subjects with 2 predictors, \code{age} and \code{sex}.  In the true
population model, the log hazard function is linear in
\code{age} and there is no \code{age} \eqn{\times}{x} \code{sex} interaction.  In the 
analysis below we do not make use of the linearity in
age.  Design makes use of many of Terry Therneau's
survival functions that are built in to S.

The following is a typical sequence of steps that
would be used with Design in conjunction with the Hmisc
\code{transcan} function to do single imputation of all NAs in the
predictors (multiple imputation would be better but would be
harder to do in the context of bootstrap model validation),
fit a model, do backward stepdown to reduce the number of
predictors in the model (with all the severe problems this can
entail), and use the bootstrap to validate this stepwise model,
repeating the variable selection for each re-sample.  Here we
take a short cut as the imputation is not repeated within the
bootstrap.

In what follows we (atypically) have only 3
candidate predictors.  In practice be sure to have the
validate and calibrate functions operate on a model fit that
contains all predictors that were involved in previous analyses
that used the response variable.  Here the imputation
is necessary because backward stepdown would otherwise delete
observations missing on any candidate variable.

Note that you would have to define \code{x1, x2, x3, y} to run
the following code.

\code{xt <- transcan(~ x1 + x2 + x3, imputed=TRUE)}\cr
\code{impute(xt)  # imputes any NAs in x1, x2, x3}\cr
\code{# Now fit original full model on filled-in data}\cr
\code{f <- lrm(y ~ x1 + rcs(x2,4) + x3, x=TRUE, y=TRUE) #x,y allow boot.}\cr
\code{fastbw(f)}\cr
\code{# derives stepdown model (using default stopping rule)}\cr
\code{validate(f, B=100, bw=TRUE) # repeats fastbw 100 times}\cr
\code{cal <- calibrate(f, B=100, bw=TRUE)  # also repeats fastbw}\cr
\code{plot(cal)}
}

\examples{
######################
# Detailed Example 1 #
######################
# Simulate 500 subjects with predictors sex, age and sys.bp and two
# binary responses (dz, dz.bp), then describe the data before modeling.
# May want to first invoke the Hmisc store function
# so that new variables will go into a temporary directory
set.seed(17)  # So can repeat random number sequence
n <- 500

# replace= is spelled out in full instead of relying on partial
# argument matching of rep=
sex    <- factor(sample(c('female','male'), n, replace=TRUE))
age    <- rnorm(n, 50, 10)
sys.bp <- rnorm(n, 120, 7)

# Use two population models, one with a systolic
# blood pressure effect and one without.
# L is the true log odds: truncated linear in age for females,
# quadratic in age for males.  L.bp adds a truncated linear
# effect of sys.bp.

L    <- ifelse(sex=='female', .1*(pmin(age,50)-50), .005*(age-50)^2)
L.bp <- L + .4*(pmax(sys.bp,120)-120)

# Draw each binary response with probability plogis(log odds)
dz    <- ifelse(runif(n) <= plogis(L),    1, 0)
dz.bp <- ifelse(runif(n) <= plogis(L.bp), 1, 0)

# Use summary.formula in the Hmisc library to summarize the
# data one predictor at a time

s <- summary(dz.bp ~ age + sex + sys.bp) 
options(digits=3)
print(s)
plot(s)

# Nonparametric (lowess) estimates of dz vs. age by sex on the logit
# scale, then overlay the true log odds L as raw (unsmoothed) curves
plsmo(age, dz, group=sex, fun=qlogis, ylim=c(-3,3))
plsmo(age, L,  group=sex, method='raw', add=TRUE, prefix='True', trim=0)
title('Lowess-smoothed Estimates with True Regression Functions')

# Store distribution summaries of the predictors and tell Design
# where to find them
dd <- datadist(age, sex, sys.bp)
options(datadist='dd')
# can also do: dd <- datadist(dd, newvar)

# Fit a binary logistic model for dz with a 5-knot restricted cubic
# spline in age interacting with sex
f <- lrm(dz ~ rcs(age,5)*sex, x=TRUE, y=TRUE)
f
# x=TRUE, y=TRUE for pentrace

# Compose an S function that evaluates the linear predictor of the
# fit, print it, and evaluate it at age 30 for each sex
fpred <- Function(f)
fpred
fpred(age=30, sex=levels(sex))

# Wald tests
anova(f)

# Plot the fitted age effect for each sex without confidence bands,
# then add data density displays for the predictors
p <- plot(f, age=NA, sex=NA, conf.int=FALSE, ylim=c(-3,3))
datadensity(p, age, sex)
scat1d(age)

# Overlay the true log odds L for comparison with the spline fit
plsmo(age, L, group=sex, method='raw', add=TRUE, prefix='True', trim=0)
title('Spline Fits with True Regression Functions')

# Model for the second outcome, adding a 5-knot spline in sys.bp
f.bp <- lrm(dz.bp ~ rcs(age,5)*sex + rcs(sys.bp,5))

# Display the age by sys.bp surface with each of two plotting methods
for(method in c('persp','image')) 
  p <- plot(f.bp, age=NA, sys.bp=NA, method=method)
# Legend(p)   # NOTE: Needs subplot - not in R

cat('Doing 25 bootstrap repetitions to validate model\n')
validate(f, B=25)   # in practice try to use 150

cat('Doing 25 bootstrap reps to check model calibration\n')
cal <- calibrate(f, B=25)   # use 150 in practice
plot(cal)
title('Calibration of Unpenalized Model')

# Search for the optimum penalty for penalized maximum likelihood
# estimation.  .R. is an internal Design flag; presumably it is TRUE
# when running under R, where a grid of penalties is tried instead of
# method='optimize' - TODO confirm
p <- if(.R.) pentrace(f, penalty=c(.009,.009903,.02,.2,.5,1)) else
             pentrace(f, penalty=1, method='optimize')

# Re-fit the model using the penalty chosen by pentrace
f <- update(f, penalty=p$penalty)
f
specs(f,long=TRUE)
# Effective d.f. for each type of variable in the penalized fit
edf <- effective.df(f)

# Repeat the earlier effect plot, now for the penalized fit
p <- plot(f, age=NA, sex=NA, conf.int=FALSE, ylim=c(-3,3))
datadensity(p, age, sex)
scat1d(age)

plsmo(age, L, group=sex, method='raw', add=TRUE, prefix='True', trim=0)
title('Penalized Spline Fits with True Regression Functions')

# Summarize effects of predictors using default settings
options(digits=3)
s <- summary(f)
s
plot(s)

# Same summary with sex set to male instead of the default level
s <- summary(f, sex='male')
plot(s)

# Compose the fitted linear predictor as a function again, now from
# the penalized fit, and pass it to sascode
fpred <- Function(f)
fpred
fpred(age=30, sex=levels(sex))
sascode(fpred)

cat('Doing 40 bootstrap reps to validate penalized model\n')
validate(f, B=40)

cat('Doing 40 bootstrap reps to check penalized model calibration\n')
cal <- calibrate(f, B=40)
plot(cal)
title('Calibration of Penalized Model')

# Nomogram with an added axis that converts the linear predictor to a
# probability via plogis.
# NOTE(review): the nomogram is drawn for f.bp, the model for dz.bp,
# while the axis label reads Prob(dz) - confirm the intended label
nomogram(f.bp, fun=plogis,
         funlabel='Prob(dz)',
         fun.at=c(.15,.2,.3,.4,.5,.6,.7,.8,.9,.95,.975),
         fun.side=c(1,3,1,3,1,3,1,3,1,3,1))
# Clear the datadist setting used by this example
options(datadist=NULL)

######################
# Detailed Example 2 #
######################
# Simulate the data: a 3-level treatment, a 0-4 count of diseases,
# continuous age, cholesterol and weight, and sex
n <- 1000    # define sample size
set.seed(17) # so can reproduce the results
treat <- factor(sample(c('a','b','c'), n, TRUE))
num.diseases <- sample(0:4, n, TRUE)
age <- rnorm(n, 50, 10)
cholesterol <- rnorm(n, 200, 25)
weight <- rnorm(n, 150, 20)
sex <- factor(sample(c('female','male'), n, TRUE))
# Attach descriptive labels and units to the variables
label(age) <- 'Age'      # label is in Hmisc
label(num.diseases) <- 'Number of Comorbid Diseases'
label(cholesterol) <- 'Total Cholesterol'
label(weight) <- 'Weight, lbs.'
label(sex) <- 'Sex'
units(cholesterol) <- 'mg/dl'   # uses units.default in Hmisc


# Specify population model for log odds that Y=1
# (the cholesterol effect has a different slope for each treatment)
L <- .1*(num.diseases-2) + .045*(age-50) +
  (log(cholesterol - 10)-5.2)*(-2*(treat=='a') +
  3.5*(treat=='b')+2*(treat=='c'))
# Simulate binary y to have Prob(y=1) = 1/[1+exp(-L)]
y <- ifelse(runif(n) < plogis(L), 1, 0)
cholesterol[1:3] <- NA   # 3 missings, at random

# Distribution summaries are computed before imputation
ddist <- datadist(cholesterol, treat, num.diseases,
                  age, weight, sex)
# Could have used ddist <- datadist(data.frame.name)
options(datadist="ddist") # defines data dist. to Design
cholesterol <- impute(cholesterol) # see impute in Hmisc library
# impute, describe, and several other basic functions are
# distributed as part of the Hmisc library


# Logistic model: treat interacting with transformed cholesterol,
# num.diseases as a scored variable, and a spline in age
fit <- lrm(y ~ treat*log(cholesterol - 10) +
           scored(num.diseases) +  rcs(age))


describe(y ~ treat + scored(num.diseases) + rcs(age))
# or use describe(formula(fit)) for all variables used in fit
# describe function (in Hmisc) gets simple statistics on variables
#fit <- robcov(fit) # Would make all statistics which follow
                    # use a robust covariance matrix
                    # would need x=TRUE, y=TRUE in lrm
specs(fit) # Describe the design characteristics
a <- anova(fit)
print(a, which='subscripts')          # print which parameters being tested
plot(anova(fit)) # Depict Wald statistics graphically
anova(fit, treat, cholesterol) # Test these 2 by themselves
summary(fit) # Estimate effects using default ranges
plot(summary(fit)) # Graphical display of effects with C.L.
summary(fit, treat="b", age=60) 
# Specify reference cell and adjustment val


summary(fit, age=c(50,70)) # Estimate effect of increasing age from
                           # 50 to 70
summary(fit, age=c(50,60,70)) # Increase age from 50 to 70, 
                              # adjust to 60 when estimating 
                              # effects of other factors
# If had not defined datadist, would have to define
# ranges for all var.


# Estimate and test treatment (b-a) effect averaged
# over 3 cholesterols
contrast(fit, list(treat='b',cholesterol=c(150,200,250)),
              list(treat='a',cholesterol=c(150,200,250)),
         type='average')
# Remove type='average' to get 3 separate contrasts for b-a


# Plot effects.  plot(fit) plots effects of all predictors,
# showing values used for interacting factors as subtitles
# The ref.zero parameter is helpful for showing effects of
# predictors on a common scale for comparison of strength
plot(fit, ref.zero=TRUE, ylim=c(-2,2))


# length.out= is spelled out in full instead of relying on partial
# argument matching of length=
plot(fit, age=seq(20,80,length.out=100), treat=NA, conf.int=FALSE)
# Plots relationship between age and log
# odds, separate curve for each treat, no C.I.
plot(fit, age=NA, cholesterol=NA)
# 3-dimensional perspective plot for age, cholesterol, and
# log odds using default ranges for both variables
plot(fit, num.diseases=NA, fun=function(x) 1/(1+exp(-x)),  #or fun=plogis
     ylab="Prob", conf.int=.9)   
# Plot estimated probabilities instead of log odds
# Again, if no datadist were defined, would have to
# tell plot all limits

# Predicted log odds over an explicit grid of predictor settings
logit <- predict(fit, expand.grid(treat="b",num.diseases=1:3,
                 age=c(20,40,60),
                 cholesterol=seq(100,300,length.out=10)))
# Alternative (not run): interactively specify 12 predictor
# combinations using UNIX
#logit <- predict(fit, gendata(fit, nobs=12))
# For UNIX or Windows, generate 9 combinations with other variables
# set to defaults, get predicted values
logit <- predict(fit, gendata(fit, age=c(20,40,60),
                 treat=c('a','b','c')))


# Since age does not interact with anything, we can quickly and
# interactively try various transformations of age,
# taking the spline function of age as the gold standard. We are
# seeking a linearizing transformation.  Here age is linear in the
# population so this is not very productive.  Also, if we simplify the
# model the total degrees of freedom will be too small and
# confidence limits too narrow


# Extract the age term of the linear predictor over ages 10 to 80,
# holding the other predictors fixed
ag <- 10:80
logit <- predict(fit, expand.grid(treat="a",
                 num.diseases=0, age=ag,
                 cholesterol=median(cholesterol)),
                 type="terms")[,"age"]
# Note: if age interacted with anything, this would be the age
# "main effect" ignoring interaction terms
# Could also use
#   logit <- plot(fit, age=ag, \dots)$x.xbeta[,2]
# which allows evaluation of the shape for any level
# of interacting factors.  When age does not interact with
# anything, the result from
# predict(fit, \dots, type="terms") would equal the result from
# plot if all other terms were ignored
# Could also use
#   logit <- predict(fit, gendata(fit, age=ag, cholesterol=median\dots))


plot(ag^.5, logit)  # try square root vs. spline transform.
plot(ag^1.5, logit) # try 1.5 power


# w <- latex(fit)  # invokes latex.lrm, creates fit.tex
# print(w)         # display or print model on screen


# Draw a nomogram for the model fit, with an extra axis showing
# the predicted probability
nomogram(fit, fun=plogis, funlabel="Prob[Y=1]")


# Compose S function to evaluate linear predictors from fit
g <- Function(fit)
g(treat='b', cholesterol=260, age=50)
# Leave num.diseases at reference value


# Use the Hmisc dataRep function to summarize sample
# sizes for subjects as cross-classified on 2 key
# predictors
drep <- dataRep(~ roundN(age,10) + num.diseases)
print(drep, long=TRUE)

# Some approaches to making a plot showing how
# predicted values vary with a continuous predictor
# on the x-axis, with two other predictors varying

# Re-fit with weight and sex added and without the treat interaction
fit <- lrm(y ~ log(cholesterol - 10) + 
           num.diseases + rcs(age) + rcs(weight) + sex)


# Build a data frame of predictor combinations for prediction
combos <- gendata(fit, age=10:100,
                  cholesterol=c(170,200,230),
                  weight=c(150,200,250))
# num.diseases, sex not specified -> set to mode
# can also use expand.grid


# Add the predicted values as a column, then plot them three ways
combos$pred <- predict(fit, combos)
library(lattice)
xyplot(pred ~ age | cholesterol*weight, data=combos)
xYplot(pred ~ age | cholesterol, groups=weight,
       data=combos, type='l') # in Hmisc
xYplot(pred ~ age, groups=interaction(cholesterol,weight),
       data=combos, type='l')


# Can also do this with plot.Design but a single
# plot may be busy:
# (one call per weight, the later calls adding to the first plot)
ch <- c(170, 200, 230)
plot(fit, age=NA, cholesterol=ch, weight=150,
     conf.int=FALSE)
plot(fit, age=NA, cholesterol=ch, weight=200,
     conf.int=FALSE, add=TRUE)
plot(fit, age=NA, cholesterol=ch, weight=250,
     conf.int=FALSE, add=TRUE)


#Here we use plot.Design to make 9 separate plots, with CLs
# (one plot per row of d, titled with its cholesterol and weight)
d <- expand.grid(cholesterol=c(170,200,230),
                 weight=c(150,200,250))
for(i in 1:nrow(d)) {
  plot(fit, age=NA, cholesterol=d$cholesterol[i],
       weight=d$weight[i])
  title(paste('Chol=',format(d$cholesterol[i]),' ',
              'Wt=',format(d$weight[i]),sep=''))
}
# Clear the datadist setting used by this example
options(datadist=NULL)

######################
# Detailed Example 3 #
######################
# Simulate right-censored survival data for 2000 subjects with
# two predictors, age and sex
n <- 2000
set.seed(731)
age <- 50 + 12*rnorm(n)
label(age) <- "Age"
# replace= is spelled out in full instead of relying on partial
# argument matching of rep=
sex <- factor(sample(c('Male','Female'), n, 
              replace=TRUE, prob=c(.6, .4)))
cens <- 15*runif(n)       # censoring times, uniform on 0 to 15
# Hazard is log-linear in age with an additive sex effect
h <- .02*exp(.04*(age-50)+.8*(sex=='Female'))
# Failure times generated as -log(U)/h, i.e. exponential with hazard h
t <- -log(runif(n))/h
label(t) <- 'Follow-up Time'
e <- ifelse(t<=cens,1,0)  # 1 if failure observed before censoring
t <- pmin(t, cens)        # observed follow-up time
units(t) <- "Year"
# Group age into 10 quantile groups; levels.mean=TRUE is used so that
# later plots can be made against mean age within each group
age.dec <- cut2(age, g=10, levels.mean=TRUE)
dd <- datadist(age, sex, age.dec)
options(datadist='dd')
Srv <- Surv(t,e)


# Fit a model that does not assume anything except
# that deciles are adequate representations of age
f <- cph(Srv ~ strat(age.dec)+strat(sex), surv=TRUE)
# surv=TRUE speeds up computations, and confidence limits when
# there are no covariables are still accurate.


# Plot log(-log 3-year survival probability) vs. mean age
# within age deciles and vs. sex
plot(f, age.dec=NA, sex=NA, time=3, 
     loglog=TRUE, val.lev=TRUE, ylim=c(-5,-1))


# Fit a model assuming proportional hazards for age and
# absence of age x sex interaction
f <- cph(Srv ~ rcs(age,4)+strat(sex), surv=TRUE)
survplot(f, sex=NA, n.risk=TRUE)
# Add ,age=60 after sex=NA to tell survplot use age=60
# Validate measures of model performance using the bootstrap
# First must add data (design matrix and Srv) to fit object
f <- update(f, x=TRUE, y=TRUE)
validate(f, B=10, dxy=TRUE, u=5)  # use t=5 for Dxy (only)
# Use B=150 in practice
# Validate model for accuracy of predicting survival at t=1
# Get Kaplan-Meier estimates by dividing subjects into groups
# of size 200 (for other values of u must put time.inc=u in
# call to cph)
cal <- calibrate(f, B=10, u=1, m=200)  # B=150 in practice
plot(cal)
# Check proportional hazards assumption for age terms
z <- cox.zph(f, 'identity')
print(z); plot(z)


# Re-fit this model without storing underlying survival
# curves for reference groups, but storing raw data with
# the fit (could also use f <- update(f, surv=FALSE, x=TRUE, y=TRUE))
f <- cph(Srv ~ rcs(age,4)+strat(sex), x=TRUE, y=TRUE) 
# Get accurate C.L. for any age
# Note: for evaluating shape of regression, we would not ordinarily
# bother to get 3-year survival probabilities - would just use X * beta
# We do so here to use same scale as nonparametric estimates
f
anova(f)
ages <- seq(20, 80, by=4)   # Evaluate at fewer points. Default is 100
                            # For exact C.L. formula n=100 -> much memory
# Same log-log 3-year survival scale and y-limits as the earlier
# decile plot
plot(f, age=ages, sex=NA, time=3, loglog=TRUE, ylim=c(-5,-1))


# Fit a model assuming proportional hazards for age but
# allowing for general interaction between age and sex
f <- cph(Srv ~ rcs(age,4)*strat(sex), x=TRUE, y=TRUE)
anova(f)
ages <- seq(20, 80, by=6)   
# Still fewer points - more parameters in model


# Plot 3-year survival probability (log-log and untransformed)
# vs. age and sex, obtaining accurate confidence limits
plot(f, age=ages, sex=NA, time=3, loglog=TRUE, ylim=c(-5,-1))
plot(f, age=ages, sex=NA, time=3)
# Having x=TRUE, y=TRUE in fit also allows computation of influence stats
r <- resid(f, "dfbetas")
which.influence(f)
# Use survest to estimate survival probabilities and
# confidence limits for selected subjects at 2, 4 and 6 years
survest(f, expand.grid(age=c(20,40,60), sex=c('Female','Male')),
        times=c(2,4,6), conf.int=.95)


# Create an S function srv that computes fitted
# survival probabilities on demand, for non-interaction model
f <- cph(Srv ~ rcs(age,4)+strat(sex), surv=TRUE)
srv <- Survival(f)
# Define functions to compute 3-year estimates as a function of
# the linear predictors (X*Beta), one function per sex stratum
surv.f <- function(lp) srv(3, lp, stratum="sex=Female")
surv.m <- function(lp) srv(3, lp, stratum="sex=Male")
# Create a function that computes quantiles of survival time
# on demand
quant <- Quantile(f)
# Define functions to compute median survival time
med.f <- function(lp) quant(.5, lp, stratum="sex=Female")
med.m <- function(lp) quant(.5, lp, stratum="sex=Male")
# Draw a nomogram to compute several types of predicted values
# (funlabel and fun.at entries are listed in the same order as fun)
nomogram(f, fun=list(surv.m, surv.f, med.m, med.f),
         funlabel=c("S(3 | Male)","S(3 | Female)",
                    "Median (Male)","Median (Female)"),
         fun.at=list(c(.8,.9,.95,.98,.99),c(.1,.3,.5,.7,.8,.9,.95,.98),
                   c(8,12),c(1,2,4,8,12)))
# Clear the datadist setting used by this example
options(datadist=NULL)

########################################################
# Simple examples using small datasets for checking    #
# calculations across different systems in which random#
# number generators cannot be synchronized.            #
########################################################

# Small fixed dataset: 20 observations, no random number generation
x1 <- 1:20
x2 <- abs(x1-10)
x3 <- factor(rep(0:2,length.out=20))
y  <- c(rep(0:1,8),1,1,1,1)
dd <- datadist(x1,x2,x3)
options(datadist='dd')
# Logistic model with a 3-knot restricted cubic spline in x1
f  <- lrm(y ~ rcs(x1,3) + x2 + x3)
f
specs(f, TRUE)
# Wald tests for all effects, then for x1 and x2 only
anova(f)
anova(f, x1, x2)
plot(anova(f))
s <- summary(f)
s
plot(s, log=TRUE)
# Effect plots on a 2 x 2 grid, then restore a single-plot layout
par(mfrow=c(2,2))
plot(f)
par(mfrow=c(1,1))
nomogram(f)
# Compose a function for the linear predictor and evaluate it
# with positional arguments
g <- Function(f)
g(11,7,'1')
contrast(f, list(x1=11,x2=7,x3='1'), list(x1=10,x2=6,x3='2'))
fastbw(f)
gendata(f, x1=1:5)
# w <- latex(f)

# Store the design matrix and response with the fit before running
# the influence, residual, robust covariance and resampling functions
f <- update(f, x=TRUE,y=TRUE)
which.influence(f)
residuals(f,'gof')
robcov(f)$var
validate(f, B=10)
cal <- calibrate(f, B=10)
plot(cal)

# Repeat the same sequence of checks with an ordinary least squares
# fit of the same formula
f <- ols(y ~ rcs(x1,3) + x2 + x3, x=TRUE, y=TRUE)
anova(f)
anova(f, x1, x2)
plot(anova(f))
s <- summary(f)
s
plot(s, log=TRUE)
par(mfrow=c(2,2))
plot(f)
par(mfrow=c(1,1))
nomogram(f)
g <- Function(f)
g(11,7,'1')
contrast(f, list(x1=11,x2=7,x3='1'), list(x1=10,x2=6,x3='2'))
fastbw(f)
gendata(f, x1=1:5)
# w <- latex(f)

# NOTE(review): the ols call above already specified x=TRUE, y=TRUE,
# so this update looks redundant - confirm before removing
f <- update(f, x=TRUE,y=TRUE)
which.influence(f)
residuals(f,'dfbetas')
robcov(f)$var
validate(f, B=10)
cal <- calibrate(f, B=10)
plot(cal)

# Survival response for the same 20 observations.  No event indicator
# is supplied; presumably all times are treated as uncensored -
# TODO confirm
S <- Surv(c(1,4,2,3,5,8,6,7,20,18,19,9,12,10,11,13,16,14,15,17))
survplot(survfit(S ~ x3))
# Parametric survival model using the same predictors as above
f <- psm(S ~ rcs(x1,3)+x2+x3, x=TRUE,y=TRUE)
f
# NOTE: LR chi-sq of 39.67 disagrees with that from old survreg
# and old psm (77.65); suspect were also testing sigma=1

# Print survival and hazard estimates with confidence limits at
# times 5 and 7 for one predictor setting
for(w in c('survival','hazard'))
 print(survest(f, data.frame(x1=7,x2=3,x3='1'), 
       times=c(5,7), conf.int=.95, what=w))
# S-Plus 2000 using old survival library:
#  S(t):.925 .684 SE:0.729 0.556 Hazard:0.0734 0.255

plot(f, x1=NA, time=5)
f$var
set.seed(3)
# robcov(f)$var when score residuals implemented
bootcov(f, B=30)$var
validate(f, B=10)
cal <- calibrate(f, u=5, B=10, m=10)
plot(cal)
r <- resid(f)
survplot(r)

f <- cph(S ~ rcs(x1,3)+x2+x3, x=TRUE,y=TRUE,surv=TRUE,time.inc=5)
f
plot(f, x1=NA, time=5)
robcov(f)$var
bootcov(f, B=10)
validate(f, B=10)
cal <- calibrate(f, u=5, B=10, m=10)
survplot(f, x1=c(2,19))
options(datadist=NULL)
}

\section{Common Problems to Avoid}{
\enumerate{
\item Don't have a formula like \code{y ~ age + age^2}.
   In S you need to connect related variables using
   a function which produces a matrix, such as \code{pol} or
   \code{rcs}.
   This allows effect estimates (e.g., hazard ratios)
   to be computed as well as multiple d.f. tests of
   association.

\item Don't use \code{poly} or \code{strata} inside formulas used in
   Design.  Use \code{pol} and \code{strat} instead.


\item Almost never code your own dummy variables or
   interaction variables in S.  Let S do this
   automatically.  Otherwise, \code{anova} can't do its
   job.

\item Almost never transform predictors outside of
   the model formula, as then plots of predicted
   values vs. predictor values, and other displays,
   would not be made on the original scale.  Use
   instead something like \code{y ~ log(cell.count+1)},
   which will allow \code{cell.count} to appear on
   \eqn{x}-axes.  You can get fancier, e.g.,
   \code{y ~ rcs(log(cell.count+1),4)} to fit a restricted
   cubic spline with 4 knots in \code{log(cell.count+1)}.
   For more complex transformations do something
   like \cr
   \code{f <- function(x) \{}\cr
   \code{\ldots various 'if' statements, etc.}\cr
   \code{log(pmin(x,50000)+1)}\cr
   \code{\}}\cr
   \code{fit1 <- lrm(death ~ f(cell.count))}\cr
   \code{fit2 <- lrm(death ~ rcs(f(cell.count),4))}

\item Don't put \code{$} inside variable names used in formulas.
   Either attach data frames or use \code{data=}.

\item Don't forget to use \code{datadist}.  Try to use it
   at the top of your program so that all model fits
   can automatically take advantage of its
   distributional summaries for the predictors.

\item Don't \code{validate} or \code{calibrate} models which were
   reduced by dropping "insignificant" predictors.
   Proper bootstrap or cross-validation must repeat
   any variable selection steps for each re-sample.
   Therefore, \code{validate} or \code{calibrate} models
   which contain all candidate predictors, and if
   you must reduce models, specify the option
   \code{bw=TRUE} to \code{validate} or \code{calibrate}.

\item Dropping of "insignificant" predictors ruins much
   of the usual statistical inference for
   regression models (confidence limits, standard
   errors, \eqn{P}-values, \eqn{\chi^2}{chi-squares}, ordinary indexes of
   model performance) and it also results in models
   which will have worse predictive discrimination.
 }
 }

\section{Accessing the Library}{
If you are using any of Design's survival analysis functions, create a
file called \code{.Rprofile} in your working directory that contains the
line \code{library(survival)}.  That way, survival will move down the
search list as Hmisc and Design are attached during your session.   This
will allow Hmisc and Design to override some of the survival functions such as
\code{survfit}.

Since the Design library has a \code{.First.lib} function,
that function will be executed by the \code{library}
command, to dynamically load the \code{.o} or \code{.obj} files.  You
may want to create a \code{.First} function such as

\code{.First <- function() \{}\cr
\code{options(na.action = "na.delete")}\cr
\code{# gives more info than na.omit}\cr
\code{library(Hmisc)}\cr
\code{library(Design)}\cr
\code{invisible()}\cr
\code{\}}
}

\references{
The primary resource for the Design library is
\emph{Regression Modeling Strategies} by
   FE Harrell (Springer-Verlag, 2001) and the web pages
   \url{http://biostat.mc.vanderbilt.edu/rms} and
   \url{http://biostat.mc.vanderbilt.edu/s/Design.html}.  See also
   the Statistics in Medicine articles by Harrell \emph{et al} listed
   below for case studies of modeling and model validation using Design.
   Also see the free book by Alzola and Harrell at
   \url{http://biostat.mc.vanderbilt.edu}.

Several datasets useful for multivariable modeling with
Design are found at
\url{http://biostat.mc.vanderbilt.edu/s/data}.
}

\section{Published Applications of Design and Regression Splines}{
  \itemize{
	\item Spline fits
	\enumerate{
	  \item Spanos A, Harrell FE, Durack DT (1989): Differential
	  diagnosis of acute meningitis: An analysis of the
	  predictive value of initial observations.  \emph{JAMA}
	  262:2700-2707.

	  \item Ohman EM, Armstrong PW, Christenson RH, \emph{et al}. (1996):
	  Cardiac troponin T levels for risk stratification in
	  acute myocardial ischemia.  \emph{New Eng J Med} 335:1333-1341.
  }

  \item Bootstrap calibration curve for a parametric survival
  model:
  \enumerate{
	\item Knaus WA, Harrell FE, Fisher CJ, Wagner DP, \emph{et al}.
	(1993):  The clinical evaluation of new drugs for
	sepsis: A prospective study design based on survival
	analysis.  \emph{JAMA} 270:1233-1241.
}

\item Splines, interactions with splines, algebraic form of
  fitted model from \code{latex.Design}
  \enumerate{
	\item Knaus WA, Harrell FE, Lynn J, et al. (1995): The
	SUPPORT prognostic model: Objective estimates of
	survival for seriously ill hospitalized adults.  \emph{Annals
	of Internal Medicine} 122:191-203.
}

\item Splines, odds ratio chart from fitted model with
  nonlinear and interaction terms, use of \code{transcan} for
  imputation
  \enumerate{
\item Lee KL, Woodlief LH, Topol EJ, Weaver WD, Betriu A.
Col J, Simoons M, Aylward P, Van de Werf F, Califf RM.
Predictors of 30-day mortality in the era of
reperfusion for acute myocardial infarction: results
from an international trial of 41,021 patients.
\emph{Circulation} 1995;91:1659-1668.
}

\item Splines, external validation of logistic models,
prediction rules using point tables
\enumerate{
\item Steyerberg EW, Hargrove YV, \emph{et al} (2001): Residual mass
histology in testicular cancer: development and
validation of a clinical prediction rule.  \emph{Stat in Med}
2001;20:3847-3859.
\item van Gorp MJ, Steyerberg EW, \emph{et al} (2003): Clinical
prediction rule for 30-day mortality in Bjork-Shiley convexo-concave
valve replacement.  \emph{J Clinical Epidemiology} 2003;56:1006-1012.
}

\item Model fitting, bootstrap validation, missing value
imputation
\enumerate{
\item Krijnen P, van Jaarsveld BC, Steyerberg EW, Man in 't
Veld AJ, Schalekamp MADH, Habbema JDF (1998): A
clinical prediction  rule for renal artery stenosis.
\emph{Annals of Internal Medicine} 129:705-711.
}

\item Model fitting, splines, bootstrap validation,
nomograms
\enumerate{
  \item Kattan MW, Eastham JA, Stapleton AMF, Wheeler TM,
  Scardino PT.  A preoperative nomogram for disease
  recurrence following radical prostatectomy for
  prostate cancer.  \emph{J Natl Ca Inst} 1998;
  90(10):766-771.
 
  \item Kattan, MW, Wheeler TM, Scardino PT.  A
  postoperative nomogram for disease recurrence
  following radical prostatectomy for prostate
  cancer. \emph{J Clin Oncol} 1999; 17(5):1499-1507

  \item Kattan MW, Zelefsky MJ, Kupelian PA, Scardino PT, 
  Fuks Z, Leibel SA.  A pretreatment nomogram for
  predicting the outcome of three-dimensional
  conformal radiotherapy in prostate cancer.  
  \emph{J Clin Oncol} 2000; 18(19):3252-3259.
 
  \item Eastham JA, May R, Robertson JL, Sartor O, Kattan
  MW.  Development of a nomogram which predicts the
  probability of a positive prostate biopsy in men
  with an abnormal digital rectal examination and a
  prostate specific antigen between 0 and 4
  ng/ml. \emph{Urology}. (In press).
  
  \item Kattan MW, Heller G, Brennan MF.  A competing-risk
  nomogram for sarcoma-specific death following local recurrence.
  \emph{Stat in Med} 2003; 22:3515-3525.
}

\item Nomogram with 2- and 5-year survival probability and median survival
time (but watch out for the use of univariable screening)
\enumerate{
\item Clark TG, Stewart ME, Altman DG, Smyth JF.  A prognostic
model for ovarian cancer.  \emph{Br J Cancer} 2001; 85:944-52.
}

\item Comprehensive example of parametric survival modeling
with an extensive nomogram, time ratio chart, anova
chart, survival curves generated using survplot,
bootstrap calibration curve
\enumerate{
\item Teno JM, Harrell FE, Knaus WA, et al.  Prediction of
survival for older hospitalized patients: The HELP
survival model.  \emph{J Am Geriatrics Soc} 2000;
48: S16-S24.
}

\item Model fitting, imputation, and several nomograms
expressed in tabular form
\enumerate{
\item Hasdai D, Holmes DR, et al.  Cardiogenic shock complicating
acute myocardial infarction: Predictors of death.
\emph{Am Heart J} 1999; 138:21-31.
}

\item Ordinal logistic model with bootstrap calibration plot
\enumerate{
  \item Wu AW, Yasui U, Alzola CF \emph{et al}.  Predicting functional
  status outcomes in hospitalized patients aged 80 years and
  older.  \emph{J Am Geriatric Society} 2000; 48:S6-S15.
}

\item Propensity modeling in evaluating medical diagnosis, anova
dot chart
\enumerate{
  \item Weiss JP, Gruver C, et al.  Ordering an echocardiogram 
  for evaluation of left ventricular function: Level
  of expertise necessary for efficient use. \emph{J Am Soc 
  Echocardiography} 2000; 13:124-130.
}

\item Simulations using Design to study the properties
of various modeling strategies
\enumerate{
  \item Steyerberg EW, Eijkemans MJC, Habbema JDF.  Stepwise selection
  in small data sets: A simulation study of bias in logistic
  regression analysis.  \emph{J Clin Epi} 1999; 52:935-942.

  \item Steyerberg EW, Eijkemans MJC, Harrell FE, Habbema JDF.
  Prognostic modeling with logistic regression analysis: In
  search of a sensible strategy in small data sets.  \emph{Med
  Decision Making} 2001; 21:45-56.
}

\item Statistical methods and
references related to Design, along with case studies
which include the Design code which produced the
analyses
\enumerate{
  \item Harrell FE, Lee KL, Mark DB (1996): Multivariable
  prognostic models: Issues in developing models,
  evaluating assumptions and adequacy, and measuring and
  reducing errors.  \emph{Stat in Med} 15:361-387.

  \item Harrell FE, Margolis PA, Gove S, Mason KE, Mulholland
  EK et al. (1998): Development of a clinical prediction
  model for an ordinal outcome: The World Health
  Organization ARI Multicentre Study of clinical signs
  and etiologic agents of pneumonia, sepsis, and
  meningitis in young infants. \emph{Stat in Med} 17:909-944.

  \item Bender R, Benner, A (2000): Calculating ordinal regression
  models in SAS and S-Plus.  \emph{Biometrical J} 42:677-699.
}
}}

\section{Bug Reports}{
The author is willing to help with problems.  Send
E-mail to \email{f.harrell@vanderbilt.edu}.  To report bugs,
please do the following:

\enumerate{
\item If the bug occurs when running a function on a fit
   object (e.g., \code{anova}), attach a \code{dump}'d text
   version of the fit object to your note.  If you
   used \code{datadist} but not until after the fit was
   created, also send the object created by
   \code{datadist}.  Example: \code{dump("myfit","/tmp/dumpdata")} will create
   a text file called \code{"dumpdata"} that can be
   attached to the E-mail.  
\item If the bug occurs during a model fit (e.g., with
   \code{lrm, ols, psm, cph}), send the statement causing
   the error with a \code{dump}'d version of the data
   frame used in the fit.  If this data frame is very
   large, reduce it to a small subset which still
   causes the error.
 }
 }

\section{Copyright Notice}{
GENERAL DISCLAIMER  This program is free software;
you can redistribute it and/or modify it under the
terms of the GNU General Public License as
published by the Free Software Foundation; either
version 2, or (at your option) any later version.

This program is
distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied
warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU General Public
License for more details.   In short: you may
use this code any way you like, as long as you don't charge
money for it, remove this notice, or hold anyone
liable for its results.  Also, please acknowledge
the source and communicate changes to the author.

If this software is used in work presented for
publication, kindly reference it using for example:
Harrell FE (2003): Design: S functions for
biostatistical/epidemiologic modeling, testing,
estimation, validation, graphics, and prediction.
Programs available from
\url{http://biostat.mc.vanderbilt.edu/s/Design.html}.
Be sure to reference other libraries used as well as S-Plus
or \R itself.
}

\section{Acknowledgements}{This work was supported by grants
  from the Agency for Health Care Policy and Research
  (US Public Health Service) and the Robert Wood
  Johnson Foundation.
  }

\author{
Frank E Harrell Jr\cr
Professor of Biostatistics\cr
Chair, Department of Biostatistics\cr
Vanderbilt University School of Medicine\cr
Nashville, Tennessee\cr
\email{f.harrell@vanderbilt.edu}
}
\keyword{models}
\concept{overview}

\eof
\name{anova.Design}
\alias{anova.Design}
\alias{print.anova.Design}
\alias{text.anova.Design}
\alias{plot.anova.Design}
\alias{latex.anova.Design}
\title{
Analysis of Variance (Wald and F Statistics)
}
\description{
The \code{anova} function automatically tests most meaningful hypotheses in
a design. For example, suppose that age and cholesterol are
predictors, and that a general interaction is modeled using a
restricted spline surface. \code{anova} prints Wald statistics (\eqn{F}
statistics for an \code{ols} fit) for testing
linearity of age, linearity of cholesterol, age effect (age + age by
cholesterol interaction), cholesterol effect (cholesterol + age by
cholesterol interaction), linearity of the age by cholesterol
interaction (i.e., adequacy of the simple age * cholesterol 1
d.f. product), linearity of the interaction in age alone, and
linearity of the interaction in cholesterol alone. Joint tests of all
interaction terms in the model and all nonlinear terms in the model
are also performed.  For any multiple d.f. effects for continuous
variables that were not modeled through \code{rcs}, \code{pol},
\code{lsp}, etc., 
tests of linearity will be omitted.  This applies to matrix predictors
produced by e.g.  \code{poly} or \code{ns}.  \code{print.anova.Design} is the
printing method.  \code{text.anova.Design} is the \code{text} method for
inserting anova tables on graphs.  \code{plot.anova.Design} draws dot
charts depicting the importance of variables in the model, as measured
by Wald \eqn{\chi^2}{chi-square}, \eqn{\chi^2}{chi-square} minus d.f., AIC, \eqn{P}-values, partial \eqn{R^2},
\eqn{R^2} for the whole model after deleting the effects in question, or
proportion of overall model \eqn{R^2} that is due to each predictor. 
\code{latex.anova.Design} is the \code{latex} method.  It substitutes
Greek/math symbols in column headings, uses boldface for \code{TOTAL}
lines, and constructs a caption.  Then it passes the result to
\code{latex.default} for conversion to LaTeX.
}
\usage{
\method{anova}{Design}(object, \ldots, main.effect=FALSE, tol=1e-9, 
      test=c('F','Chisq'), ss=TRUE)

\method{print}{anova.Design}(x, which=c('none','subscripts','names','dots'), \dots)

\method{plot}{anova.Design}(x, 
     what=c("chisqminusdf","chisq","aic","P","partial R2","remaining R2",
            "proportion R2"), 
     xlab=NULL, pch=16, 
     rm.totals=TRUE, rm.ia=FALSE, rm.other=NULL, newnames,
     sort=c("descending","ascending","none"), pl=TRUE, \dots)

\method{text}{anova.Design}(x, at, cex=.5, font=2, \dots)

\method{latex}{anova.Design}(object, title, psmall=TRUE, 
      dec.chisq=2, dec.F=2, dec.ss=NA, dec.ms=NA, dec.P=4, \dots)
}
\arguments{
\item{object}{
a \code{Design} fit object.  \code{object} must
allow \code{Varcov} to return the variance-covariance matrix.  For
\code{latex}, is the result of \code{anova}.
}
\item{\dots}{
If omitted, all variables are tested, yielding tests for individual factors
and for pooled effects. Specify a subset of the variables to obtain tests
for only those factors, with a pooled Wald test for the combined effects
of all factors listed. Names may be abbreviated.  For example, specify
\code{anova(fit,age,cholesterol)} to get a Wald statistic for testing the joint
importance of age, cholesterol, and any factor interacting with them.

Can be optional graphical parameters to send to \code{text} or
\code{dotchart2}, or other parameters to send to \code{latex.default}.
Ignored for \code{print}.
}
\item{main.effect}{
Set to \code{TRUE} to print the (usually meaningless) main effect tests even when
the factor is involved in an interaction. The default is \code{FALSE}, to print only
the effect of the main effect combined with all interactions involving that
factor.
}
\item{tol}{
singularity criterion for use in matrix inversion
}
\item{test}{
For an \code{ols} fit, set \code{test="Chisq"} to use Wald \eqn{\chi^2}{chi-square} tests rather than \eqn{F}-tests.
}
\item{ss}{
For an \code{ols} fit, set \code{ss=FALSE} to suppress printing partial sums of squares, mean
squares, and the Error SS and MS.
}
\item{x}{for \code{print,plot,text} is the result of \code{anova}.
}
\item{which}{
If \code{which} is not \code{"none"} (the default), \code{print.anova.Design} will
add to the rightmost column of the output the list of parameters being
tested by the hypothesis being tested in the current row.  Specifying
\code{which="subscripts"} causes the subscripts of the regression
coefficients being tested to be printed (with a subscript of one for
the first non-intercept term).  \code{which="names"} prints the names of
the terms being tested, and \code{which="dots"} prints dots for terms being
tested and blanks for those just being adjusted for.
}
\item{at}{
for \code{text} is a list containing the x- and y-coordinates for the
upper left corner of the anova table to be drawn on an existing plot,
e.g. \code{at=locator(1)}
  }
\item{cex}{
character expansion size for \code{text.anova.Design}
}
\item{font}{
font for \code{text.anova.Design}.  Default is 2 (usually Courier).
}
\item{what}{
what type of statistic to plot.  The default is the Wald
\eqn{\chi^2}{chi-square} 
statistic for each factor (adding in the effect of higher-ordered
factors containing that factor) minus its degrees of freedom.  The
last three choices for \code{what} only apply to \code{ols} models.
}
\item{xlab}{
x-axis label, default is constructed according to \code{what}.
\code{plotmath} symbols are used for \R, by default.
}
\item{pch}{
character for plotting dots in dot charts.  Default is 16 (solid dot).
}
\item{rm.totals}{
set to \code{FALSE} to keep total \eqn{\chi^2}{chi-square}s (overall, nonlinear, interaction totals)
in the chart.
}
\item{rm.ia}{
set to \code{TRUE} to omit any effect that has \code{"*"} in its name
}
\item{rm.other}{
a list of other predictor names to omit from the chart
}
\item{newnames}{
a list of substitute predictor names to use, after omitting any.
}
\item{sort}{
default is to sort bars in descending order of the summary statistic
}
\item{pl}{
set to \code{FALSE} to suppress plotting.  This is useful when you only wish to
analyze the vector of statistics returned.
}
\item{title}{
title to pass to \code{latex}, default is name of fit object passed to \code{anova}
prefixed with \code{"anova."}.  For Windows, the default is \code{"ano"} followed
by the first 5 letters of the name of the fit object.
}
\item{psmall}{
The default is \code{psmall=TRUE}, which causes \code{P<0.00005} to print as \code{<0.0001}.
Set to \code{FALSE} to print as \code{0.0000}.
}
\item{dec.chisq}{
number of places to the right of the decimal place for typesetting
\eqn{\chi^2}{chi-square} values (default is \code{2}).  Use zero for integer, \code{NA} for
floating point.
}
\item{dec.F}{
digits to the right for \eqn{F} statistics (default is \code{2})
}
\item{dec.ss}{
digits to the right for sums of squares (default is \code{NA}, indicating
floating point)
}
\item{dec.ms}{
digits to the right for mean squares (default is \code{NA})
}
\item{dec.P}{digits to the right for \eqn{P}-values}
}
\value{
\code{anova.Design} returns a matrix of class \code{anova.Design} containing factors 
as rows and \eqn{\chi^2}{chi-square}, d.f., and \eqn{P}-values as
columns (or d.f., partial \eqn{SS, MS, F, P}).
\code{plot.anova.Design} invisibly returns the vector of quantities
plotted.  This vector has a names attribute describing the terms for
which the statistics in the vector are calculated.
}
\details{
If the statistics being plotted with \code{plot.anova.Design} are few in
number and one of them is negative or zero, \code{plot.anova.Design}
will quit because of an error in \code{dotchart2}.
}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\section{Side Effects}{
\code{print} prints, \code{text} uses \code{tempfile} to get a temporary Unix file name,
\code{sink}, and \code{unix} (to remove the temporary file).  \code{latex} creates a
file with a name of the form \code{"title.tex"} (see the \code{title} argument above).
}
\seealso{
\code{\link{Design}}, \code{\link{Design.Misc}}, \code{\link{lrtest}}, \code{\link{Design.trans}}, \code{\link{summary.Design}}, \code{\link[Hmisc]{solvet}}, 
\code{\link{text}}, \code{\link{locator}}, \code{\link[Hmisc]{dotchart2}}, \code{\link[Hmisc]{latex}}, 
\code{\link[Hmisc]{Dotplot}}, \code{\link{anova.lm}}, \code{\link{contrast.Design}}
}
\examples{
n <- 1000    # define sample size
set.seed(17) # so can reproduce the results
treat <- factor(sample(c('a','b','c'), n,TRUE))
num.diseases <- sample(0:4, n,TRUE)
age <- rnorm(n, 50, 10)
cholesterol <- rnorm(n, 200, 25)
weight <- rnorm(n, 150, 20)
sex <- factor(sample(c('female','male'), n,TRUE))
label(age) <- 'Age'      # label is in Hmisc
label(num.diseases) <- 'Number of Comorbid Diseases'
label(cholesterol) <- 'Total Cholesterol'
label(weight) <- 'Weight, lbs.'
label(sex) <- 'Sex'
units(cholesterol) <- 'mg/dl'   # uses units.default in Hmisc


# Specify population model for log odds that Y=1
L <- .1*(num.diseases-2) + .045*(age-50) +
     (log(cholesterol - 10)-5.2)*(-2*(treat=='a') +
     3.5*(treat=='b')+2*(treat=='c'))
# Simulate binary y to have Prob(y=1) = 1/[1+exp(-L)]
y <- ifelse(runif(n) < plogis(L), 1, 0)


fit <- lrm(y ~ treat + scored(num.diseases) + rcs(age) +
               log(cholesterol+10) + treat:log(cholesterol+10))
anova(fit)                       # Test all factors
anova(fit, treat, cholesterol)   # Test these 2 by themselves
                                 # to get their pooled effects
g <- lrm(y ~ treat*rcs(age))
dd <- datadist(treat, num.diseases, age, cholesterol)
options(datadist='dd')
plot(g, age=NA, treat="b")
s <- anova(g)
print(s)
#p <- locator(1)                  # click mouse at upper left corner of table
p <- list(x=32,y=2.1)
text(s, at=p)                    # add anova table to regression plot
plot(s)                          # new plot - dot chart of chisq-d.f.
# latex(s)                       # nice printout - creates anova.g.tex
options(datadist=NULL)



# Simulate data from a given model, and display exactly which
# hypotheses are being tested


set.seed(123)
age <- rnorm(500, 50, 15)
treat <- factor(sample(c('a','b','c'), 500,TRUE))
bp  <- rnorm(500, 120, 10)
y   <- ifelse(treat=='a', (age-50)*.05, abs(age-50)*.08) + 3*(treat=='c') +
       pmax(bp, 100)*.09 + rnorm(500)
f   <- ols(y ~ treat*lsp(age,50) + rcs(bp,4))
print(names(coef(f)), quote=FALSE)
specs(f)
anova(f)
an <- anova(f)
options(digits=3)
print(an, 'subscripts')
print(an, 'dots')


an <- anova(f, test='Chisq', ss=FALSE)
plot(0:1)                        # make some plot
text(an, at=list(x=1.5,y=.6))    # add anova table to plot
plot(an)                         # new plot - dot chart of chisq-d.f.
# latex(an)                      # nice printout - creates anova.f.tex


# Suppose that a researcher wants to make a big deal about a variable 
# because it has the highest adjusted chi-square.  We use the
# bootstrap to derive 0.95 confidence intervals for the ranks of all
# the effects in the model.  We use the plot method for anova, with
# pl=FALSE to suppress actual plotting of chi-square - d.f. for each
# bootstrap repetition.  We rank the negative of the adjusted
# chi-squares so that a rank of 1 is assigned to the highest.
# It is important to tell plot.anova.Design not to sort the results,
# or every bootstrap replication would have ranks of 1,2,3 for the stats.


mydata <- data.frame(x1=runif(200), x2=runif(200),
                     sex=factor(sample(c('female','male'),200,TRUE)))
set.seed(9)  # so can reproduce example
mydata$y <- ifelse(runif(200)<=plogis(mydata$x1-.5 + .5*(mydata$x2-.5) + 
                   .5*(mydata$sex=='male')),1,0)


if(.R.) {
library(boot)
b <- boot(mydata, function(data, i, ...) rank(-plot(anova(
                lrm(y ~ rcs(x1,4)+pol(x2,2)+sex,data,subset=i)), 
                sort='none', pl=FALSE)),
                R=25)  # should really do R=500 but will take a while
Rank <- b$t0
lim <- t(apply(b$t, 2, quantile, probs=c(.025,.975)))
} else {
b <- bootstrap(mydata, rank(-plot(anova(
                lrm(y ~ rcs(x1,4)+pol(x2,2)+sex,mydata)), sort='none', pl=FALSE)),
               B=25)  # should really do B=500 but will take a while
Rank <- b$observed
lim <- limits.emp(b)[,c(1,4)]  # get 0.025 and 0.975 quantiles
}


# Use the Hmisc Dotplot function to display ranks and their confidence
# intervals.  Sort the categories by descending adj. chi-square, for ranks
original.chisq <- plot(anova(lrm(y ~ rcs(x1,4)+pol(x2,2)+sex,data=mydata)),
                       sort='none', pl=FALSE)
predictor <- as.factor(names(original.chisq))
predictor <- reorder.factor(predictor, -original.chisq)

Dotplot(predictor ~ Cbind(Rank, lim), pch=3, xlab='Rank', 
		main=if(.R.) expression(paste(
'Ranks and 0.95 Confidence Limits for ',chi^2,' - d.f.')) else
'Ranks and 0.95 Confidence Limits for Chi-square - d.f.')
}
\keyword{models}
\keyword{regression}
\keyword{htest}
\keyword{aplot}
\concept{bootstrap}



\eof
\name{bj}
\alias{bj}
\alias{bj.fit}
\alias{residuals.bj}
\alias{print.bj}
\alias{validate.bj}
\alias{bjplot}
\title{
Buckley-James Multiple Regression Model
}
\description{
\code{bj} fits the Buckley-James distribution-free least squares multiple
regression model to a possibly right-censored response variable.  
This model reduces to ordinary least squares if
there is no censoring.  By default, model fitting is done after
taking logs of the response variable.
\code{bj} uses the \code{Design} class
for automatic \code{anova}, \code{fastbw}, \code{validate}, \code{Function}, \code{nomogram},
\code{summary}, \code{plot}, \code{bootcov}, and other functions.  The \code{bootcov}
function may be worth using with \code{bj} fits, as the properties of the
Buckley-James covariance matrix estimator are not fully known for
strange censoring patterns.

The \code{residuals.bj} function exists mainly to compute 
residuals and to censor them (i.e., return them as
\code{Surv} objects) just as the original
failure time variable was censored.  These residuals are useful for
checking to see if the model also satisfies certain distributional assumptions.
To get these residuals, the fit must have specified \code{y=TRUE}.

The \code{bjplot} function is a special plotting function for objects
created by \code{bj} with \code{x=TRUE, y=TRUE} in effect.  It produces three
scatterplots for every covariate in the model: the first plots the
original situation, where censored data are distinguished from
non-censored data by a different plotting symbol. In the second plot,
called a renovated plot, vertical lines show how censored data were
changed by the procedure, and the third is equal to the second, but
without vertical lines.  Imputed data are again distinguished from the
non-censored by a different symbol.

The \code{validate} method for \code{bj} validates the Somers' \code{Dxy} rank
correlation between predicted and observed responses, accounting for censoring.

The primary fitting function for \code{bj} is \code{bj.fit}, which does not
allow missing data and expects a full design matrix as input.
}
\usage{
bj(formula=formula(data), data, subset, na.action=na.delete,
   link="log", control, method='fit', x=FALSE, y=FALSE, 
   time.inc)

\method{print}{bj}(x, digits=4, long=FALSE, \dots)

\method{predict}{bj}{\dots}

\method{residuals}{bj}(object, type=c("censored","censored.normalized"),\dots)

bjplot(fit, which=1:dim(X)[[2]])

\method{validate}{bj}(fit, method="boot", B=40,
         bw=FALSE,rule="aic",type="residual",sls=.05,aics=0,pr=FALSE,
		 dxy=TRUE, tol=1e-7, rel.tolerance=1e-3, maxiter=15, \dots)

bj.fit(x, y, control)
}
\arguments{
\item{formula}{
an S statistical model formula. Interactions up to third order are
supported. The left hand side must be a \code{Surv} object.
}
\item{data}{}
\item{subset}{}
\item{na.action}{the usual statistical model fitting arguments}
\item{fit}{
a fit created by \code{bj}, required for all functions except \code{bj}.
}
\item{x}{
a design matrix with or without a first column of ones, to pass
to \code{bj.fit}.  All models will have an intercept.  For
\code{print.bj} is a result of \code{bj}.  For \code{bj}, set
\code{x=TRUE} to include the design matrix in the fit object. 
}
\item{y}{
a \code{Surv} object to pass to \code{bj.fit} as the two-column response
variable.  Only right censoring is allowed, and there need not be any
censoring.  For \code{bj}, set \code{y} to \code{TRUE} to include the
two-column response matrix, with the 
event/censoring indicator in the second column.  The first column will
be transformed according to \code{link}, and depending on
\code{na.action}, rows with missing data in the predictors or the
response will be deleted.
}
\item{link}{
set to, for example, \code{"log"} (the default) to model the log of the
response, or \code{"identity"} to model the untransformed response.
}
\item{control}{
a list containing any or all of the following components: \code{iter.max}
(maximum number of iterations allowed, default is 20),
\code{eps} (convergence criterion: convergence is assumed when the ratio of
sum of squared errors from one iteration to the next is between
1-\code{eps} and 1+\code{eps}), \code{trace} (set to \code{TRUE} to monitor iterations), 
\code{tol} (matrix singularity criterion, default is 1e-7), and \code{max.cycle} 
(in case of nonconvergence the program looks for a cycle that repeats itself, 
default is 30).  
}
\item{method}{
set to \code{"model.frame"} or \code{"model.matrix"} to return one of those
objects rather than the model fit.
}
\item{dxy}{set to \code{FALSE} to prevent Somers' \eqn{D_{xy}} from
  being computed by \code{validate} (saves time for very large datasets)}
\item{time.inc}{
setting for default time spacing.
Default is 30 if time variable has \code{units="Day"}, 1 otherwise, unless
maximum follow-up time \eqn{< 1}. Then max time/10 is used as \code{time.inc}.
If \code{time.inc} is not given and max time/default \code{time.inc} is
\eqn{> 25}, \code{time.inc} is increased.
}
\item{digits}{
number of significant digits to print if not 4.
}
\item{long}{
set to \code{TRUE} to print the correlation matrix for parameter estimates
}
\item{object}{the result of \code{bj}}
\item{type}{
type of residual desired.  Default is censored unnormalized residuals,
defined as link(Y) - linear.predictors, where the
link function was usually the log function.  You can specify
\code{type="censored.normalized"} to divide the residuals by the estimate
of \code{sigma}.
}
\item{which}{
vector of integers or character strings naming elements of the design
matrix (the names of the original predictors if they entered the model
linearly).  \code{bjplot} makes plots of only the variables listed in
\code{which} (names or numbers).
}
\item{B}{}
\item{bw}{}
\item{rule}{}
\item{sls}{}
\item{aics}{}
\item{pr}{}
\item{tol}{}
\item{rel.tolerance}{}
\item{maxiter}{see \code{\link{predab.resample}}}
\item{\dots}{ignored for \code{print}; passed through to
  \code{predab.resample} for \code{validate}}
}
\value{
\code{bj} returns a fit object with similar information to what \code{survreg},
\code{psm}, \code{cph} would store as 
well as what \code{Design} stores and \code{units} and \code{time.inc}.
\code{residuals.bj} returns a \code{Surv} object.  One of the components of the
\code{fit} object produced by \code{bj} (and \code{bj.fit}) is a vector called
\code{stats} which contains the following named elements: 
\code{"Obs", "Events", "d.f.","error d.f.","sigma"}.  Here \code{sigma} is the
estimate of the residual standard deviation.
}
\details{
The program implements the algorithm as described in the original
article by Buckley & James. Also, we have used the original Buckley &
James prescription for computing variance/covariance estimator.  This
is based on non-censored observations only and does not have any
theoretical justification, but has been shown in simulation studies to
behave well. Our experience confirms this view.  Convergence is rather
slow with this method, so you may want to increase the number of
iterations.  Our experience shows that often, in particular with high
censoring, 100 iterations is not too many. Sometimes the method will not converge,
but will instead enter a loop of repeating values (this is due to the discrete nature 
of the Kaplan-Meier estimator and usually happens with small sample sizes).
The program will look for such a loop and return the average betas. It will also 
issue a warning message and give the size of the cycle (usually less than 6).
}
\author{
Janez Stare\cr
Department of Biomedical Informatics\cr
Ljubljana University\cr
Ljubljana, Slovenia\cr
janez.stare@mf.uni-lj.si


Harald Heinzl\cr
Department of Medical Computer Sciences\cr
Vienna University\cr
Vienna, Austria\cr
harald.heinzl@akh-wien.ac.at


Frank Harrell\cr
Department of Biostatistics\cr
Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\references{
Buckley JJ, James IR. Linear regression with censored data. Biometrika 1979; 
66:429--36.


Miller RG, Halpern J. Regression with censored data. Biometrika 1982; 69: 
521--31.


James IR, Smith PJ. Consistency results for linear regression with censored 
data. Ann Statist 1984; 12: 590--600.


Lai TL, Ying Z. Large sample theory of a modified Buckley-James estimator for 
regression analysis 
with censored data. Ann Statist 1991; 19: 1370--402.


Hillis SL. Residual plots for the censored data linear regression model.  Stat in Med 1995; 14: 2023--2036.
}
\seealso{
\code{\link{Design}}, \code{\link{psm}}, \code{\link{survreg}}, \code{\link{cph}}, \code{\link[survival]{Surv}},
\code{\link[Hmisc]{na.delete}}, \code{\link[Hmisc]{na.detail.response}}, \code{\link{datadist}}, \code{\link[Hmisc]{rcorr.cens}}.
}
\examples{
set.seed(1)
ftime  <- 10*rexp(200)
stroke <- ifelse(ftime > 10, 0, 1)
ftime  <- pmin(ftime, 10)
units(ftime) <- "Month"
age <- rnorm(200, 70, 10)
hospital <- factor(sample(c('a','b'),200,TRUE))
dd <- datadist(age, hospital)
options(datadist="dd")


f <- bj(Surv(ftime, stroke) ~ rcs(age,5) + hospital, x=TRUE, y=TRUE)
# add link="identity" to use a censored normal regression model instead
# of a lognormal one
anova(f)
fastbw(f)
validate(f, B=15)
plot(f, age=NA, hospital=NA)  # needs datadist since no explicit age,hosp.
coef(f)               # look at regression coefficients
coef(psm(Surv(ftime, stroke) ~ rcs(age,5) + hospital, dist='lognormal'))
                      # compare with coefficients from likelihood-based
                      # log-normal regression model
                      # use dist='gau' not under R 


r <- resid(f, 'censored.normalized')
survplot(survfit(r), conf='none') 
                      # plot Kaplan-Meier estimate of 
                      # survival function of standardized residuals
survplot(survfit(r ~ cut2(age, g=2)), conf='none')  
                      # may desire both strata to be n(0,1)
options(datadist=NULL)
}
\keyword{models}
\keyword{survival}
% Converted by Sd2Rd version 1.21.







\eof
\name{bootcov}
\alias{bootcov}
\alias{bootplot}
\alias{bootplot.bootcov}
\alias{confplot}
\alias{confplot.bootcov}
\alias{histdensity}
\title{
Bootstrap Covariance and Distribution for Regression Coefficients
}
\description{
\code{bootcov} computes a bootstrap estimate of the covariance matrix for a set
of regression coefficients from \code{ols}, \code{lrm}, \code{cph}, \code{psm} and any
other fit where \code{x=TRUE, y=TRUE} was used to store the data used in making
the original regression fit and where an appropriate \code{fitter} function
is provided here.  The estimates obtained are not conditional on
the design matrix, but are instead unconditional estimates.  For
small sample sizes, this will make a difference as the unconditional
variance estimates are larger.  This function will also obtain
bootstrap estimates corrected for cluster sampling (intra-cluster
correlations) when a "working independence" model was used to fit
data which were correlated within clusters.  This is done by substituting
cluster sampling with replacement for the usual simple sampling with
replacement.  \code{bootcov} has an option (\code{coef.reps}) that causes all
of the regression coefficient estimates from all of the bootstrap
re-samples to be saved, facilitating computation of nonparametric
bootstrap confidence limits and plotting of the distributions of the
coefficient estimates (using histograms and kernel smoothing estimates).


The \code{loglik} option facilitates the calculation of simultaneous
confidence regions from quantities of interest that are functions of
the regression coefficients, using the method of Tibshirani (1996).
With Tibshirani's method, one computes the objective criterion (-2 log
likelihood evaluated at the bootstrap estimate of \eqn{\beta}{beta} but with
respect to the original design matrix and response vector) for the
original fit as well as for all of the bootstrap fits.  The confidence
set of the regression coefficients is the set of all coefficients that
are associated with objective function values that are less than or
equal to say the 0.95 quantile of the vector of \code{B + 1} objective
function values.  For the coefficients satisfying this condition,
predicted values are computed at a user-specified design matrix \code{X},
and minima and maxima of these predicted values (over the qualifying
bootstrap repetitions) are computed to derive the final simultaneous
confidence band.


The \code{bootplot} function takes the output of \code{bootcov} and 
either plots a histogram and kernel density
estimate of specified regression coefficients (or linear combinations
of them through the use of a specified design matrix \code{X}), or a
\code{qqnorm} plot of the quantities of interest to check for normality of
the maximum likelihood estimates.  \code{bootplot} draws vertical lines at
specified quantiles of the bootstrap distribution, and returns these
quantiles for possible printing by the user.  Bootstrap estimates may
optionally be transformed by a user-specified function \code{fun} before
plotting.


The \code{confplot} function also uses the output of \code{bootcov} but to
compute and optionally plot nonparametric bootstrap pointwise confidence
limits or (by default) Tibshirani (1996) simultaneous confidence sets.
A design matrix must be specified to allow \code{confplot} to compute
quantities of interest such as predicted values across a range
of values or differences in predicted values (plots of effects of
changing one or more predictor variable values).


\code{bootplot} and \code{confplot} are actually generic functions, with
the particular functions \code{bootplot.bootcov} and \code{confplot.bootcov}
automatically invoked for \code{bootcov} objects.


A service function called \code{histdensity} is also provided (for use with
\code{bootplot}).  It runs \code{hist} and \code{density} on the same plot, using
twice the default number of classes for \code{hist}, and 1.5 times the
default \code{width} used by \code{density}.


A comprehensive example demonstrates the use of all of the functions.
}
\usage{
bootcov(fit, cluster, B=200, fitter, 
        coef.reps=FALSE, loglik=coef.reps,
        pr=FALSE, maxit=15, group)


bootplot(obj, which, X,
         conf.int=c(.9,.95,.99),
         what=c('density','qqnorm'),
         fun=function(x)x, labels., \dots)


confplot(obj, X, against, 
         method=c('simultaneous','pointwise'),
         conf.int=0.95, fun=function(x)x,
         add=FALSE, lty.conf=2, \dots)


histdensity(y, xlab, nclass, width, mult.width=1, \dots)
}
\arguments{
\item{fit}{
a fit object containing components \code{x} and \code{y}.  For fits from
\code{cph}, the \code{"strata"} attribute of the \code{x} component is used to
obtain the vector of stratum codes.
}
\item{obj}{
an object created by \code{bootcov} with \code{coef.reps=TRUE}.
}
\item{X}{
a design matrix specified to \code{confplot}.  See \code{predict.Design} or
\code{contrast.Design}.  For \code{bootplot}, \code{X} is optional.
}
\item{y}{
a vector to pass to \code{histdensity}.  \code{NA}s are ignored.
}
\item{cluster}{
a variable indicating groupings. \code{cluster} may be any type of vector
(factor, character, integer).
Unique values of \code{cluster} indicate
possibly correlated groupings of observations. Note the data used in
the fit and stored in \code{fit$x} and \code{fit$y} may have had observations
containing missing values deleted.  It is assumed that if there were
any NAs, an \code{naresid} function exists for the class of \code{fit}. This
function restores NAs so that the rows of the design matrix
coincide with \code{cluster}.
}
\item{B}{
number of bootstrap repetitions.  Default is 200.
}
\item{fitter}{
the name of a function with arguments \code{(x,y)} that will fit bootstrap
samples.  Default is taken from the class of \code{fit} if it is
\code{ols}, \code{lrm}, \code{cph}, \code{psm}.
}
\item{coef.reps}{
set to \code{TRUE} if you want to store a matrix of all bootstrap regression
coefficient estimates in the returned component \code{boot.Coef}.
}
\item{loglik}{
set to \code{TRUE} to store -2 log likelihoods for each bootstrap model, evaluated
against the original \code{x} and \code{y} data.  The default is to do this when
\code{coef.reps} is specified as \code{TRUE}.  The use of \code{loglik=TRUE} assumes that
an \code{oos.loglik} method exists for the type of model being analyzed,
to calculate out-of-sample -2 log likelihoods (see \code{Design.Misc}).
After the \code{B} -2 log likelihoods (stored in the element named
\code{boot.loglik} in the returned fit object), the \code{B+1} element is
the -2 log likelihood for the original model fit.
}
\item{pr}{
set to \code{TRUE} to print the current sample number to monitor progress.
}
\item{maxit}{
maximum number of iterations, to pass to \code{fitter}
}
\item{group}{
a grouping variable used to stratify the sample upon bootstrapping.
This allows one to handle k-sample problems, i.e., each bootstrap
sample will be forced to select the same number of observations from
each level of group as the number appearing in the original dataset.
You may specify both \code{group} and \code{cluster}.
}
\item{which}{
one or more integers specifying which regression coefficients to
plot for \code{bootplot}
}
\item{conf.int}{
a vector (for \code{bootplot}, default is \code{c(.9,.95,.99)}) or scalar 
(for \code{confplot}, default is \code{.95}) confidence level.
}
\item{what}{
for \code{bootplot}, specifies whether a density or a q-q plot is made
}
\item{fun}{
for \code{bootplot} or \code{confplot} specifies a function used to translate
the quantities of interest before analysis.  A common choice is
\code{fun=exp} to compute anti-logs, e.g., odds ratios.
}
\item{labels.}{
a vector of labels for labeling the axes in plots produced by \code{bootplot}.
Default is row names of \code{X} if there are any, or sequential integers.
}
\item{\dots}{
For \code{bootplot} these are optional arguments passed to
\code{histdensity}.  Also may be optional arguments passed to
\code{plot} by \code{confplot} or optional arguments passed to
\code{hist} from \code{histdensity}, such as \code{xlim} and
\code{breaks}.  The argument \code{probability=TRUE} is always passed to
\code{hist}.
}
\item{against}{
For \code{confplot}, specifying \code{against} causes a plot to be made (or added to).
The \code{against} variable is associated with rows of \code{X} and is used as the
x-coordinates.
}
\item{method}{
specifies whether \code{"pointwise"} or \code{"simultaneous"} confidence regions
are derived by \code{confplot}.  The default is simultaneous.
}
\item{add}{
set to \code{TRUE} to add to an existing plot, for \code{confplot}
}
\item{lty.conf}{
line type for plotting confidence bands in \code{confplot}.  Default is
2 for dotted lines.
}
\item{xlab}{
label for x-axis for \code{histdensity}.  Default is \code{label} attribute or
argument name if there is no \code{label}.
}
\item{nclass}{
passed to \code{hist} if present
}
\item{width}{
passed to \code{density} if present
}
\item{mult.width}{
multiplier by which to adjust the default \code{width} passed to \code{density}.
Default is 1.
}
}
\value{
a new fit object with class of the original object and with the element
\code{orig.var} added. \code{orig.var} is
the covariance matrix of the original fit.  Also, the original \code{var}
component is replaced with the new bootstrap estimates.  The component
\code{boot.coef} is also added.  This contains the mean bootstrap estimates
of regression coefficients (with a log scale element added if
applicable).  \code{boot.Coef} is added if \code{coef.reps=TRUE}.  \code{boot.loglik} is
added if \code{loglik=TRUE}.


\code{bootplot} returns the quantities of interest (possibly as a matrix) and
the requested quantiles of them.  \code{confplot} returns three vectors:
\code{fitted}, \code{lower}, and \code{upper}.
}
\section{Side Effects}{
\code{bootcov} prints if \code{pr=TRUE}
}
\details{
If the fit has a scale parameter (e.g., a fit from \code{psm}), the log
of the individual bootstrap scale estimates are added to the vector
of parameter estimates and a column and row for the log scale are
added to the new covariance matrix (the old covariance matrix also
has this row and column).
}
\author{
Frank Harrell\cr
Department of Biostatistics\cr
Vanderbilt University\cr
\email{f.harrell@vanderbilt.edu}\cr

Bill Pikounis\cr
Biometrics Research Department\cr
Merck Research Laboratories\cr
\email{v\_bill\_pikounis@merck.com}
}
\references{
Feng Z, McLerran D, Grizzle J (1996): A comparison of statistical methods for
clustered data analysis with Gaussian error.  Stat in Med 15:1793--1806.

Tibshirani R, Knight K (1996): Model search and inference by bootstrap 
"bumping". Department of Statistics, University of Toronto.  Technical
report available from
\cr
\url{http://www-stat.stanford.edu/~tibs/}.
Presented at the Joint Statistical Meetings,
Chicago, August 1996.
}
\seealso{
\code{\link{robcov}}, \code{\link{sample}}, \code{\link{Design}}, \code{\link{lm.fit}}, \code{\link{lrm.fit}}, \code{\link[survival]{coxph.fit}}, \code{\link[survival]{survreg.fit}},
\code{\link{predab.resample}}, \code{\link{Design.Misc}}, \code{\link{predict.Design}}, \code{\link{gendata}},
\code{\link{contrast.Design}}
}
\examples{
set.seed(191)
x <- exp(rnorm(200))
logit <- 1 + x/2
y <- ifelse(runif(200) <= plogis(logit), 1, 0)
f <- lrm(y ~ pol(x,2), x=TRUE, y=TRUE)
g <- bootcov(f, B=50, pr=TRUE, coef.reps=TRUE)
anova(g)    # using bootstrap covariance estimates
fastbw(g)   # using bootstrap covariance estimates
beta <- g$boot.Coef[,1]
hist(beta, nclass=15)     #look at normality of parameter estimates
qqnorm(beta)
# bootplot would be better than these last two commands


# A dataset contains a variable number of observations per subject,
# and all observations are laid out in separate rows. The responses
# represent whether or not a given segment of the coronary arteries
# is occluded. Segments of arteries may not operate independently
# in the same patient.  We assume a "working independence model" to
# get estimates of the coefficients, i.e., that estimates assuming
# independence are reasonably efficient.  The job is then to get
# unbiased estimates of variances and covariances of these estimates.


set.seed(1)
n.subjects <- 30
ages <- rnorm(n.subjects, 50, 15)
sexes  <- factor(sample(c('female','male'), n.subjects, TRUE))
logit <- (ages-50)/5
prob <- plogis(logit)  # true prob not related to sex
id <- sample(1:n.subjects, 300, TRUE) # subjects sampled multiple times
table(table(id))  # frequencies of number of obs/subject
age <- ages[id]
sex <- sexes[id]
# In truth, observations within subject are independent:
y   <- ifelse(runif(300) <= prob[id], 1, 0)
f <- lrm(y ~ lsp(age,50)*sex, x=TRUE, y=TRUE)
g <- bootcov(f, id, B=50)  # usually do B=200 or more
diag(g$var)/diag(f$var)
# add ,group=w to re-sample from within each level of w
anova(g)            # cluster-adjusted Wald statistics
# fastbw(g)         # cluster-adjusted backward elimination
plot(g, age=30:70, sex='female')  # cluster-adjusted confidence bands


# Get design effects based on inflation of the variances when compared
# with bootstrap estimates which ignore clustering
g2 <- bootcov(f, B=50)
diag(g$var)/diag(g2$var)


# Get design effects based on pooled tests of factors in model
anova(g2)[,1] / anova(g)[,1]


# Simulate binary data where there is a strong 
# age x sex interaction with linear age effects 
# for both sexes, but where not knowing that
# we fit a quadratic model.  Use the bootstrap
# to get bootstrap distributions of various
# effects, and to get pointwise and simultaneous
# confidence limits


set.seed(71)
n   <- 500
age <- rnorm(n, 50, 10)
sex <- factor(sample(c('female','male'), n, rep=TRUE))
L   <- ifelse(sex=='male', 0, .1*(age-50))
y   <- ifelse(runif(n)<=plogis(L), 1, 0)


f <- lrm(y ~ sex*pol(age,2), x=TRUE, y=TRUE)
b <- bootcov(f, B=50, coef.reps=TRUE, pr=TRUE)   # better: B=500


par(mfrow=c(2,3))
# Assess normality of regression estimates
bootplot(b, which=1:6, what='qq')
# They appear somewhat non-normal


# Plot histograms and estimated densities 
# for 6 coefficients
w <- bootplot(b, which=1:6)
# Print bootstrap quantiles
w$quantiles


# Estimate regression function for females
# for a sequence of ages
ages <- seq(25, 75, length=100)
label(ages) <- 'Age'


# Plot fitted function and pointwise normal-
# theory confidence bands
par(mfrow=c(1,1))
p <- plot(f, age=ages, sex='female')
w <- p$x.xbeta
# Save curve coordinates for later automatic
# labeling using labcurve in the Hmisc library
curves <- vector('list',8)
curves[[1]] <- list(x=w[,1],y=w[,3])
curves[[2]] <- list(x=w[,1],y=w[,4])


# Add pointwise normal-distribution confidence 
# bands using unconditional variance-covariance
# matrix from the bootstrap reps (B=50 above; B=500 is better)
p <- plot(b, age=ages, sex='female', add=TRUE, lty=3)
w <- p$x.xbeta
curves[[3]] <- list(x=w[,1],y=w[,3])
curves[[4]] <- list(x=w[,1],y=w[,4])


dframe <- expand.grid(sex='female', age=ages)
X <- predict(f, dframe, type='x')  # Full design matrix


# Add pointwise bootstrap nonparametric 
# confidence limits
p <- confplot(b, X=X, against=ages, method='pointwise',
              add=TRUE, lty.conf=4)
curves[[5]] <- list(x=ages, y=p$lower)
curves[[6]] <- list(x=ages, y=p$upper)


# Add simultaneous bootstrap confidence band
p <- confplot(b, X=X, against=ages, add=TRUE, lty.conf=5)
curves[[7]] <- list(x=ages, y=p$lower)
curves[[8]] <- list(x=ages, y=p$upper)
lab <- c('a','a','b','b','c','c','d','d')
labcurve(curves, lab)


# Now get bootstrap simultaneous confidence set for
# female:male odds ratios for a variety of ages


dframe <- expand.grid(age=ages, sex=c('female','male'))
X <- predict(f, dframe, type='x')  # design matrix
f.minus.m <- X[1:100,] - X[101:200,]
# First 100 rows are for females.  By subtracting
# design matrices we are able to get Xf*Beta - Xm*Beta
# = (Xf - Xm)*Beta


confplot(b, X=f.minus.m, against=ages,
         method='pointwise', ylab='F:M Log Odds Ratio')
confplot(b, X=f.minus.m, against=ages,
         lty.conf=3, add=TRUE)


# contrast.Design makes it easier to compute the design matrix for use
# in bootstrapping contrasts:


f.minus.m <- contrast(f, list(sex='female',age=ages),
                         list(sex='male',  age=ages))$X
confplot(b, X=f.minus.m)


# For a quadratic binary logistic regression model use bootstrap
# bumping to estimate coefficients under a monotonicity constraint
set.seed(177)
n <- 400
x <- runif(n)
logit <- 3*(x^2-1)
y <- rbinom(n, size=1, prob=plogis(logit))
f <- lrm(y ~ pol(x,2), x=TRUE, y=TRUE)
k <- coef(f)
k
vertex <- -k[2]/(2*k[3])
vertex


# Outside [0,1] so fit satisfies monotonicity constraint within
# x in [0,1], i.e., original fit is the constrained MLE


g <- bootcov(f, B=50, coef.reps=TRUE)
bootcoef <- g$boot.Coef    # 50x3 matrix (B=50 reps x 3 coefficients)
vertex <- -bootcoef[,2]/(2*bootcoef[,3])
table(cut2(vertex, c(0,1)))
mono <- !(vertex >= 0 & vertex <= 1)
mean(mono)    # estimate of Prob{monotonicity in [0,1]}


var(bootcoef)   # var-cov matrix for unconstrained estimates
var(bootcoef[mono,])   # for constrained estimates


# Find second-best vector of coefficient estimates, i.e., best
# from among bootstrap estimates
g$boot.Coef[order(g$boot.loglik[-length(g$boot.loglik)])[1],]
# Note closeness to MLE
}
\keyword{models}
\keyword{regression}
\keyword{htest}
\keyword{methods}
\keyword{hplot}
\concept{bootstrap}
\concept{sampling}

\eof
\name{calibrate}
\alias{calibrate}
\alias{calibrate.default}
\alias{calibrate.cph}
\alias{calibrate.psm}
\alias{print.calibrate}
\alias{print.calibrate.default}
\alias{plot.calibrate}
\alias{plot.calibrate.default}
\title{
Resampling Model Calibration
}
\description{
Uses bootstrapping or cross-validation to get bias-corrected
(overfitting-corrected) estimates of predicted vs. observed values based on
subsetting predictions into intervals (for survival models) or on
nonparametric smoothers (for other models). There are calibration
functions for Cox (\code{cph}), parametric survival models (\code{psm}),
binary and ordinal logistic models (\code{lrm}) and ordinary least squares (\code{ols}).
For survival models,
"predicted" means predicted survival probability at a single
time point, and "observed" refers to the corresponding Kaplan-Meier 
survival estimate, stratifying on intervals of predicted survival.
For logistic and linear models, a nonparametric calibration curve is
estimated over a sequence of predicted values.
The fit must have specified \code{x=TRUE, y=TRUE}.  The \code{print} and \code{plot} methods
for \code{lrm} and \code{ols} models (which use \code{calibrate.default}) print the mean
absolute error in predictions, the mean squared error, and the 0.9 quantile
of the absolute error.  Here, error refers to the difference between the
predicted values and the corresponding bias-corrected calibrated values.

Below, the second, third, and fourth invocations of \code{calibrate}
are, respectively, for \code{ols} and \code{lrm}, \code{cph}, and
\code{psm}.  The first and second \code{plot} invocations are,
respectively, for survival model fits (\code{cph}, \code{psm}) and for
\code{lrm} and \code{ols} fits.
}
\usage{
calibrate(fit, \dots)
\method{calibrate}{default}(fit, predy, 
  method=c("boot","crossvalidation",".632","randomization"),
  B=40, bw=FALSE, rule=c("aic","p"),
  type=c("residual","individual"),
  sls=.05, pr=FALSE, kint, smoother="lowess", \dots)
\method{calibrate}{cph}(fit, method="boot", u, m=150, cuts, B=40, 
  bw=FALSE, rule="aic", type="residual", sls=0.05, aics=0, 
  pr=FALSE, what="observed-predicted", tol=1e-12, \dots)
\method{calibrate}{psm}(fit, method="boot", u, m=150, cuts, B=40,
  bw=FALSE,rule="aic",
  type="residual",sls=.05,aics=0,
  pr=FALSE,what="observed-predicted",tol=1e-12, maxiter=15, 
  rel.tolerance=1e-5, \dots)

\method{print}{calibrate}(x, \dots)
\method{print}{calibrate.default}(x, \dots)

\method{plot}{calibrate}(x, xlab, ylab, subtitles=TRUE, conf.int=TRUE,
\dots)

\method{plot}{calibrate.default}(x, xlab, ylab, xlim, ylim,
  legend=TRUE, subtitles=TRUE, \dots)
}
\arguments{
\item{fit}{
a fit from \code{ols}, \code{lrm}, \code{cph} or \code{psm}
}
\item{x}{an object created by \code{calibrate}}
\item{method}{}
\item{B}{}
\item{bw}{}
\item{rule}{}
\item{type}{}
\item{sls}{}
\item{aics}{see \code{\link{validate}}}
\item{u}{
the time point for which to validate predictions for survival models. For \code{cph} fits,
you must have specified \code{surv=TRUE, time.inc=u}, where \code{u} is
the constant specifying the time to predict.
}
\item{m}{
group predicted \code{u}-time units survival into intervals containing
\code{m} subjects on the average (for survival models only)
}
\item{cuts}{
actual cut points for predicted survival probabilities. You may
specify only one of \code{m} and \code{cuts} (for survival models only)
}
\item{pr}{
set to \code{TRUE} to print intermediate results for each re-sample
}
\item{what}{
The default is \code{"observed-predicted"}, meaning to estimate optimism
in this difference. This is preferred as it accounts for skewed
distributions of predicted probabilities in outer intervals. You can
also specify \code{"observed"}.  This argument applies to survival models only.
}
\item{tol}{criterion for matrix singularity (default is \code{1e-12})}
\item{maxiter}{for \code{psm}, this is passed to
  \code{\link[survival]{survreg.control}} (default is 15 iterations)
}
\item{rel.tolerance}{parameter passed to
  \code{\link[survival]{survreg.control}} for \code{psm} (default is 1e-5).
  }
\item{predy}{
a scalar or vector of predicted values to calibrate (for \code{lrm},
\code{ols}).  Default is 50 equally spaced points between the 5th
smallest and the 5th largest  predicted values.  For \code{lrm} the
predicted values are probabilities (see \code{kint}).
}
\item{kint}{
For an ordinal logistic model the default is to calibrate the predicted
probability that \eqn{Y\geq} the middle level.  Specify \code{kint} to specify the
intercept to use, e.g., \code{kint=2} means to calibrate \eqn{Prob(Y\geq
  b)}, where \eqn{b} is the second level of \eqn{Y}.
}
\item{smoother}{
a function in two variables which produces \eqn{x}- and
\eqn{y}-coordinates by smoothing the input \code{y}.  The default is to
use \code{lowess(x, y, iter=0)}. 
}
\item{\dots}{
other arguments to pass to \code{predab.resample}, such as \code{group},
\code{cluster}, and \code{subset}.
Also, other arguments for \code{plot}.
}
\item{xlab}{
defaults to "Predicted x-units Survival" or to a suitable label for
other models
}
\item{ylab}{
defaults to "Fraction Surviving x-units" or to a suitable label for
other models
}
\item{xlim}{}
\item{ylim}{2-vectors specifying x- and y-axis limits, if not using defaults}
\item{subtitles}{
set to \code{FALSE} to suppress subtitles in plot describing method and for \code{lrm}
and \code{ols} the mean absolute error and original sample size
}
\item{conf.int}{
set to \code{FALSE} to suppress plotting 0.95 confidence intervals for
Kaplan-Meier estimates
}
\item{legend}{
set to \code{FALSE} to suppress legends (for \code{lrm}, \code{ols}
only) on the calibration plot, or specify a list with elements \code{x}
and \code{y} containing the coordinates of the upper left corner of the
legend.  By default, a legend will be drawn in the lower right 1/16th of
the plot.
}}
\value{
matrix specifying mean predicted survival in each interval, the
corresponding estimated bias-corrected Kaplan-Meier estimates,
number of subjects, and other statistics.  For linear and logistic models,
the matrix instead has rows corresponding to the prediction points, and
the vector of predicted values being validated is returned as an attribute.
The returned object has class \code{"calibrate"} or \code{"calibrate.default"}.
}
\section{Side Effects}{
prints, and stores an object \code{pred.obs} or \code{.orig.cal}
}
\details{
If the fit was created using penalized maximum likelihood estimation,
the same \code{penalty} and \code{penalty.scale} parameters are used during
validation.
}
\author{
Frank Harrell\cr
Department of Biostatistics\cr
Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{validate}}, \code{\link{predab.resample}}, \code{\link{groupkm}}, \code{\link[Hmisc]{errbar}}, \code{\link{cph}}, \code{\link{psm}},
\code{\link{lowess}}
}
\examples{
set.seed(1)
d.time <- rexp(200)
x1 <- runif(200)
x2 <- factor(sample(c('a','b','c'),200,TRUE))
f <- cph(Surv(d.time) ~ pol(x1,2)*x2, x=TRUE, y=TRUE, surv=TRUE, time.inc=2)
#or f <- psm(S ~ \dots)
cal <- calibrate(f, u=2, m=50, B=20)  # usually B=200 or 300
plot(cal)


y <- sample(0:2, 200, TRUE)
x1 <- runif(200)
x2 <- runif(200)
x3 <- runif(200)
x4 <- runif(200)
f <- lrm(y ~ x1+x2+x3*x4, x=TRUE, y=TRUE)
cal <- calibrate(f, kint=2, predy=seq(.2,.8,length=60), 
                 group=y)
# group= does k-sample validation: make resamples have same 
# numbers of subjects in each level of y as original sample


plot(cal)
#See the example for the validate function for a method of validating
#continuation ratio ordinal logistic models.  You can do the same
#thing for calibrate
}
\keyword{methods}
\keyword{models}
\keyword{regression}
\keyword{survival}
\keyword{hplot}
\concept{bootstrap}
\concept{model validation}
\concept{calibration}
\concept{model reliability}
\concept{predictive accuracy}

\eof
\name{contrast.Design}
\alias{contrast}
\alias{contrast.Design}
\alias{print.contrast.Design}
\title{
General Contrasts of Regression Coefficients
}
\description{
This function computes one or more contrasts of the estimated
regression coefficients in a fit from one of the functions in Design,
along with standard errors, confidence limits, t or Z statistics, and P-values.
General contrasts are handled by obtaining the design matrix for two
sets of predictor settings (\code{a}, \code{b}) and subtracting the
corresponding rows of the two design matrices to obtain a new contrast
design matrix for testing the \code{a} - \code{b} differences.  This allows for
quite general contrasts (e.g., estimated differences in means between
a 30 year old female and a 40 year old male).
This can also be used
to obtain a series of contrasts in the presence of interactions (e.g.,
female:male log odds ratios for several ages when the model contains
age by sex interaction).  Another use of \code{contrast} is to obtain
center-weighted (Type III test) and subject-weighted (Type II test)
estimates in a model containing treatment by center interactions.  For
the latter case, you can specify \code{type="average"} and an optional
\code{weights} vector to average the within-center treatment contrasts.
The design contrast matrix computed by \code{contrast.Design} can be used
by the \code{bootplot} and \code{confplot} functions to obtain bootstrap
nonparametric confidence intervals for contrasts.

By omitting the \code{b} argument, \code{contrast} can be used to obtain
an average or weighted average of a series of predicted values, along
with a confidence interval for this average.  This can be useful for
"unconditioning" on one of the predictors (see the next to last
example).

When more than one contrast is computed, the list created by
\code{contrast.Design} is suitable for plotting (with error bars or bands)
with \code{xYplot} or \code{Dotplot} (see the last example).
}
\usage{
contrast(fit, \dots)
\method{contrast}{Design}(fit, a, b, cnames=NULL, 
         type=c("individual", "average"), 
         weights="equal", conf.int=0.95, \dots)

\method{print}{contrast.Design}(x, X=FALSE, fun=function(u)u, ...)
}
\arguments{
\item{fit}{
a fit of class \code{"Design"}
}
\item{a}{
a list containing settings for all predictors that you do not wish to
set to default (adjust-to) values.  Usually you will specify two
variables in this list, one set to a constant and one to a sequence of
values, to obtain contrasts for the sequence of values of an
interacting factor.  The \code{gendata} function will generate the
necessary combinations and default values for unspecified predictors.
}
\item{b}{
another list that generates the same number of observations as \code{a},
unless one of the two lists generates only one observation.  In that
case, the design matrix generated from the shorter list will have its
rows replicated so that the contrasts assess several differences
against the one set of predictor values.  This is useful for comparing
multiple treatments with control, for example.  If \code{b} is missing, the
design matrix generated from \code{a} is analyzed alone.
}
\item{cnames}{
vector of character strings naming the contrasts when
\code{type="individual"}.  Usually \code{cnames} is not necessary as
\code{contrast.Design} tries to name the contrasts by examining which
predictors are varying consistently in the two lists.  \code{cnames} will
be needed when you contrast "non-comparable" settings, e.g., you
compare \code{list(treat="drug", age=c(20,30))} with
\code{list(treat="placebo", age=c(40,50))}.
}
\item{type}{
set \code{type="average"} to average the individual contrasts (e.g., to
obtain a Type II or III contrast)
}
\item{weights}{
a numeric vector, used when \code{type="average"}, to obtain weighted contrasts
}
\item{conf.int}{
confidence level for confidence intervals for the contrasts
}
\item{\dots}{unused}
\item{x}{result of \code{contrast}}
\item{X}{
set \code{X=TRUE} to  print design matrix used in computing the contrasts (or
the average contrast)
}
\item{fun}{
a function to transform the contrast, SE, and lower and upper
confidence limits before printing.  For example, specify \code{fun=exp} to
anti-log them for logistic models.
}}
\value{
a list of class \code{"contrast.Design"} containing the elements
\code{Contrast}, \code{SE}, \code{Z}, \code{var}, \code{df.residual},
\code{Lower}, \code{Upper}, \code{Pvalue}, \code{X}, \code{cnames}, which denote the contrast
estimates, standard errors, Z or t-statistics, variance matrix,
residual degrees of freedom (this is \code{NULL} if the model was not
\code{ols}), lower and upper confidence limits, 2-sided P-value, design
matrix, and contrast names (or \code{NULL}).
}
\author{
Frank Harrell\cr
Department of Biostatistics\cr
Vanderbilt University School of Medicine\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{predict.Design}}, \code{\link{gendata}}, \code{\link{bootcov}}, \code{\link{summary.Design}},
\code{\link{anova.Design}}, \code{\link{plot.Design}}
}
\examples{
set.seed(1)
age <- rnorm(200,40,12)
sex <- factor(sample(c('female','male'),200,TRUE))
logit <- (sex=='male') + (age-40)/5
y <- ifelse(runif(200) <= plogis(logit), 1, 0)
f <- lrm(y ~ pol(age,2)*sex)
# Compare a 30 year old female to a 40 year old male
# (with or without age x sex interaction in the model)
contrast(f, list(sex='female', age=30), list(sex='male', age=40))


# For a model containing two treatments, centers, and treatment
# x center interaction, get 0.95 confidence intervals separately
# by center
center <- factor(sample(letters[1:8],500,TRUE))
treat  <- factor(sample(c('a','b'),  500,TRUE))
y      <- 8*(treat=='b') + rnorm(500,100,20)
f <- ols(y ~ treat*center)


lc <- levels(center)
contrast(f, list(treat='b', center=lc),
            list(treat='a', center=lc))


# Get 'Type III' contrast: average b - a treatment effect over
# centers, weighting centers equally (which is almost always
# an unreasonable thing to do)
contrast(f, list(treat='b', center=lc),
            list(treat='a', center=lc),
         type='average')


# Get 'Type II' contrast, weighting centers by the number of
# subjects per center.  Print the design contrast matrix used.
k <- contrast(f, list(treat='b', center=lc),
                 list(treat='a', center=lc),
              type='average', weights=table(center))
print(k, X=TRUE)
# Note: If other variables had interacted with either treat 
# or center, we may want to list settings for these variables
# inside the list()'s, so as to not use default settings


# For a 4-treatment study, get all comparisons with treatment 'a'
treat  <- factor(sample(c('a','b','c','d'),  500,TRUE))
y      <- 8*(treat=='b') + rnorm(500,100,20)
dd     <- datadist(treat,center); options(datadist='dd')
f <- ols(y ~ treat*center)
lt <- levels(treat)
contrast(f, list(treat=lt[-1]),
            list(treat=lt[ 1]),
         cnames=paste(lt[-1],lt[1],sep=':'), conf.int=1-.05/3)


# Compare each treatment with average of all others
for(i in 1:length(lt)) {
  cat('Comparing with',lt[i],'\n\n')
  print(contrast(f, list(treat=lt[-i]),
                    list(treat=lt[ i]), type='average'))
}
options(datadist=NULL)

# Six ways to get the same thing, for a variable that
# appears linearly in a model and does not interact with
# any other variables.  We estimate the change in y per
# unit change in a predictor x1.  Methods 4, 5 also
# provide confidence limits.  Method 6 computes nonparametric
# bootstrap confidence limits.  Methods 2-6 can work
# for models that are nonlinear or non-additive in x1.
# For that case more care is needed in choice of settings
# for x1 and the variables that interact with x1.


\dontrun{
coef(fit)['x1']                            # method 1
diff(predict(fit, gendata(x1=c(0,1))))     # method 2
g <- Function(fit)                         # method 3
g(x1=1) - g(x1=0)
summary(fit, x1=c(0,1))                    # method 4
k <- contrast(fit, list(x1=1), list(x1=0)) # method 5
print(k, X=TRUE)
fit <- update(fit, x=TRUE, y=TRUE)               # method 6
b <- bootcov(fit, B=500, coef.reps=TRUE)
bootplot(b, X=k$X)    # bootstrap distribution and CL


# In a model containing age, race, and sex,
# compute an estimate of the mean response for a
# 50 year old male, averaged over the races using
# observed frequencies for the races as weights


f <- ols(y ~ age + race + sex)
contrast(f, list(age=50, sex='male', race=levels(race)),
         type='average', weights=table(race))
}


# Plot the treatment effect (drug - placebo) as a function of age
# and sex in a model in which age nonlinearly interacts with treatment
# for females only
set.seed(1)
n <- 800
treat <- factor(sample(c('drug','placebo'), n,TRUE))
sex   <- factor(sample(c('female','male'),  n,TRUE))
age   <- rnorm(n, 50, 10)
y     <- .05*age + (sex=='female')*(treat=='drug')*.05*abs(age-50) + rnorm(n)
f     <- ols(y ~ rcs(age,4)*treat*sex)
d     <- datadist(age, treat, sex); options(datadist='d')
# show separate estimates by treatment and sex
plot(f, age=NA, treat=NA, sex='female')
plot(f, age=NA, treat=NA, sex='male')
ages  <- seq(35,65,by=5); sexes <- c('female','male')
w     <- contrast(f, list(treat='drug',    age=ages, sex=sexes),
                     list(treat='placebo', age=ages, sex=sexes))
xYplot(Cbind(Contrast, Lower, Upper) ~ age | sex, data=w,
       ylab='Drug - Placebo')
xYplot(Cbind(Contrast, Lower, Upper) ~ age, groups=sex, data=w,
       ylab='Drug - Placebo', method='alt bars')
options(datadist=NULL)
}
\keyword{htest}
\keyword{models}
\keyword{regression}
% Converted by Sd2Rd version 1.21.

\eof
\name{cph}
\alias{cph}
\alias{Survival.cph}
\alias{Quantile.cph}
\alias{Mean.cph}
\title{Cox Proportional Hazards Model and Extensions}
\description{
Modification of Therneau's \code{coxph} function to fit the Cox model and
its extension, the Andersen-Gill model. The latter allows for interval
time-dependent covariables, time-dependent strata, and repeated events.
The \code{Survival} method for an object created by \code{cph} returns an S
function for computing estimates of the survival function.
The \code{Quantile} method for \code{cph} returns an S function for computing
quantiles of survival time (median, by default).
The \code{Mean} method returns a function for computing the mean survival
time.  This function issues a warning if the last follow-up time is censored,
unless a restricted mean is explicitly requested.
}
\usage{
cph(formula = formula(data), data=if(.R.) parent.frame() else sys.parent(),
    weights, subset, na.action=na.delete, 
    method=c("efron","breslow","exact","model.frame","model.matrix"), 
    singular.ok=FALSE, robust=FALSE,
    model=FALSE, x=FALSE, y=FALSE, se.fit=FALSE, 
    eps=1e-4, init, iter.max=10, tol=1e-9, surv=FALSE, time.inc,
    type, vartype, conf.type, \dots)

\method{Survival}{cph}(object, \dots)
# Evaluate result as g(times, lp, stratum=1, type=c("step","polygon"))

\method{Quantile}{cph}(object, \dots)
# Evaluate like h(q, lp, stratum=1, type=c("step","polygon"))

\method{Mean}{cph}(object, method=c("exact","approximate"), type=c("step","polygon"),
          n=75, tmax, \dots)
# E.g. m(lp, stratum=1, type=c("step","polygon"), tmax, \dots)
}
\arguments{
\item{formula}{
an S formula object with a \code{Surv} object on the left-hand side.
The \code{terms} can specify any S model formula with up to third-order interactions.  The \code{strat}
function may appear in the terms, as a main effect or an interacting
factor.  To stratify on both race and sex, you would include both
terms \code{strat(race)} and \code{strat(sex)}.  Stratification
factors may interact with non-stratification factors;
not all stratification terms need interact with the same modeled
factors.
}
\item{object}{
an object created by \code{cph} with \code{surv=TRUE}
}
\item{data}{
name of an S data frame containing all needed variables.  Omit this to use a
data frame already in the S ``search list''.
}
\item{weights}{
case weights
}
\item{subset}{
an expression defining a subset of the observations to use in the fit.  The default
is to use all observations.  Specify for example \code{age>50 & sex=="male"} or
\code{c(1:100,200:300)}
respectively to use the observations satisfying a logical expression or those having
row numbers in the given vector.
}
\item{na.action}{
specifies an S function to handle missing data.  The default is the function \code{na.delete},
which causes observations with any variable missing to be deleted.  The main difference
between \code{na.delete} and the S-supplied function \code{na.omit} is that 
\code{na.delete} makes a list
of the number of observations that are missing on each variable in the model.
The \code{na.action} is usually specified by e.g. \code{options(na.action="na.delete")}.
}
\item{method}{
for \code{cph}, specifies a particular fitting method, \code{"model.frame"} instead to return the model frame
of the predictor and response variables satisfying any subset or missing value
checks, or \code{"model.matrix"} to return the expanded design matrix.
The default is \code{"efron"}, to use Efron's likelihood for fitting the
model.

For \code{Mean.cph}, \code{method} is \code{"exact"} to use numerical
integration of the 
survival function at any linear predictor value to obtain a mean survival
time.  Specify \code{method="approximate"} to use an approximate method that is
slower when \code{Mean.cph} is executing but then is essentially instant
thereafter.  For the approximate method, the area is computed for \code{n}
points equally spaced between the min and max observed linear predictor
values.  This calculation is done separately for each stratum.  Then the
\code{n} pairs (X beta, area) are saved in the generated S function, and when
this function is evaluated, the \code{approx} function is used to evaluate
the mean for any given linear predictor values, using linear interpolation
over the \code{n} X beta values.
}
\item{singular.ok}{
If \code{TRUE}, the program will automatically skip over columns of the X matrix
that are linear combinations of earlier columns.  In this case the
coefficients for such columns will be NA, and the variance matrix will contain
zeros.  For ancillary calculations, such as the linear predictor, the missing
coefficients are treated as zeros.  The singularities will prevent many of
the features of the \code{Design} library from working.
}
\item{robust}{
if \code{TRUE} a robust variance estimate is returned.  Default is \code{TRUE} if the
model includes a \code{cluster()} operative, \code{FALSE} otherwise.
}
\item{model}{
default is \code{FALSE}.  Set to \code{TRUE} to return the model frame as element 
\code{model} of the fit object.
}
\item{x}{
default is \code{FALSE}.  Set to \code{TRUE} to return the expanded design matrix as element \code{x}
(without intercept indicators) of the
returned fit object.
}
\item{y}{
default is \code{FALSE}.  Set to \code{TRUE} to return the vector of response values (\code{Surv}
object) as element \code{y} of the fit.
}
\item{se.fit}{
default is \code{FALSE}.  Set to \code{TRUE} to compute the estimated standard errors of
the estimate of X beta and store them in element \code{se.fit}
of the fit.  The predictors are first centered to their means
before computing the standard errors.
}
\item{eps}{
convergence criterion - change in log likelihood.
}
\item{init}{
vector of initial parameter estimates.  Defaults to all zeros.
Special residuals can be obtained by setting some elements of \code{init}
to MLEs and others to zero and specifying \code{iter.max=1}.
}
\item{iter.max}{
maximum number of iterations to allow.  Set to \code{0} to obtain certain
null-model residuals.
}
\item{tol}{
tolerance for declaring singularity for matrix inversion (available
only when survival5 or later package is in effect)
}
\item{surv}{
set to \code{TRUE} to compute underlying survival estimates for each
stratum, and to store these along with standard errors of log Lambda(t),
\code{maxtime} (maximum observed survival or censoring time),
and \code{surv.summary} in the returned object.  Set \code{surv="summary"}
to only compute and store \code{surv.summary}, not survival estimates
at each unique uncensored failure time. If you specify \code{x=TRUE} and \code{y=TRUE},
you can obtain predicted survival later, with accurate confidence
intervals for any set of predictor values. The standard error information
stored as a result of \code{surv=TRUE} are only accurate at the mean of all
predictors. If the model has no covariables, these are of course OK.
The main reason for using \code{surv} is to greatly speed up the computation
of predicted survival probabilities as a function of the covariables,
when accurate confidence intervals are not needed.
}
\item{time.inc}{
time increment used in deriving \code{surv.summary}.  Survival,
number at risk, and standard error will be stored for 
\code{t=0, time.inc, 2 time.inc, \dots, maxtime},
where \code{maxtime} is the maximum survival time over all strata.
\code{time.inc} is also used in constructing the time axis in the
\code{survplot} function (see below).  The default value for
\code{time.inc} is 30 if \code{units(ftime) = "Day"} or no \code{units}
attribute has been attached to the survival time variable.  If
\code{units(ftime)} is a word other than \code{"Day"}, the default
for \code{time.inc} is 1 when it is omitted, unless \code{maxtime<1}, then
\code{maxtime/10} is used as \code{time.inc}.  If \code{time.inc} is not given and
\code{maxtime/ default time.inc} > 25, \code{time.inc} is increased.
}
\item{type}{
(for \code{cph}) applies if \code{surv} is \code{TRUE} or \code{"summary"}. 
If \code{type} is omitted, the method consistent with \code{method} is used.
See \code{survfit.coxph} (under \code{survfit}) or \code{survfit.cph} for details and for the
definitions of values of \code{type}

For \code{Survival, Quantile, Mean} set to \code{"polygon"} to use linear 
interpolation instead of the usual step function.  For \code{Mean}, the default
of \code{step} will yield the sample mean in the case of no censoring and no
covariables, if \code{type="kaplan-meier"} was specified to \code{cph}.
For \code{method="exact"}, the value of \code{type} is passed to the
generated function, and it can be overridden when that function is
actually invoked. For \code{method="approximate"}, \code{Mean.cph}
generates the function different ways according to \code{type}, and this
cannot be changed when the function is actually invoked.
}
\item{vartype}{see \code{survfit.coxph}}
\item{conf.type}{
see \code{survfit.cph}; the default bases confidence limits on log-log survival.
}
\item{\dots}{
other arguments passed to \code{coxph.fit} from \code{cph}.  Ignored by
other functions.
}
\item{times}{
a scalar or vector of times at which to evaluate the survival estimates
}
\item{lp}{
a scalar or vector of linear predictors (including the centering constant)
at which to evaluate the survival estimates
}
\item{stratum}{
a scalar stratum number or name (e.g., \code{"sex=male"}) to use in getting
survival probabilities
}
\item{q}{
a scalar quantile or a vector of quantiles to compute
}
\item{n}{
the number of points at which to evaluate the mean survival time, for
\code{method="approximate"} in \code{Mean.cph}.
}
\item{tmax}{
For \code{Mean.cph}, the default is to compute the overall mean (and produce
a warning message if there is censoring at the end of follow-up).
To compute a restricted mean life length, specify the truncation point as \code{tmax}.
For \code{method="exact"}, \code{tmax} is passed to the generated function and it
may be overridden when that function is invoked.  For \code{method="approximate"},
\code{tmax} must be specified at the time that \code{Mean.cph} is run.
}}
\value{
For \code{Survival}, \code{Quantile}, or \code{Mean}, an S function is returned.  Otherwise,
in addition to what is listed below, formula/design information and
the components 
\code{maxtime, time.inc, units, model, x, y, se.fit} are stored, the last 5 
depending on the settings of options by the same names.
The vectors or matrix stored if \code{y=TRUE} or \code{x=TRUE} have rows deleted according to \code{subset} and
to missing data, and have names or row names that come from the
data frame used as input data.

\item{n}{
table with one row per stratum containing number of censored and uncensored observations
}
\item{coef}{
vector of regression coefficients
}
\item{stats}{
vector containing the named elements \code{Obs}, \code{Events}, \code{Model L.R.}, \code{d.f.},
\code{P}, \code{Score}, \code{Score P}, and \code{R2}.
}
\item{var}{
variance/covariance matrix of coefficients
}
\item{linear.predictors}{
values of predicted X beta for observations used in fit, normalized
to have overall mean zero
}
\item{resid}{
martingale residuals
}
\item{loglik}{
log likelihood at initial and final parameter values
}
\item{score}{
value of score statistic at initial values of parameters
}
\item{times}{
lists of times (if \code{surv=TRUE})
}
\item{surv}{
lists of underlying survival probability estimates
}
\item{std.err}{
lists of standard errors of estimate log-log survival
}
\item{surv.summary}{
a 3 dimensional array if \code{surv=TRUE}.  
The first dimension is time ranging from 0 to
\code{maxtime} by \code{time.inc}.  The second dimension refers to strata.
The third dimension contains the time-oriented matrix with
\code{Survival, n.risk} (number of subjects at risk), 
and \code{std.err} (standard error of log-log
survival). 
}
\item{center}{
centering constant, equal to overall mean of X beta.
}}
\details{
If there is any strata by covariable interaction in the model such that
the mean X beta varies greatly over strata, \code{method="approximate"} may
not yield very accurate estimates of the mean in \code{Mean.cph}.


For \code{method="approximate"} if you ask for an estimate of the mean for
a linear predictor value that was outside the range of linear predictors
stored with the fit, the mean for that observation will be \code{NA}.
}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link[survival]{coxph}}, \code{\link[survival]{coxph.fit}}, \code{\link[survival]{Surv}}, \code{\link{residuals.cph}}, \code{\link[survival]{cox.zph}},
\code{\link{survfit.cph}},  \code{\link{survest.cph}}, \code{\link[survival]{survfit.coxph}},
\code{\link{survplot}}, \code{\link{datadist}},
\code{\link{Design}}, \code{\link{Design.trans}}, \code{\link{anova.Design}}, \code{\link{summary.Design}}, \code{\link{predict.Design}},
\code{\link{fastbw}}, \code{\link{validate}}, \code{\link{calibrate}}, \code{\link{plot.Design}},
\code{\link{specs.Design}}, \code{\link{lrm}}, \code{\link{which.influence}}, \code{\link[Hmisc]{na.delete}}, \code{\link[Hmisc]{na.detail.response}},
\code{\link[Hmisc]{naresid}}, \code{\link{print.cph}}, \code{\link{latex.cph}}, \code{\link{vif}}, \code{\link{ie.setup}}
}
\examples{
# Simulate data from a population model in which the log hazard
# function is linear in age and there is no age x sex interaction
n <- 1000
set.seed(731)
age <- 50 + 12*rnorm(n)
label(age) <- "Age"
sex <- factor(sample(c('Male','Female'), n, 
              rep=TRUE, prob=c(.6, .4)))
cens <- 15*runif(n)
h <- .02*exp(.04*(age-50)+.8*(sex=='Female'))
dt <- -log(runif(n))/h
label(dt) <- 'Follow-up Time'
e <- ifelse(dt <= cens,1,0)
dt <- pmin(dt, cens)
units(dt) <- "Year"
dd <- datadist(age, sex)
options(datadist='dd')
Srv <- Surv(dt,e)


f <- cph(Srv ~ rcs(age,4) + sex, x=TRUE, y=TRUE)
cox.zph(f, "rank")             # tests of PH
anova(f)
plot(f, age=NA, sex=NA)      # plot age effect, 2 curves for 2 sexes
survplot(f, sex=NA)             # time on x-axis, separate curves for each sex
res <- resid(f, "scaledsch")
time <- as.numeric(dimnames(res)[[1]])
z <- loess(res[,4] ~ time, span=0.50)   # residuals for sex
if(.R.) plot(time, fitted(z)) else
plot(z, coverage=0.95, confidence=7, xlab="t", 
     ylab="Scaled Schoenfeld Residual",ylim=c(-3,5))
lines(supsmu(time, res[,4]),lty=2)
plot(cox.zph(f,"identity"))    #Easier approach for last 6 lines
# latex(f)


f <- cph(Srv ~ age + strat(sex), surv=TRUE)
g <- Survival(f)   # g is a function
g(seq(.1,1,by=.1), stratum="sex=Male", type="poly") #could use stratum=2
med <- Quantile(f)
plot(f, age=NA, fun=function(x) med(lp=x))          #plot median survival


# g <- cph(Surv(hospital.charges) ~ age, surv=TRUE)
# Cox model very useful for analyzing highly skewed data, censored or not
# m <- Mean(g)
# m(0)                           # Predicted mean charge for reference age


#Fit a time-dependent covariable representing the instantaneous effect
#of an intervening non-fatal event
rm(age)
set.seed(121)
dframe <- data.frame(failure.time=1:10, event=rep(0:1,5),
                     ie.time=c(NA,1.5,2.5,NA,3,4,NA,5,5,5), 
                     age=sample(40:80,10,rep=TRUE))
z <- ie.setup(dframe$failure.time, dframe$event, dframe$ie.time)
S <- z$S
ie.status <- z$ie.status
attach(dframe[z$subs,])    # replicates all variables


f <- cph(S ~ age + ie.status, x=TRUE, y=TRUE)  
#Must use x=TRUE,y=TRUE to get survival curves with time-dep. covariables


#Get estimated survival curve for a 50-year old who has an intervening
#non-fatal event at 5 days
new <- data.frame(S=Surv(c(0,5), c(5,999), c(FALSE,FALSE)), age=rep(50,2),
                  ie.status=c(0,1))
g <- survfit(f, new)
plot(c(0,g$time), c(1,g$surv[,2]), type='s', 
     xlab='Days', ylab='Survival Prob.')
# Not certain about what columns represent in g$surv for survival5
# but appears to be for different ie.status
#or:
#g <- survest(f, new)
#plot(g$time, g$surv, type='s', xlab='Days', ylab='Survival Prob.')


#Compare with estimates when there is no intervening event
new2 <- data.frame(S=Surv(c(0,5), c(5, 999), c(FALSE,FALSE)), age=rep(50,2),
                   ie.status=c(0,0))
g2 <- survfit(f, new2)
lines(c(0,g2$time), c(1,g2$surv[,2]), type='s', lty=2)
#or:
#g2 <- survest(f, new2)
#lines(g2$time, g2$surv, type='s', lty=2)
detach("dframe[z$subs, ]")
options(datadist=NULL)
}
\keyword{survival}
\keyword{models}
\keyword{nonparametric}


\eof
\name{cr.setup}
\alias{cr.setup}
\title{
Continuation Ratio Ordinal Logistic Setup
}
\description{
Creates several new variables which help set up a dataset with an
ordinal response variable \eqn{y} for use in fitting a forward continuation
ratio (CR) model.  The CR model can be fitted with binary logistic
regression if each input observation is replicated the proper
number of times according to the \eqn{y} value, a new binary \eqn{y} is computed
that has at most one \eqn{y=1} per subject,
and if a \code{cohort} variable
is used to define the current qualifying condition for a cohort of
subjects, e.g., \eqn{y\geq 2}.  \code{cr.setup} creates the needed auxiliary variables.
See \code{predab.resample} and \code{validate.lrm} for information about validating
CR models (e.g., using the bootstrap to sample with replacement from the
original subjects instead of the records used in the fit, validating
the model separately for user-specified values of \code{cohort}).
}
\usage{
cr.setup(y)
}
\arguments{
\item{y}{
a character, numeric, \code{category}, or \code{factor} vector containing values of
the response variable.  For \code{category} or \code{factor} variables, the
\code{levels} of the variable are assumed to be listed in an ordinal way.
}}
\value{
a list with components \code{y, cohort, subs, reps}.  \code{y} is a new binary
variable that is to be used in the binary logistic fit.  \code{cohort} is 
a \code{factor} vector specifying which cohort condition currently applies.
\code{subs} is a vector of subscripts that can be used to replicate other
variables the same way \code{y} was replicated.  \code{reps} specifies how many
times each original observation was replicated.  \code{y, cohort, subs} are
all the same length and are longer than the original \code{y} vector.
\code{reps} is the same length as the original \code{y} vector.
The \code{subs} vector is suitable for passing to \code{validate.lrm} or \code{calibrate},
which pass this vector under the name \code{cluster} on to \code{predab.resample} so that bootstrapping can be
done by sampling with replacement from the original subjects rather than
from the individual records created by \code{cr.setup}.
}
\author{
Frank Harrell\cr
Department of Biostatistics\cr
Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\references{
Berridge DM, Whitehead J: Analysis of failure time data with ordinal
categories of response.  Stat in Med 10:1703--1710, 1991.
}
\seealso{
\code{\link{lrm}}, \code{\link{glm}}, \code{\link{predab.resample}}
}
\examples{
y <- c(NA, 10, 21, 32, 32)
cr.setup(y)


set.seed(171)
y <- sample(0:2, 100, rep=TRUE)
sex <- sample(c("f","m"),100,rep=TRUE)
sex <- factor(sex)
table(sex, y)
options(digits=5)
tapply(y==0, sex, mean)
tapply(y==1, sex, mean)
tapply(y==2, sex, mean)
cohort <- y>=1
tapply(y[cohort]==1, sex[cohort], mean)


u <- cr.setup(y)
Y <- u$y
cohort <- u$cohort
sex <- sex[u$subs]


lrm(Y ~ cohort + sex)


 
f <- lrm(Y ~ cohort*sex)   # saturated model - has to fit all data cells
f


# In S-Plus:
#Prob(y=0|female):
# plogis(-.50078)
#Prob(y=0|male):
# plogis(-.50078+.11301)
#Prob(y=1|y>=1, female):
plogis(-.50078+.31845)
#Prob(y=1|y>=1, male):
plogis(-.50078+.31845+.11301-.07379)


combinations <- expand.grid(cohort=levels(cohort), sex=levels(sex))
combinations
p <- predict(f, combinations, type="fitted")
p
p0 <- p[c(1,3)]
p1 <- p[c(2,4)]
p1.unconditional <- (1 - p0) *p1
p1.unconditional
p2.unconditional <- 1 - p0 - p1.unconditional
p2.unconditional


\dontrun{
dd <- datadist(inputdata)   # do this on non-replicated data
options(datadist='dd')
pain.severity <- inputdata$pain.severity
u <- cr.setup(pain.severity)
# inputdata frame has age, sex with pain.severity
attach(inputdata[u$subs,])  # replicate age, sex
# If age, sex already available, could do age <- age[u$subs] etc., or
# age <- rep(age, u$reps), etc.
y      <- u$y
cohort <- u$cohort
dd     <- datadist(dd, cohort)       # add to dd
f <- lrm(y ~ cohort + age*sex)       # ordinary cont. ratio model
g <- lrm(y ~ cohort*sex + age, x=TRUE,y=TRUE) # allow unequal slopes for
                                     # sex across cutoffs
cal <- calibrate(g, cluster=u$subs, subset=cohort=='all')  
# subs makes bootstrap sample the correct units, subset causes
# Predicted Prob(pain.severity=0) to be checked for calibration
}
}
\keyword{category}
\keyword{models}
\keyword{regression}
\concept{logistic regression model}
\concept{continuation ratio model}
\concept{ordinal logistic model}
\concept{ordinal response}

\eof
\name{datadist}
\alias{datadist}
\alias{print.datadist}
\title{
Distribution Summaries for Predictor Variables
}
\description{
For a given set of variables or a data frame, determines summaries
of variables for effect and plotting ranges, values to adjust to,
and overall ranges
for \code{plot.Design}, \code{summary.Design}, \code{survplot}, and \code{nomogram.Design}.
If \code{datadist} is called before
a model fit and the resulting object pointed to with \code{options(datadist="name")},
the data characteristics will be stored with the fit by \code{Design()}, so
that later predictions and summaries of the fit will not need to access
the original data used in the fit.  Alternatively, you can specify the
values for each variable in the model when using these functions, or
specify the values of some of them and let the functions look up the
remainder (of say adjustment levels) from an object created by \code{datadist}.
The best method is probably to run \code{datadist} once before any models are
fitted, storing the distribution summaries for all potential variables.
Adjustment values are \code{0} for binary variables, the most frequent
category (or optionally the first category level)
for categorical (\code{factor}) variables, the middle level for 
\code{ordered factor} variables, and medians for continuous variables.
See descriptions of \code{q.display} and \code{q.effect} for how display and
effect ranges are chosen for continuous variables.
}
\usage{
datadist(\dots, data, q.display, q.effect=c(0.25, 0.75),
         adjto.cat=c('mode','first'), n.unique=10)

\method{print}{datadist}(x, \dots)
# options(datadist="dd")
# used by summary, plot, survplot, sometimes predict
# For dd substitute the name of the result of datadist
}
\arguments{
\item{...}{
a list of variable names, separated by commas, a single data frame, or
a fit with \code{Design} information.  The first element in this list may
also be an object created by an earlier call to \code{datadist}; then
the later variables are added to this \code{datadist} object.
For a fit object, the variables named
in the fit are retrieved from the active data frame or from the location
pointed to by \code{data=frame number} or \code{data="data frame name"}.
For \code{print}, is ignored.
}
\item{data}{
a data frame or a search position.  If \code{data} is a search position,
it is assumed that a data frame is attached in that position, and all
its variables are used.  If you specify both individual variables in
\code{\dots} and \code{data}, the two sets of variables are combined.  Unless the
first argument is a fit object, \code{data} must be an integer.
}
\item{q.display}{
set of two quantiles for computing the range of continuous variables
to use in displaying regression relationships.  Defaults are
\eqn{q} and \eqn{1-q}, where \eqn{q=10/max(n,200)}, and \eqn{n} is the
number of 
non-missing observations.  Thus for \eqn{n<200}, the .05 and .95 quantiles
are used.  For \eqn{n\geq 200}, the \eqn{10^{th}} smallest and
\eqn{10^{th}} largest values are used.  If you specify \code{q.display},
those quantiles are used whether or not \eqn{n<200}.
}
\item{q.effect}{
set of two quantiles for computing the range of continuous variables
to use in estimating regression effects.  Defaults are c(.25,.75),
which yields inter-quartile-range odds ratios, etc.
}
\item{adjto.cat}{
default is \code{"mode"}, indicating that the modal (most frequent) category
for categorical (factor) variables is the adjust-to setting.
Specify \code{"first"} to use the first level of factor variables as the
adjustment values.  In the case of many levels having the maximum
frequency, the first such level is used for \code{"mode"}.
}
\item{n.unique}{
variables having \code{n.unique} or fewer unique values are considered
to be discrete variables in that their unique values are stored in the
\code{values} list.  This will affect how functions such as
\code{nomogram.Design} determine whether variables are discrete or not.
}
\item{x}{result of \code{datadist}}
}
\value{
a list of class \code{"datadist"} with the following components

\item{limits}{
a \eqn{7 \times k} data frame, where \eqn{k} is the number of variables.
The 7 rows correspond to the low value for estimating the effect of
the variable, the value to adjust the variable to when examining
other variables, the high value for effect, low value for displaying
the variable, the high value for displaying it, and the overall lowest
and highest values.
}
\item{values}{
a named list, with one vector of unique values for each numeric
variable having no more than \code{n.unique} unique values
}}
\details{
For categorical variables, the 7 limits are set to character strings
(factors) which correspond to
\code{c(NA,adjto.level,NA,1,k,1,k)}, where \code{k} is the number of levels.
For ordered variables with numeric levels, the limits are set to
\code{c(L,M,H,L,H,L,H)}, where \code{L} is the lowest level, \code{M} is the middle
level, and \code{H} is the highest level.
}
\author{
Frank Harrell\cr
Department of Biostatistics\cr
Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{Design}}, \code{\link{Design.trans}}, \code{\link[Hmisc]{describe}}, \code{\link{plot.Design}}, \code{\link{summary.Design}}
}
\examples{
\dontrun{
d <- datadist(data=1)         # use all variables in search pos. 1
d <- datadist(x1, x2, x3)
page(d)                       # if your options(pager) leaves up a pop-up
                              # window, this is a useful guide in analyses
d <- datadist(data=2)         # all variables in search pos. 2
d <- datadist(data=my.data.frame)
d <- datadist(my.data.frame)  # same as previous.  Run for all potential vars.
d <- datadist(x2, x3, data=my.data.frame)   # combine variables
d <- datadist(x2, x3, q.effect=c(.1,.9), q.display=c(0,1))
# uses inter-decile range odds ratios,
# total range of variables for regression function plots
d <- datadist(d, z)           # add a new variable to an existing datadist
options(datadist="d")         #often a good idea, to store info with fit
f <- ols(y ~ x1*x2*x3)


options(datadist=NULL)        #default at start of session
f <- ols(y ~ x1*x2)
d <- datadist(f)              #info not stored in `f'
d$limits["Adjust to","x1"] <- .5   #reset adjustment level to .5
options(datadist="d")


f <- lrm(y ~ x1*x2, data=mydata)
d <- datadist(f, data=mydata)
options(datadist="d")


f <- lrm(y ~ x1*x2)           #datadist not used - specify all values for
summary(f, x1=c(200,500,800), x2=c(1,3,5))         # obtaining predictions
plot(f, x1=200:800, x2=3)


# Change reference value to get a relative odds plot for a logistic model
d$limits$age[2] <- 30    # make 30 the reference value for age
# Could also do: d$limits["Adjust to","age"] <- 30
fit <- update(fit)   # make new reference value take effect
plot(fit, age=NA, ref.zero=TRUE, fun=exp, ylab='Age=x:Age=30 Odds Ratio')
}
}
\keyword{models}
\keyword{nonparametric}
\keyword{regression}

\eof
\name{fastbw}
\alias{fastbw}
\alias{print.fastbw}
\title{
Fast Backward Variable Selection
}
\description{
Performs a slightly inefficient but numerically stable version of fast
backward elimination on factors, using a method based on Lawless and Singhal
(1978).
This method uses the fitted complete model and computes approximate Wald
statistics by computing conditional (restricted) maximum likelihood estimates
assuming multivariate normality of estimates.
\code{fastbw} deletes factors, not columns of the design matrix. Factors requiring multiple d.f. will be retained or dropped as a group.
The function prints the deletion statistics for each variable in
turn, and prints approximate parameter estimates for the model after
deleting variables.  The approximation is better when the number of
factors deleted is not large.  For \code{ols}, the approximation is exact for
regression coefficients, and standard errors are only off by a factor
equal to the ratio of the mean squared error estimate for the reduced
model to the original mean squared error estimate for the full model.


If the fit was from \code{ols}, \code{fastbw} will compute the usual \eqn{R^2}
statistic for each model.
}
\usage{
fastbw(fit, rule="aic", type="residual", sls=.05, aics=0, eps=1e-9, k.aic=2)

\method{print}{fastbw}(x, digits=4, \dots)
}
\arguments{
\item{fit}{
fit object with \code{Varcov(fit)} defined (e.g., from \code{ols}, \code{lrm}, \code{cph}, \code{psm}, \code{lm}, \code{glm})
}
\item{rule}{
Stopping rule. Defaults to \code{"aic"} for Akaike's information criterion. Use
\code{rule="p"} to use \eqn{P}-values
}
\item{type}{
Type of statistic on which to base the stopping rule. Default is
\code{"residual"} for
the pooled residual chi-square. Use \code{type="individual"} to use Wald
chi-square of individual factors.
}
\item{sls}{
Significance level for staying in a model if \code{rule="p"}.  Default is .05.
}
\item{aics}{
For \code{rule="aic"},
variables are deleted until the chi-square - \code{k.aic} times d.f. falls below \code{aics}.
Default \code{aics} is zero to use the ordinary AIC.  Set \code{aics} to say 10000
to see all variables deleted in order of descending importance.
}
\item{eps}{
Singularity criterion, default is \code{1E-9}.
}
\item{k.aic}{
multiplier to compute AIC, default is 2.  To use BIC, set \code{k.aic} equal
to \eqn{\log(n)}, where \eqn{n} is the effective sample size (number of events
for survival models).
}
\item{x}{result of \code{fastbw}}
\item{digits}{number of significant digits to print}
\item{\dots}{ignored}
}
\value{
a list with the following components:

\item{result}{
matrix of statistics with rows in order of deletion.
}
\item{names.kept}{
names of factors kept in final model.
}
\item{factors.kept}{
the subscripts of factors kept in the final model
}
\item{factors.deleted}{
opposite of \code{factors.kept}.
}
\item{parms.kept}{
column numbers in design matrix corresponding to parameters kept in
the final model.
}
\item{parms.deleted}{
opposite of \code{parms.kept}.
}
\item{coefficients}{
vector of approximate coefficients of reduced model.
}
\item{var}{
approximate covariance matrix for reduced model.
}
\item{Coefficients}{
matrix of coefficients of all models.  Rows correspond to the
successive models examined and columns correspond to the coefficients
in the full model.  For variables not in a particular sub-model (row),
the coefficients are zero.
}}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\references{
Lawless, J. F. and Singhal, K. (1978): Efficient screening of nonnormal regression models.  Biometrics 34:318--327.
}
\seealso{
\code{\link{Design}}, \code{\link{ols}}, \code{\link{lrm}}, \code{\link{cph}}, \code{\link{psm}}, \code{\link{validate}}, \code{\link[Hmisc]{solvet}}, \code{\link{Design.Misc}}
}
\examples{
\dontrun{
fastbw(fit, optional.arguments)     # print results
z <- fastbw(fit, optional.args)     # typically used in simulations
lm.fit(X[,z$parms.kept], Y)         # least squares fit of reduced model
}
}
\keyword{models}
\keyword{regression}
\keyword{htest}


\eof
\name{gendata}
\alias{gendata}
\alias{gendata.Design}
\alias{gendata.default}
\title{
Generate Data Frame with Predictor Combinations
}
\description{
If \code{nobs} is not specified, allows user to specify predictor settings
by e.g. \code{age=50, sex="male"}, and any omitted predictors are set to
reference values (default=median for continuous variables, most frequent
category for categorical ones - see \code{datadist}).  If any predictor has more than one
value given, \code{expand.grid} is called to generate all possible combinations
of values.  If \code{nobs} is given, a data frame is first generated which has
\code{nobs} of adjust-to values duplicated.  Then an editor window is opened
which allows the user to subset the variable names down to ones which she
intends to vary (this streamlines the \code{data.ed} step).  Then, if any
predictors kept are discrete and \code{viewvals=TRUE}, a window (using \code{page})
is opened defining the possible values of this subset, to facilitate
data editing.  Then the \code{data.ed} function is invoked to allow interactive
overriding of predictor settings in the \code{nobs} rows.  The subset of
variables are combined with the other predictors which were not
displayed with \code{data.ed}, and a final full data frame is returned.
\code{gendata} is most useful for creating a \code{newdata} data frame to pass
to \code{predict}.
}
\usage{
gendata(fit, \dots)
\method{gendata}{Design}(fit, nobs, viewvals=FALSE,
  editor=.Options$editor, \dots, factors)
\method{gendata}{default}(fit, \dots)
}
\arguments{
\item{fit}{
a fit object created with \code{Design} in effect
}
\item{nobs}{
number of observations to create if doing it interactively using X-windows
}
\item{viewvals}{
if \code{nobs} is given, set \code{viewvals=TRUE} to open a window displaying the
possible values of categorical predictors
}
\item{editor}{
editor to use to edit the list of variable names to consider.
Default is \code{options(editor=)} value (\code{"xedit"} if this is not specified
and \code{using.X()==TRUE}).
}
\item{...}{
predictor settings, if \code{nobs} is not given. 
}
\item{factors}{
a list containing predictor settings with their names.  This is an
alternative to specifying the variables separately in \code{\dots}.
}}
\value{
a data frame with all predictors, and an attribute \code{names.subset} if
\code{nobs} is specified.  This attribute contains the vector of variable
names for predictors which were passed to \code{data.ed} and hence were
allowed to vary.  If neither \code{nobs} nor any predictor settings were
given, returns a data frame with adjust-to values.
}
\section{Side Effects}{
optionally writes to the terminal, opens X-windows, and generates a
temporary file using \code{sink}.
}
\details{
If you have a variable in \code{\dots} that is named \code{n}, \code{no}, or \code{nob}, add
\code{nobs=FALSE} to the invocation to prevent that variable from being misrecognized
as \code{nobs}.
}
\author{
Frank Harrell\cr
Department of Biostatistics\cr
Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{predict.Design}}, \code{\link{survest.cph}}, \code{\link{survest.psm}}, \code{\link{Design.Misc}}, \code{\link{expand.grid}}, \code{\link{data.entry}}, \code{\link{page}},
\code{\link{print.datadist}}, \code{\link{edit.data.frame}},
\code{\link{plot.Design}}
}
\examples{
set.seed(1)
age <- rnorm(200, 50, 10)
sex <- factor(sample(c('female','male'),200,TRUE))
race <- factor(sample(c('a','b','c','d'),200,TRUE))
y <- sample(0:1, 200, TRUE)
dd <- datadist(age,sex,race)
options(datadist="dd")
f <- lrm(y ~ age*sex + race)
gendata(f)
gendata(f, age=50)
d <- gendata(f, age=50, sex="female")  # leave race=reference category
d <- gendata(f, age=c(50,60), race=c("b","a"))  # 4 obs.
d$Predicted <- predict(f, d, type="fitted")
d      # Predicted column prints at the far right
options(datadist=NULL)
\dontrun{
d <- gendata(f, nobs=5, view=TRUE)        # 5 interactively defined obs.
d[,attr(d,"names.subset")]             # print variables which varied
predict(f, d)
}
}
\keyword{methods}
\keyword{models}
\keyword{regression}
\keyword{manip}

\eof
\name{glmD}
\alias{glmD}
\alias{print.glmD}
\title{Design Version of glm}
\description{
This function saves \code{Design} attributes with the fit object so that
\code{anova.Design}, \code{plot.Design}, etc. can be used just as with
\code{ols} and other fits.  No \code{validate} or \code{calibrate}
methods exist for \code{glmD} though.
}
\usage{
glmD(formula, family = gaussian, data = list(), weights = NULL, subset =
NULL, na.action = na.fail, start = NULL, offset = NULL, control =
glm.control(...), model = TRUE, method = "glm.fit", x = FALSE, y = TRUE,
contrasts = NULL, \dots)

\method{print}{glmD}(x, digits=4, \dots)
}
\arguments{
  \item{formula}{}
  \item{family}{}
  \item{data}{}
  \item{weights}{}
  \item{subset}{}
  \item{na.action}{}
  \item{start}{}
  \item{offset}{}
  \item{control}{}
  \item{model}{}
  \item{method}{}
  \item{x}{}
  \item{y}{}
  \item{contrasts}{see \code{\link{glm}}; for \code{print}, \code{x} is
	the result of \code{glmD}}
  \item{\dots}{ignored for \code{print}}
  \item{digits}{number of significant digits to print}
}
\value{a fit object like that produced by \code{\link{glm}} but with
  \code{Design} attributes and a \code{class} of \code{"Design"},
  \code{"glmD"}, and \code{"glm"} or \code{"glm.null"}.
}
\seealso{\code{\link{glm}},\code{\link{Design}}}
\examples{
## Dobson (1990) Page 93: Randomized Controlled Trial :
counts <- c(18,17,15,20,10,20,25,13,12)
outcome <- gl(3,1,9)
treatment <- gl(3,3)
f <- glm(counts ~ outcome + treatment, family=poisson())
f
anova(f)
summary(f)
f <- glmD(counts ~ outcome + treatment, family=poisson())
# could have had rcs( ) etc. if there were continuous predictors
f
anova(f)
summary(f, outcome=c('1','2','3'), treatment=c('1','2','3'))
}
\keyword{models}
\keyword{regression}

\eof
\name{glsD}
\alias{glsD}
\alias{print.glsD}
\title{Fit Linear Model Using Generalized Least Squares}
\usage{
glsD(model, data, correlation, weights, subset, method, na.action,
    control, verbose, B=0, dupCluster=FALSE, pr=FALSE,
    opmeth=c('optimize','optim'))

\method{print}{glsD}(x, digits=4, \dots)
}
\arguments{
 \item{model}{a two-sided linear formula object describing the
    model, with the response on the left of a \code{~} operator and the
    terms, separated by \code{+} operators, on the right.}
 \item{data}{an optional data frame containing the variables named in
   \code{model}, \code{correlation}, \code{weights}, and
   \code{subset}. By default the variables are taken from the
   environment from which \code{gls} is called.}
 \item{correlation}{an optional \code{corStruct} object describing the
   within-group correlation structure. See the documentation of
   \code{corClasses} for a description of the available \code{corStruct}
   classes. If a grouping variable is to be used, it must be specified in
   the \code{form} argument to the \code{corStruct}
   constructor. Defaults to \code{NULL}, corresponding to uncorrelated 
   errors.}  
 \item{weights}{an optional \code{varFunc} object or one-sided formula
   describing the within-group heteroscedasticity structure. If given as
   a formula, it is used as the argument to \code{varFixed},
   corresponding to fixed variance weights. See the documentation on
   \code{varClasses} for a description of the available \code{varFunc}
   classes. Defaults to \code{NULL}, corresponding to homoscedastic
   errors.} 
 \item{subset}{an optional expression indicating which subset of the rows of
   \code{data} should  be  used in the fit. This can be a logical
   vector, or a numeric vector indicating which observation numbers are
   to be included, or a  character  vector of the row names to be
   included.  All observations are included by default.}
 \item{method}{a character string.  If \code{"REML"} the model is fit by
   maximizing the restricted log-likelihood.  If \code{"ML"} the
   log-likelihood is maximized.  Defaults to \code{"REML"}.}
 \item{na.action}{a function that indicates what should happen when the
   data contain \code{NA}s.  The default action (\code{na.fail}) causes
   \code{gls} to print an error message and terminate if there are any
   incomplete observations.}
 \item{control}{a list of control values for the estimation algorithm to
   replace the default values returned by the function \code{glsControl}.
   Defaults to an empty list.}
 \item{verbose}{an optional logical value. If \code{TRUE} information on
   the evolution of the iterative algorithm is printed. Default is
   \code{FALSE}.}
 \item{B}{number of bootstrap resamples to fit and store, default is
   none}
 \item{dupCluster}{set to \code{TRUE} to have \code{glsD} when
   bootstrapping to consider multiply-sampled clusters as if they were
   one large cluster when fitting using the \code{gls} algorithm}
 \item{pr}{set to \code{TRUE} to show progress of bootstrap resampling}
 \item{opmeth}{specifies whether the \code{optimize} or the \code{optim}
   function is to be used for optimization}
 \item{x}{the result of \code{glsD}}
 \item{digits}{number of significant digits to print}
 \item{\dots}{ignored}
}
\description{
  This function fits a linear model using generalized least
  squares. The errors are allowed to be correlated and/or have unequal
  variances.  \code{glsD} is a slightly enhanced version of the
  Pinheiro and Bates \code{gls} function in the \code{nlme} package to
  make it easy to use with the Design library and to implement cluster
  bootstrapping (primarily for nonparametric estimates of the
  variance-covariance matrix of the parameter estimates and for
  nonparametric confidence limits of correlation parameters).
}
\value{
  an object of classes \code{glsD}, \code{Design}, and \code{gls}
  representing the linear model
  fit. Generic functions such as \code{print}, \code{plot}, and 
  \code{summary} have methods to show the results of the fit. See
  \code{glsObject} for the components of the fit. The functions
  \code{resid}, \code{coef}, and \code{fitted} can be used to extract
  some of its components.  \code{glsD} returns the following components
  not returned by \code{gls}: \code{Design}, \code{assign},
  \code{formula}, \code{opmeth} (see arguments), \code{B} (see
  arguments), \code{bootCoef} (matrix of \code{B} bootstrapped
  coefficients), \code{boot.Corr} (vector of bootstrapped correlation
  parameters), \code{Nboot} (vector of total sample size used in each
  bootstrap; may vary if clusters are unbalanced), and \code{var}
  (sample variance-covariance matrix of bootstrapped coefficients).
}
\references{
  Pinheiro J, Bates D (2000): Mixed effects models in S and S-Plus.  New
  York: Springer-Verlag.
}
\author{Jose Pinheiro \email{jcp@research.bell-labs.com},
  Douglas Bates \email{bates@stat.wisc.edu},
  Frank Harrell \email{f.harrell@vanderbilt.edu},
  Patrick Aboyoun \email{aboyoun@insightful.com}
} 
\seealso{
  \code{\link{gls}},
  \code{\link{glsControl}}, \code{\link{glsObject}},
  \code{\link{varFunc}}, \code{\link{corClasses}}, \code{\link{varClasses}}
}
\examples{
\dontrun{
ns  <- 20  # no. subjects
nt  <- 10  # no. time points/subject
B   <- 10  # no. bootstrap resamples
           # usually do 100 for variances, 1000 for nonparametric CLs
rho <- .5  # AR(1) correlation parameter
V <- matrix(0, nrow=nt, ncol=nt)
V <- rho^abs(row(V)-col(V))   # per-subject correlation/covariance matrix

d <- expand.grid(tim=1:nt, id=1:ns)
d$trt <- factor(ifelse(d$id <= ns/2, 'a', 'b'))
true.beta <- c(Intercept=0,tim=.1,'tim^2'=0,'trt=b'=1)
d$ey  <- true.beta['Intercept'] + true.beta['tim']*d$tim +
  true.beta['tim^2']*(d$tim^2) +  true.beta['trt=b']*(d$trt=='b')
set.seed(13)
library(MASS)   # needed for mvrnorm
d$y <- d$ey + as.vector(t(mvrnorm(n=ns, mu=rep(0,nt), Sigma=V)))

dd <- datadist(d); options(datadist='dd')
# library(nlme)  # S-Plus: library(nlme3) or later
f <- glsD(y ~ pol(tim,2) + trt, correlation=corCAR1(form= ~tim | id),
          data=d, B=B)
f
f$var      # bootstrap variances
f$varBeta  # original variances
summary(f)
anova(f)
plot(f, tim=NA, trt=NA)
# v <- Variogram(f, form=~tim|id, data=d)
}
}
\keyword{models}

\eof
\name{groupkm}
\alias{groupkm}
\title{
Kaplan-Meier Estimates vs. a Continuous Variable
}
\description{
Function to divide \code{x} (e.g. age, or predicted survival at time \code{u} created by
\code{survest}) into \code{g} quantile groups, get Kaplan-Meier estimates at time \code{u}
(a scalar), and to return a matrix with columns \code{x}=mean \code{x} in
quantile, \code{n}=number of subjects, \code{events}=no. events, and 
\code{KM}=K-M survival at time \code{u},
\code{std.err} = s.e. of log-log K-M.  Confidence intervals are based on 
log-log S(t).
Instead of supplying \code{g}, the user can supply the minimum number of subjects
to have
in the quantile group (\code{m}, default=50).
If \code{cuts} is given (e.g. \code{cuts=c(0,.1,.2,\dots,.9,1)}), it overrides \code{m} and \code{g}.
Calls Therneau's \code{survfit.km} to get Kaplan-Meier estimates and standard
errors.
}
\usage{
groupkm(x, Srv, m=50, g, cuts, u, 
        pl=FALSE, loglog=FALSE, conf.int=.95, xlab, ylab,
        lty=1, add=FALSE, cex.subtitle=.7, \dots)
}
\arguments{
\item{x}{variable to stratify}
\item{Srv}{
a "Surv" object - n x 2 matrix containing survival time and event/censoring
1/0 indicator.  Units of measurement come from the "units" attribute
of the survival time variable.  "Day" is the default.
}
\item{m}{
desired minimum number of observations in a group
}
\item{g}{
number of quantile groups
}
\item{cuts}{
actual cuts in \code{x}, e.g. \code{c(0,1,2)} to use [0,1), [1,2].
}
\item{u}{
time for which to estimate survival
}
\item{pl}{
TRUE to plot results
}
\item{loglog}{
set to \code{TRUE} to plot \code{log(-log(survival))} instead of survival
}
\item{conf.int}{
defaults to \code{.95} for 0.95 confidence bars.  Set to \code{FALSE} to suppress bars.
}
\item{xlab}{
if \code{pl=TRUE}, is x-axis label.  Default is \code{label(x)} or name of calling argument
}
\item{ylab}{
if \code{pl=TRUE}, is y-axis label.  Default is constructed from \code{u} and time \code{units}
attribute.
}
\item{lty}{
line type for primary line connecting estimates
}
\item{add}{
set to \code{TRUE} if adding to an existing plot
}
\item{cex.subtitle}{
character size for subtitle. Default is \code{.7}.  Use \code{FALSE} to suppress subtitle.
}
\item{...}{
plotting parameters to pass to the plot and errbar functions
}}
\value{
matrix with columns named \code{x} (mean predictor value in interval), \code{n} (sample size
in interval), \code{events} (number of events in interval), \code{KM} (Kaplan-Meier
estimate), \code{std.err} (standard error of log-log \code{KM})
}
\seealso{
\code{\link[survival]{survfit.km}}, \code{\link[Hmisc]{errbar}}, \code{\link[Hmisc]{cut2}}, \code{\link{Surv}}, \code{\link[Hmisc]{units}}
}
\examples{
n <- 1000
set.seed(731)
age <- 50 + 12*rnorm(n)
cens <- 15*runif(n)
h <- .02*exp(.04*(age-50))
d.time <- -log(runif(n))/h
label(d.time) <- 'Follow-up Time'
e <- ifelse(d.time <= cens,1,0)
d.time <- pmin(d.time, cens)
units(d.time) <- "Year"
groupkm(age, Surv(d.time, e), g=10, u=5, pl=TRUE)
#Plot 5-year K-M survival estimates and 0.95 confidence bars by 
#decile of age.  If omit g=10, will have >= 50 obs./group.
}
\keyword{survival}
\keyword{nonparametric}
\concept{grouping}
\concept{stratification}
\concept{aggregation}

\eof
\name{hazard.ratio.plot}
\alias{hazard.ratio.plot}
\title{
Hazard Ratio Plot
}
\description{
The \code{hazard.ratio.plot} function repeatedly estimates Cox
regression coefficients and confidence limits within time intervals.
The log hazard ratios are plotted against the mean failure/censoring
time within the interval. Unless \code{times} is specified, the number of
time intervals will be \eqn{\max(round(d/e),2)}, where \eqn{d} is the
total number 
of events in the sample. Efron's likelihood is used for estimating
Cox regression coefficients (using \code{coxph.fit}).  In the case of
tied failure times, some intervals may have a point in common.
}
\usage{
hazard.ratio.plot(x, Srv, which, times=, e=30, subset,
                  conf.int=.95, legendloc=NULL, smooth=TRUE, pr=FALSE, pl=TRUE,
                  add=FALSE, ylim, cex=.5, xlab="t", ylab, antilog=FALSE, \dots)
}
\arguments{
\item{x}{
a vector or matrix of predictors
}
\item{Srv}{
a Surv object
}
\item{which}{
a vector of column numbers of \code{x} for which to estimate hazard
ratios across time and make plots.
The default is to do so for all predictors.  Whenever
one predictor is displayed, all other predictors in the \code{x} matrix
are adjusted for (with a separate adjustment form for each time interval).
}
\item{times}{
optional vector of time interval endpoints.
Example: \code{times=c(1,2,3)} uses intervals \code{[0,1), [1,2), [2,3), [3+)}.
If times is omitted, uses intervals containing \code{e} events
}
\item{e}{
number of events per time interval if times not given
}
\item{subset}{
vector used for subsetting the entire analysis,
 e.g. \code{subset=sex=="female"}
}
\item{conf.int}{
confidence interval coverage
}
\item{legendloc}{
location for legend. Omit to use mouse, \code{"none"} for none,
 \code{"ll"} for lower left of graph, or actual x and y coordinates (e.g.
\code{c(2,3)})
}
\item{smooth}{
also plot the super-smoothed version of the log hazard ratios
}
\item{pr}{
defaults to \code{FALSE} to suppress printing of individual Cox fits
}
\item{pl}{
defaults to \code{TRUE} to plot results
}
\item{add}{
add this plot to an already existing plot
}
\item{ylim}{
vector of \code{y}-axis limits. Default is computed to include confidence bands.
}
\item{cex}{
character size for legend information, default is 0.5
}
\item{xlab}{
label for \code{x}-axis, default is \code{"t"}
}
\item{ylab}{
label for \code{y}-axis, default is \code{"Log Hazard Ratio"} or \code{"Hazard Ratio"},
depending on \code{antilog}.
}
\item{antilog}{
default is \code{FALSE}. Set to \code{TRUE} to plot anti-log, i.e., hazard ratio.
}
\item{...}{
optional graphical parameters
}}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link[survival]{cox.zph}}, \code{\link{residuals.cph}}, \code{\link[survival]{coxph.fit}}, \code{\link{cph}}, \code{\link[survival]{coxph}}, \code{\link[survival]{Surv}}
}
\examples{
n <- 500
set.seed(1)
age <- 50 + 12*rnorm(n)
cens <- 15*runif(n)
h <- .02*exp(.04*(age-50))
d.time <- -log(runif(n))/h
label(d.time) <- 'Follow-up Time'
e <- ifelse(d.time <= cens,1,0)
d.time <- pmin(d.time, cens)
units(d.time) <- "Year"
hazard.ratio.plot(age, Surv(d.time,e), e=20, legendloc='ll')
}
\keyword{survival}

\eof
\name{ie.setup}
\alias{ie.setup}
\title{
Intervening Event Setup
}
\description{
Creates several new variables which help set up a dataset for modeling
with \code{cph} or \code{coxph} when there is a single binary time-dependent
covariable which turns on at a given time, and stays on.  This is
typical when analyzing the impact of an intervening event.
\code{ie.setup} creates a \code{Surv} object using the start time, stop time
format.  It also creates a binary indicator for the intervening event,
and a variable called \code{subs} that is useful when \code{attach}-ing a dataframe.
\code{subs} has observation numbers duplicated for subjects having an
intervening event, so those subjects' baseline covariables (that are
not time-dependent) can be duplicated correctly.
}
\usage{
ie.setup(failure.time, event, ie.time, break.ties=FALSE)
}
\arguments{
\item{failure.time}{
a numeric variable containing the event or censoring times for the
terminating event
}
\item{event}{
a binary (0/1) variable specifying whether observations had the
terminating event (event=1) or were censored (event=0)
}
\item{ie.time}{
intervening event times.  For subjects having no intervening events,
the corresponding values of ie.time must be NA.
}
\item{break.ties}{
Occasionally intervening events are recorded as happening at exactly
the same time as the termination of follow-up for some subjects.
The \code{Surv} function will not allow this.  To randomly break the ties
by subtracting a random number from such tied intervening event times,
specify \code{break.ties=TRUE}.  The random number is uniform between zero and
the minimum difference between any two untied \code{failure.time}s.
}}
\value{
a list with components \code{S, ie.status, subs, reps}.  \code{S} is a \code{Surv}
object containing start and stop times for intervals of observation, 
along with event indicators.  \code{ie.status} is one if the intervening
event has occurred at the start of the interval, zero otherwise.
\code{subs} is a vector of subscripts that can be used to replicate other
variables the same way \code{S} was replicated.  \code{reps} specifies how many
times each original observation was replicated.  \code{S, ie.status, subs} are
all the same length (at least the number of rows for \code{S} is) and are longer than the original \code{failure.time} vector.
\code{reps} is the same length as the original \code{failure.time} vector.
The \code{subs} vector is suitable for passing to \code{validate.lrm} or \code{calibrate},
which pass this vector under the name \code{cluster} on to \code{predab.resample} so that bootstrapping can be
done by sampling with replacement from the original subjects rather than
from the individual records created by \code{ie.setup}.
}
\author{
Frank Harrell\cr
Department of Biostatistics\cr
Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{cph}}, \code{\link{coxph}}, \code{\link{Surv}}, \code{\link{cr.setup}}, \code{\link{predab.resample}}
}
\examples{
failure.time <- c(1 ,   2,   3)
event        <- c(1 ,   1,   0)
ie.time      <- c(NA, 1.5, 2.5)


z <- ie.setup(failure.time, event, ie.time)
S <- z$S
S
ie.status <- z$ie.status
ie.status
z$subs
z$reps
\dontrun{
attach(input.data.frame[z$subs,])   #replicates all variables
f <- cph(S ~ age + sex + ie.status)
# Instead of duplicating rows of data frame, could do this:
attach(input.data.frame)
z <- ie.setup(failure.time, event, ie.time)
s <- z$subs
age <- age[s]
sex <- sex[s]
f <- cph(S ~ age + sex + ie.status)
}
}
\keyword{survival}
% Converted by Sd2Rd version 1.21.





\eof
\name{latex.Design}
\alias{latexDesign}
\alias{latex.Design}
\alias{latex.bj}
\alias{latex.glmD}
\alias{latex.glsD}
\alias{latex}
\title{LaTeX Representation of a Fitted Model}
\description{
Creates a file containing a LaTeX representation of the fitted model.  For
model-specific typesetting there is \code{latex.lrm}, \code{latex.cph},
\code{latex.psm} and \code{latex.ols}. \code{latex.cph} has some
arguments that are specific to \code{cph} models.  These routines work
with the \code{display} package from statlib to display and print the
formatted model fits.  \code{latexDesign} is the core function which is
called internally by \code{latex.Design} (which is called by
\code{latex.cph}, \code{latex.ols}, etc.).
}
\synopsis{
latexDesign(object, file=paste(first.word(deparse(substitute(object))),".tex",sep=""),
             append=FALSE, which=1:p, varnames, columns=65, prefix=NULL, inline=FALSE,
             before=if(inline)"" else "& &", intercept, pretrans=TRUE,
             digits=.Options$digits)

}
\usage{
latex(object, title,
      file=paste(first.word(deparse(substitute(object))),'tex',sep='.'), \dots)
}
\arguments{
\item{object}{
a fit object created by a fitting function in the \code{Design} series
}
\item{title}{ignored}
\item{file}{
name of \code{.tex} file to create, default is first word of argument
\code{object} with \code{".tex"} added.  Set to \code{""} to send LaTeX
output to standard output.
}
\item{\dots}{further arguments, including
  \describe{
	\item{\code{append}}{whether or not to append to an existing file}
	\item{\code{which}}{
	  a vector of subscripts (corresponding to \code{object$Design$name})
	  specifying a submodel to print. Default is to describe the whole
	  model. 
	  \code{which} can also be a vector of character strings specifying the
	  factor names to print. Enough of each string is needed to ensure
	  a unique match. Names for interaction effects are of the form
	  \code{"age * sex"}. For any interaction effect for which you do not
	  request main effects, the main effects will be added to \code{which}.
	  When \code{which} is given, the model structural statement is not
	  included. In this case, intercepts are not included either.
	}
	\item{\code{varnames}}{
	  variable names to substitute for non-interactions. Order must correspond
	  to \code{object$Design$name} and interactions must be omitted.
	  Default is
	  \code{object$Design$name[object$Design$assume.code!=9]}. \code{varnames} can contain any LaTeX commands such as subscripts and "\\\\\\\\frac"   
	  (all "\\" must be quadrupled.)
	  Any "/" must be preceded by "\\\\" (2, not 4 backslashes).
	  Elements of \code{varnames} for interactions are ignored; they can be
	  set to any value.
	}
	\item{\code{columns}}{
	  maximum number of columns of printing characters to allow before
	  outputting a LaTeX newline command
	}
	\item{\code{prefix}}{
	  if given, a LaTeX \\lefteqn command of the form \code{\\lefteqn\{prefix =\} \\\\}
	  will be inserted to print a left-hand-side of the equation.
	}
	\item{\code{inline}}{
	  Set to \code{TRUE} to create text for insertion in an in-line equation. This
	  text contains only the expansion of X beta, and is not surrounded by
	  \code{"$"}.
	}
	\item{\code{before}}{
	  a character string to place before each line of output. Use the default
	  for a LaTeX \code{eqnarray} environment.
	}
	\item{\code{intercept}}{
	  a special intercept value to include that is not part of the standard
	  model parameters (e.g., centering constant in Cox model). Only allowed
	  in the \code{latex.Design} rendition.
	}
	\item{\code{pretrans}}{
	  if any spline or polynomial-expanded variables are themselves
	  transformed, a table of pre-transformations will be formed unless
	  \code{pretrans=FALSE}.
	}
	\item{\code{digits}}{number of digits of precision to use in formatting
	  coefficients and other numbers}
	}
	Other arguments in '...' will be passed to \code{latex.default}.
  }
}
\value{a file name of class \code{"latex"}}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link[Hmisc]{latex}}, \code{\link[Hmisc]{rcspline.restate}},
\code{\link{Design}}
}
\examples{
\dontrun{
f <- lrm(death ~ rcs(age)+sex)
w <- latex(f)
w     # displays, using e.g. xdvi
latex(f, file="")    # send LaTeX code to screen
}
}
\keyword{models}
\keyword{regression}
\keyword{character}
\keyword{methods}
\keyword{interface}

\eof
\name{latex.cph}
\alias{latex.cph}
\alias{latex.lrm}
\alias{latex.ols}
\alias{latex.pphsm}
\alias{latex.psm}
\title{
LaTeX Representation of a Fitted Cox Model
}
\description{
Creates a file containing a LaTeX representation of
the fitted model. 
}
\usage{
\method{latex}{cph}(object, title,
      file=paste(first.word(deparse(substitute(object))),".tex",sep=""),
      append=FALSE, surv=TRUE, maxt=FALSE, which, varnames, columns=65, 
      inline=FALSE, before=if(inline)"" else "& &", dec=3,
      pretrans=TRUE, caption, \dots) # for cph fit

\method{latex}{lrm}(object, title, file, append, which, varnames,
columns, inline, before, pretrans, caption, \dots) # for lrm fit

\method{latex}{ols}(object, title, file, append, which, varnames,
columns, inline, before, pretrans, caption, \dots) # ols fit

\method{latex}{pphsm}(object, title, file, append, which, varnames,
columns, inline, before, pretrans, caption, \dots) # pphsm fit

\method{latex}{psm}(object, title, file, append, which, varnames,
columns, inline, before, pretrans, caption, \dots) # psm fit
}
\arguments{
\item{object}{
a fit object created by a \code{Design} fitting function.
}
\item{title}{ignored}
\item{file}{}
\item{append}{see \code{\link[Hmisc]{latex.default}}}
\item{surv}{
if \code{surv=TRUE} was specified to \code{cph}, the underlying survival
probabilities from \code{object$surv.summary} will be placed in a table
unless \code{surv=FALSE}.
}
\item{maxt}{
if the maximum follow-up time in the data (\code{object$maxtime}) exceeds the
last entry in \code{object$surv.summary}, underlying survival estimates at
\code{object$maxtime} will be added to the table if \code{maxt=TRUE}.
}
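\item{which}{see \code{\link{latex.Design}}}
\item{varnames}{see \code{\link{latex.Design}}}
\item{columns}{see \code{\link{latex.Design}}}
\item{inline}{see \code{\link{latex.Design}}}
\item{before}{see \code{\link{latex.Design}}}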
\item{dec}{}
\item{pretrans}{see \code{\link[Hmisc]{latex.default}}}
\item{caption}{a character string specifying a title for the equation to
  be centered and typeset in bold face.   Default is no title.
  }
\item{\dots}{ignored}
}
\value{
the name of the created file, with class \code{c("latex","file")}.  This
object works with latex viewing and printing commands in Hmisc.
}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{latex.Design}}, \code{\link[Hmisc]{rcspline.restate}}, \code{\link[Hmisc]{latex.default}}
}
\examples{
\dontrun{
units(ftime) <- "Day"
f <- cph(Surv(ftime, death) ~ rcs(age)+sex, surv=TRUE, time.inc=60)
w <- latex(f)  #Interprets fitted model and makes table of S0(t)
               #for t=0,60,120,180,\dots   Creates file f.tex
w              #displays image, if viewer installed
latex(f,file="")   # send LaTeX code to the screen
}
}
\keyword{regression}
\keyword{character}
\keyword{survival}
\keyword{interface}
\keyword{models}

\eof
\name{lrm}
\alias{lrm}
\title{
Logistic Regression Model
}
\description{
Fit binary and proportional odds ordinal
logistic regression models using maximum likelihood estimation or
penalized maximum likelihood estimation.  See \code{cr.setup} for how to
fit forward continuation ratio models with \code{lrm}.
}
\usage{
lrm(formula, data, subset, na.action=na.delete, method="lrm.fit",
    model=FALSE, x=FALSE, y=FALSE, linear.predictors=TRUE, se.fit=FALSE, 
    penalty=0, penalty.matrix, tol=1e-7, 
    strata.penalty=0, var.penalty=c('simple','sandwich'),
    weights, normwt, \dots)
}
\arguments{
\item{formula}{
a formula object. An \code{offset} term can be included. The offset causes
fitting of a model such as \eqn{logit(Y=1) = X\beta + W}, where \eqn{W} is the
offset variable having no estimated coefficient.
The response variable can be any data type; \code{lrm} converts it
in alphabetic or numeric order to an S factor variable and
recodes it 0,1,2,\dots internally. 
}
\item{data}{
data frame to use. Default is the current frame.
}
\item{subset}{
logical expression or vector of subscripts defining a subset of
observations to analyze
}
\item{na.action}{
function to handle \code{NA}s in the data. Default is \code{na.delete}, which
deletes any observation having response or predictor missing, while
preserving the attributes of the predictors and maintaining frequencies
of deletions due to each variable in the model.  
This is usually specified using \code{options(na.action="na.delete")}.
}
\item{method}{
name of fitting function. Only allowable choice at present is \code{lrm.fit}.
}
\item{model}{
causes the model frame to be returned in the fit object
}
\item{x}{
causes the expanded design matrix (with missings excluded)
to be returned under the name \code{x}.
}
\item{y}{
causes the response variable (with missings excluded) to be returned
under the name \code{y}.
}
\item{linear.predictors}{
causes the predicted X beta (with missings excluded) to be returned
under the name \code{linear.predictors}. When the response variable has
more than two levels, only the first intercept is used.
}
\item{se.fit}{
causes the standard errors of the fitted values to be returned under
the name \code{se.fit}.
}
\item{penalty}{
The penalty factor subtracted from the log likelihood is
\eqn{0.5 \beta' P \beta}, where \eqn{\beta} is the vector of regression
coefficients other than intercept(s), and \eqn{P} is \code{penalty
  factors * penalty.matrix} and \code{penalty.matrix} is
defined below.  The default is \code{penalty=0} implying that ordinary 
unpenalized maximum likelihood estimation is used.
If \code{penalty} is a scalar, it is assumed to be a penalty factor that
applies 
to all non-intercept parameters in the model.  Alternatively, specify a
list to penalize different types of model terms by differing amounts.
The elements in this list are named \code{simple, nonlinear, interaction} and
\code{nonlinear.interaction}.  If you omit elements on the right of this
series, values are inherited from elements on the left.  Examples:
\code{penalty=list(simple=5, nonlinear=10)} uses a penalty factor of 10
for nonlinear or interaction terms.  
\code{penalty=list(simple=0, nonlinear=2, nonlinear.interaction=4)} does not
penalize linear main effects, uses a penalty factor of 2 for nonlinear or
interaction effects (that are not both), and 4 for nonlinear interaction
effects.
}
\item{penalty.matrix}{
specifies the symmetric penalty matrix for non-intercept terms.
The default matrix for continuous predictors has
the variance of the columns of the design matrix in its diagonal elements
so that the penalty to the log likelihood is unitless.  For main effects
for categorical predictors with \eqn{c} categories, the rows and columns of
the matrix contain a \eqn{(c-1) \times (c-1)}{(c-1) x (c-1)} sub-matrix that is used to
compute the 
sum of squares about the mean of the \eqn{c} parameter values (setting the
parameter to zero for the reference cell) as the penalty component
for that predictor.  This makes the penalty independent of the choice of
the reference cell.  If you specify \code{penalty.matrix}, you may set
the rows and columns for certain parameters to zero so as to not
penalize those parameters.
Depending on \code{penalty}, some elements of \code{penalty.matrix} may
be overridden automatically by setting them to zero.
The penalty matrix that is used in the actual fit is 
\eqn{penalty \times diag(pf) \times penalty.matrix \times diag(pf)},
where \eqn{pf} is the vector 
of square roots of penalty factors computed from \code{penalty} by
\code{Penalty.setup} in \code{Design.Misc}.  If you specify \code{penalty.matrix}
you must specify a nonzero value of \code{penalty} or no penalization will be
done.
}
\item{tol}{singularity criterion (see \code{lrm.fit})}
\item{strata.penalty}{scalar penalty factor for the stratification
  factor, for the experimental \code{strat} variable}
\item{var.penalty}{
the type of variance-covariance matrix to be stored in the \code{var}
component of the fit when penalization is used.  The default is the
inverse of the penalized information matrix.  Specify
\code{var.penalty="sandwich"} to use the sandwich estimator (see below
under \code{var}), which limited simulation studies have shown yields
variance estimates that are too low.
}
\item{weights}{
  a vector (same length as \code{y}) of possibly fractional case weights
}
\item{normwt}{
 set to \code{TRUE} to scale \code{weights} so they sum to the length of
 \code{y}; useful for sample surveys as opposed to the default of
 frequency weighting 
 }
\item{\dots}{
arguments that are passed to \code{lrm.fit}.
}}
\value{
The returned fit object of \code{lrm} contains the following components in addition
to the ones mentioned under the optional arguments.

\item{call}{
calling expression
}
\item{freq}{
table of frequencies for \code{Y} in order of increasing \code{Y}
}
\item{stats}{
vector with the following elements: number of observations used in the
fit, maximum absolute value of first
derivative of log likelihood, model likelihood ratio
\eqn{\chi^2}{chi-square}, d.f., 
\eqn{P}-value, \eqn{c} index (area under ROC curve), Somers' \eqn{D_{xy}},
Goodman-Kruskal \eqn{\gamma}{gamma}, Kendall's \eqn{\tau_a}{tau-a} rank
correlations 
between predicted probabilities and observed response, the
Nagelkerke \eqn{R^2} index, and the Brier score computed with respect to
\eqn{Y >} its lowest level. Probabilities are rounded to the nearest 0.002 
in the computations of rank correlation indexes.
In the case of penalized estimation, the \code{"Model L.R."} is computed
without the penalty factor, and \code{"d.f."} is the effective d.f. from
Gray's (1992) Equation 2.9.
The \eqn{P}-value uses this corrected model
L.R. \eqn{\chi^2}{chi-square} and corrected d.f. 
The score chi-square statistic uses first derivatives which contain
penalty components.
}
\item{fail}{
set to \code{TRUE} if convergence failed (and \code{maxit>1})
}
\item{coefficients}{
estimated parameters
}
\item{var}{
estimated variance-covariance matrix (inverse of information matrix).
If \code{penalty>0}, \code{var} is either the inverse of the penalized
information matrix (the default, if \code{var.penalty="simple"}) or the
sandwich-type variance-covariance
matrix estimate (Gray Eq. 2.6) if \code{var.penalty="sandwich"}.  For the
latter case the simple information-matrix-based variance
matrix is returned under the name \code{var.from.info.matrix}.
}
\item{effective.df.diagonal}{
is returned if \code{penalty>0}.  It is the vector whose sum is the effective
d.f. of the model (counting intercept terms).
}
\item{u}{
vector of first derivatives of log-likelihood
}
\item{deviance}{
-2 log likelihoods (counting penalty components).
When an offset variable is present, three
deviances are computed: for intercept(s) only, for
intercepts+offset, and for intercepts+offset+predictors.
When there is no offset variable, the vector contains deviances for
the intercept(s)-only model and the model with intercept(s) and predictors.
}
\item{est}{
vector of column numbers of \code{X} fitted (intercepts are not counted)
}
\item{non.slopes}{
number of intercepts in model
}
\item{penalty}{
see above
}
\item{penalty.matrix}{
the penalty matrix actually used in the estimation
}}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\references{
Le Cessie S, Van Houwelingen JC: Ridge estimators in logistic regression.
Applied Statistics 41:191--201, 1992.


Verweij PJM, Van Houwelingen JC: Penalized likelihood in Cox regression.
Stat in Med 13:2427--2436, 1994.


Gray RJ: Flexible methods for analyzing survival data using splines,
with applications to breast cancer prognosis.  JASA 87:942--951, 1992.


Shao J: Linear model selection by cross-validation.  JASA 88:486--494, 1993.


Verweij PJM, Van Houwelingen JC: Crossvalidation in survival analysis.
Stat in Med 12:2305--2314, 1993.


Harrell FE: Model uncertainty, penalization, and parsimony.  ISCB
Presentation on UVa Web page, 1998.
}
\seealso{
\code{\link{lrm.fit}}, \code{\link{predict.lrm}}, \code{\link{Design.trans}}, \code{\link{Design}}, \code{\link{glm}}, \code{\link{latex.lrm}},
\code{\link{residuals.lrm}}, \code{\link[Hmisc]{na.delete}}, \code{\link[Hmisc]{na.detail.response}}, \code{\link{naresid}},
\code{\link{pentrace}}, \code{\link{Design.Misc}}, \code{\link{vif}}, \code{\link{cr.setup}}, \code{\link{predab.resample}},
\code{\link{validate.lrm}}, \code{\link{calibrate}}
}
\examples{
#Fit a logistic model containing predictors age, blood.pressure, sex
#and cholesterol, with age fitted with a smooth 5-knot restricted cubic 
#spline function and a different shape of the age relationship for males 
#and females.
#
n <- 1000    # define sample size
set.seed(17) # so can reproduce the results
age            <- rnorm(n, 50, 10)
blood.pressure <- rnorm(n, 120, 15)
cholesterol    <- rnorm(n, 200, 25)
sex            <- factor(sample(c('female','male'), n,TRUE))
label(age)            <- 'Age'      # label is in Hmisc
label(cholesterol)    <- 'Total Cholesterol'
label(blood.pressure) <- 'Systolic Blood Pressure'
label(sex)            <- 'Sex'
units(cholesterol)    <- 'mg/dl'   # uses units.default in Hmisc
units(blood.pressure) <- 'mmHg'


# Specify population model for log odds that Y=1
L <- .4*(sex=='male') + .045*(age-50) +
  (log(cholesterol - 10)-5.2)*(-2*(sex=='female') + 2*(sex=='male'))
# Simulate binary y to have Prob(y=1) = 1/[1+exp(-L)]
y <- ifelse(runif(n) < plogis(L), 1, 0)
cholesterol[1:3] <- NA   # 3 missings, at random


ddist <- datadist(age, blood.pressure, cholesterol, sex)
options(datadist='ddist')


fit <- lrm(y ~ blood.pressure + sex * (age + rcs(cholesterol,4)),
               x=TRUE, y=TRUE)
#      x=TRUE, y=TRUE allows use of resid(), which.influence below
#      could define d <- datadist(fit) after lrm(), but data distribution
#      summary would not be stored with fit, so later uses of plot.Design
#      or summary.Design would require access to the original dataset or
#      d or specifying all variable values to summary, plot, nomogram
anova(fit)
plot(fit, age=NA, sex=NA)
plot(fit, age=20:70, sex="male")   # need if datadist not used
print(cbind(resid(fit,"dfbetas"), resid(fit,"dffits"))[1:20,])
which.influence(fit, .3)
# latex(fit)                       #print nice statement of fitted model
#
#Repeat this fit using penalized MLE, penalizing complex terms
#(for nonlinear or interaction effects)
#
fitp <- update(fit, penalty=list(simple=0,nonlinear=10), x=TRUE, y=TRUE)
effective.df(fitp)
# or lrm(y ~ \dots, penalty=\dots)


#Get fits for a variety of penalties and assess predictive accuracy 
#in a new data set.  Program efficiently so that complex design 
#matrices are only created once.


set.seed(201)
x1 <- rnorm(500)
x2 <- rnorm(500)
x3 <- sample(0:1,500,rep=TRUE)
L  <- x1+abs(x2)+x3
y  <- ifelse(runif(500)<=plogis(L), 1, 0)
new.data <- data.frame(x1,x2,x3,y)[301:500,]
#
for(penlty in seq(0,.15,by=.005)) {
  if(penlty==0) {
    f <- lrm(y ~ rcs(x1,4)+rcs(x2,6)*x3, subset=1:300, x=TRUE, y=TRUE)
    # True model is linear in x1 and has no interaction
    X <- f$x    # saves time for future runs - don't have to use rcs etc.
    Y <- f$y    # this also deletes rows with NAs (if there were any)
    penalty.matrix <- diag(diag(var(X)))
    Xnew <- predict(f, new.data, type="x", incl.non.slopes=FALSE)  
    # expand design matrix for new data
    Ynew <- new.data$y
  } else f <- lrm.fit(X,Y, penalty.matrix=penlty*penalty.matrix)
#
  cat("\nPenalty :",penlty,"\n")
  pred.logit <- f$coef[1] + (Xnew \%*\% f$coef[-1])
  pred <- plogis(pred.logit)
  C.index <- somers2(pred, Ynew)["C"]
  Brier   <- mean((pred-Ynew)^2)
  Deviance<- -2*sum( Ynew*log(pred) + (1-Ynew)*log(1-pred) )
  cat("ROC area:",format(C.index),"   Brier score:",format(Brier),
      "   -2 Log L:",format(Deviance),"\n")
}
#penalty=0.045 gave lowest -2 Log L, Brier, ROC in test sample for S+
#
#Use bootstrap validation to estimate predictive accuracy of
#logistic models with various penalties
#To see how noisy cross-validation estimates can be, change the
#validate(f, \dots) to validate(f, method="cross", B=10) for example.
#You will see tremendous variation in accuracy with minute changes in
#the penalty.  This comes from the error inherent in using 10-fold
#cross validation but also because we are not fixing the splits.  
#20-fold cross validation was even worse for some
#indexes because of the small test sample size.  Stability would be
#obtained by using the same sample splits for all penalty values 
#(see above), but then we wouldn't be sure that the choice of the 
#best penalty is not specific to how the sample was split.  This
#problem is addressed in the last example.
#
penalties <- seq(0,.7,by=.1)   # really use by=.02
index <- matrix(NA, nrow=length(penalties), ncol=9,
	        dimnames=list(format(penalties),
          c("Dxy","R2","Intercept","Slope","Emax","D","U","Q","B")))
i <- 0
for(penlty in penalties) {
  cat(penlty, "")
  i <- i+1
  if(penlty==0) {
    f <- lrm(y ~ rcs(x1,4)+rcs(x2,6)*x3, x=TRUE, y=TRUE)  # fit whole sample
    X <- f$x
    Y <- f$y
    penalty.matrix <- diag(diag(var(X)))   # save time - only do once
  } else f <- lrm(Y ~ X, penalty=penlty,
                  penalty.matrix=penalty.matrix, x=TRUE,y=TRUE)
  val <- validate(f, method="boot", B=20)  # use larger B in practice
  index[i,] <- val[,"index.corrected"]
}
par(mfrow=c(3,3))
for(i in 1:9) {
  plot(penalties, index[,i], 
       xlab="Penalty", ylab=dimnames(index)[[2]][i])
  lines(lowess(penalties, index[,i]))
}
options(datadist=NULL)

# Example of weighted analysis
x <- 1:5
y <- c(0,1,0,1,0)
reps <- c(1,2,3,2,1)
lrm(y ~ x, weights=reps)
x <- rep(x, reps)
y <- rep(y, reps)
lrm(y ~ x)   # same as above

#
#Study performance of a modified AIC which uses the effective d.f.
#See Verweij and Van Houwelingen (1994) Eq. (6).  Here AIC=chisq-2*df.
#Also try as effective d.f. equation (4) of the previous reference.
#Also study performance of Shao's cross-validation technique (which was
#designed to pick the "right" set of variables, and uses a much smaller
#training sample than most methods).  Compare cross-validated deviance
#vs. penalty to the gold standard accuracy on a 7500 observation dataset.
#Note that if you only want to get AIC or Schwarz Bayesian information
#criterion, all you need is to invoke the pentrace function.
#NOTE: the effective.df( ) function is used in practice
#
\dontrun{
for(seed in c(339,777,22,111,3)){ 
# study performance for several datasets
  set.seed(seed)
  n <- 175; p <- 8
  X <- matrix(rnorm(n*p), ncol=p) # p normal(0,1) predictors
  Coef <- c(-.1,.2,-.3,.4,-.5,.6,-.65,.7)  # true population coefficients
  L <- X \%*\% Coef                 # intercept is zero
  Y <- ifelse(runif(n)<=plogis(L), 1, 0)
  pm <- diag(diag(var(X)))
  #Generate a large validation sample to use as a gold standard
  n.val <- 7500
  X.val <- matrix(rnorm(n.val*p), ncol=p)
  L.val <- X.val \%*\% Coef
  Y.val <- ifelse(runif(n.val)<=plogis(L.val), 1, 0)
  #
  Penalty <- seq(0,30,by=1)
  reps <- length(Penalty)
  effective.df <- effective.df2 <- aic <- aic2 <- deviance.val <- 
    Lpenalty <- single(reps)
  n.t <- round(n^.75)
  ncv <- c(10,20,30,40)     # try various no. of reps in cross-val.
  deviance <- matrix(NA,nrow=reps,ncol=length(ncv))
  #If model were complex, could have started things off by getting X, Y
  #penalty.matrix from an initial lrm fit to save time
  #
  for(i in 1:reps) {
    pen <- Penalty[i]
    cat(format(pen),"")
    f.full <- lrm.fit(X, Y, penalty.matrix=pen*pm)
    Lpenalty[i] <- pen* t(f.full$coef[-1]) \%*\% pm \%*\% f.full$coef[-1]
    f.full.nopenalty <- lrm.fit(X, Y, initial=f.full$coef, maxit=1)
    info.matrix.unpenalized <- solve(f.full.nopenalty$var)
    effective.df[i] <- sum(diag(info.matrix.unpenalized \%*\% f.full$var)) - 1
    lrchisq <- f.full.nopenalty$stats["Model L.R."]
    # lrm does all this penalty adjustment automatically (for var, d.f.,
    # chi-square)
    aic[i] <- lrchisq - 2*effective.df[i]
    #
    pred <- plogis(f.full$linear.predictors)
    score.matrix <- cbind(1,X) * (Y - pred)
    sum.u.uprime <- t(score.matrix) \%*\% score.matrix
    effective.df2[i] <- sum(diag(f.full$var \%*\% sum.u.uprime))
    aic2[i] <- lrchisq - 2*effective.df2[i]
    #
    #Shao suggested averaging 2*n cross-validations, but let's do only 40
    #and stop along the way to see if fewer is OK
    dev <- 0
    for(j in 1:max(ncv)) {
      s    <- sample(1:n, n.t)
      cof  <- lrm.fit(X[s,],Y[s], 
                      penalty.matrix=pen*pm)$coef
      pred <- cof[1] + (X[-s,] \%*\% cof[-1])
      dev <- dev -2*sum(Y[-s]*pred + log(1-plogis(pred)))
      for(k in 1:length(ncv)) if(j==ncv[k]) deviance[i,k] <- dev/j
    }
    #
    pred.val <- f.full$coef[1] + (X.val \%*\% f.full$coef[-1])
    prob.val <- plogis(pred.val)
    deviance.val[i] <- -2*sum(Y.val*pred.val + log(1-prob.val))
  }
  postscript(hor=TRUE)   # along with graphics.off() below, allow plots
  par(mfrow=c(2,4))   # to be printed as they are finished
  plot(Penalty, effective.df, type="l")
  lines(Penalty, effective.df2, lty=2)
  plot(Penalty, Lpenalty, type="l")
  title("Penalty on -2 log L")
  plot(Penalty, aic, type="l")
  lines(Penalty, aic2, lty=2)
  for(k in 1:length(ncv)) {
    plot(Penalty, deviance[,k], ylab="deviance")
    title(paste(ncv[k],"reps"))
    lines(supsmu(Penalty, deviance[,k]))
  }
  plot(Penalty, deviance.val, type="l")
  title("Gold Standard (n=7500)")
  title(sub=format(seed),adj=1,cex=.5)
  graphics.off()
}
}
#The results showed that to obtain a clear picture of the penalty-
#accuracy relationship one needs 30 or 40 reps in the cross-validation.
#For 4 of 5 samples, though, the super smoother was able to detect
#an accurate penalty giving the best (lowest) deviance using 10-fold
#cross-validation.  Cross-validation would have worked better had
#the same splits been used for all penalties.
#The AIC methods worked just as well and are much quicker to compute.
#The first AIC based on the effective d.f. in Gray's Eq. 2.9
#(Verweij and Van Houwelingen (1994) Eq. 5 (note typo)) worked best.
}
\keyword{category}
\keyword{models}
\concept{logistic regression model}
\concept{ordinal logistic model}
\concept{proportional odds model}
\concept{continuation ratio model}
\concept{ordinal response}

\eof
\name{lrm.fit}
\alias{lrm.fit}
\title{Logistic Model Fitter}
\description{
Fits a binary or ordinal logistic model for a given design matrix and response
vector with no missing values in either.  Ordinary or penalized maximum
likelihood estimation is used.
}
\usage{
lrm.fit(x, y, offset, initial, est, maxit=12, eps=.025,
        tol=1E-7, trace=FALSE, penalty.matrix, weights, normwt)
}
\arguments{
\item{x}{
design matrix with no column for an intercept
}
\item{y}{
response vector, numeric, categorical, or character
}
\item{offset}{optional numeric vector containing an offset on the logit scale}
\item{initial}{
vector of initial parameter estimates, beginning with the
intercept
}
\item{est}{
indexes of \code{x} to fit in the model (default is all columns of \code{x}).
Specifying \code{est=c(1,2,5)} causes columns 1,2, and 5 to have
parameters estimated. The score vector \code{u} and covariance matrix \code{var}
can be used to obtain score statistics for other columns
}
\item{maxit}{
maximum no. iterations (default=\code{12}). Specifying \code{maxit=1}
causes \code{lrm.fit} to compute statistics at initial estimates.
}
\item{eps}{
difference in \code{-2  log} likelihood for declaring convergence.
Default is \code{.025}.
}
\item{tol}{
Singularity criterion. Default is 1E-7
}
\item{trace}{
set to \code{TRUE} to print -2 log likelihood, step-halving
fraction, and rank of variance matrix at each iteration
}
\item{penalty.matrix}{
  a self-contained ready-to-use penalty matrix - see \code{lrm}
}
\item{weights}{
  a vector (same length as \code{y}) of possibly fractional case weights
}
\item{normwt}{
 set to \code{TRUE} to scale \code{weights} so they sum to the length of
 \code{y}; useful for sample surveys as opposed to the default of
 frequency weighting 
 }
}

\value{
a list with the following components:

\item{call}{
calling expression
}
\item{freq}{
table of frequencies for \code{y} in order of increasing \code{y}
}
\item{stats}{
vector with the following elements: number of observations used in the
fit, maximum absolute value of first
derivative of log likelihood, model likelihood ratio chi-square, d.f.,
P-value,
\eqn{c} index (area under ROC curve), Somers' \eqn{D_{xy}},
Goodman-Kruskal \eqn{\gamma}{gamma}, and Kendall's \eqn{\tau_a}{tau-a}
rank correlations 
between predicted probabilities and observed response, the
Nagelkerke \eqn{R^2} index, and the Brier probability score with
respect to computing the probability that \eqn{y >} lowest level. 
Probabilities are rounded to the nearest 0.002
in the computations of rank correlation indexes.
When \code{penalty.matrix} is present, the \eqn{\chi^2}{chi-square},
d.f., and P-value are not corrected for the effective d.f.
}
\item{fail}{
set to \code{TRUE} if convergence failed (and \code{maxit>1})
}
\item{coefficients}{
estimated parameters
}
\item{var}{
estimated variance-covariance matrix (inverse of information matrix).
Note that in the case of penalized estimation, \code{var} is not the
improved sandwich-type estimator (which \code{lrm} does compute).
}
\item{u}{
vector of first derivatives of log-likelihood
}
\item{deviance}{
-2 log likelihoods. 
When an offset variable is present, three
deviances are computed: for intercept(s) only, for
intercepts+offset, and for intercepts+offset+predictors.
When there is no offset variable, the vector contains deviances for
the intercept(s)-only model and the model with intercept(s) and predictors.
}
\item{est}{
vector of column numbers of \code{X} fitted (intercepts are not counted)
}
\item{non.slopes}{
number of intercepts in model
}
\item{penalty.matrix}{
see above
}}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{lrm}}, \code{\link{glm}}, \code{\link{matinv}}, \code{\link[Hmisc]{solvet}}, \code{\link{cr.setup}}
}
\examples{
#Fit an additive logistic model containing numeric predictors age, 
#blood.pressure, and sex, assumed to be already properly coded and 
#transformed
#
# fit <- lrm.fit(cbind(age,blood.pressure,sex), death)
}
\keyword{models}
\keyword{regression}
\concept{logistic regression model}

\eof
\name{matinv}
\alias{matinv}
\title{
Total and Partial Matrix Inversion using Gauss-Jordan Sweep Operator
}
\description{
  This function inverts or partially inverts a matrix using pivoting
  (the sweep operator).  It is useful for sequential model-building.
  }
\usage{
matinv(a, which, negate=TRUE, eps=1e-12)
}
\arguments{
\item{a}{
square matrix to invert or partially invert.  May have been inverted or
partially inverted previously by matinv, in which case its "swept"
attribute is updated.  Will un-invert if already inverted.
}
\item{which}{
vector of column/row numbers in a to invert.  Default is all, for total
inverse.
}
\item{negate}{
So that the algorithm can keep track of which pivots have been swept
as well as roundoff errors, it actually returns the negative of the
inverse or partial inverse.  By default, these elements are negated to
give the usual expected result.  Set negate=FALSE if you will be passing
the result right back into matinv, otherwise, negate the submatrix
before sending back to matinv.
}
\item{eps}{
singularity criterion
}}
\value{
a square matrix, with attributes "rank" and "swept".
}
\references{
Clarke MRB (1982).  Algorithm AS 178: The Gauss-Jordan sweep operator
with detection of collinearity.  Appl Statist 31:166--9.


Ridout MS, Cobb JM (1989).  Algorithm AS R78: A remark on algorithm AS 178:
The Gauss-Jordan
sweep operator with detection of collinearity.  Appl Statist 38:420--2.
}
\seealso{
\code{\link{lrm}}, \code{\link{solve}}
}
\examples{
a      <- diag(1:3)
a.inv1 <- matinv(a, 1, negate=FALSE)	     #Invert with respect to a[1,1]
a.inv1
a.inv  <- -matinv(a.inv1, 2:3, negate=FALSE) #Finish the job
a.inv
solve(a)
}
\keyword{array}
% Converted by Sd2Rd version 1.21.

\eof
\name{nomogram}
\alias{nomogram}
\alias{nomogram.Design}
\alias{print.nomogram}
\alias{legend.nomabbrev}
\title{
Draw a Nomogram
}
\description{
Draws a partial nomogram that can be used to manually obtain predicted
values from a regression model that was fitted with \code{Design} in effect.
The nomogram does not have lines representing sums, but it has a reference
line for reading scoring points (default range 0-100).  Once the reader
manually totals the points, the predicted values can be read at the bottom.
Non-monotonic transformations of continuous variables are handled (scales
wrap around), as
are transformations which have flat sections (tick marks are labeled
with ranges).  If interactions are in the model, one variable
is picked as the "axis variable", and separate axes are constructed for
each level of the interacting factors (preference is given automatically
to using any discrete factors to construct separate axes) and
levels of factors which are indirectly related to interacting
factors (see DETAILS).  Thus the nomogram is designed so that only
one axis is actually read for each variable, since the variable
combinations are disjoint.  For
categorical interacting factors, the default is to construct axes for
all levels.
The user may specify
coordinates of each predictor to label on its axis, or use default values.
If a factor interacts with other factors, settings for one or more of
the interacting factors may be specified separately (this is mandatory
for continuous variables).  Optional confidence intervals will be
drawn for individual scores as well as for the linear predictor.
If more than one confidence level is chosen, multiple levels may be
displayed using different colors or gray scales.  Functions of the
linear predictors may be added to the nomogram.

\code{print.nomogram} prints axis information stored in an object returned
by \code{nomogram}.  This is useful in producing tables of point assignments
by levels of predictors.  It also prints how many linear predictor
units there are per point and the number of points per unit change in
the linear predictor.

\code{legend.nomabbrev} draws legends describing abbreviations used for
labeling tick marks for levels of categorical predictors.
}
\usage{
nomogram(fit, \dots)

\method{nomogram}{Design}(fit, \dots, adj.to,
         lp=TRUE, lp.at, lplabel="Linear Predictor", 
         fun, fun.at, fun.lp.at,
         funlabel="Predicted Value", fun.side, 
         interact=NULL, intercept=1, conf.int=FALSE, 
         col.conf=c(1, if(under.unix).3 else 12), 
         conf.space=c(.08,.2),
         conf.lp=c("representative","all", "none"),
         est.all=TRUE, abbrev=FALSE, minlength=4, 
         maxscale=100, nint=10, label.every=1, force.label=FALSE,
         xfrac=0.35, cex.axis=0.85, cex.var=1, col.grid=FALSE, 
         vnames=c("labels","names"), varname.label=TRUE, 
         varname.label.sep="=",
         ia.space=.7, tck=-.009, lmgp=.4, omit=NULL, naxes,
         points.label='Points', total.points.label='Total Points',
         total.sep.page=FALSE, total.fun, verbose=FALSE)

\method{print}{nomogram}(x, dec=0, \dots)

legend.nomabbrev(object, which, x, y, ncol=3, \dots)
}
\arguments{
\item{fit}{
a regression model fit that was created with \code{library(Design)} in
effect, and (usually) with \code{options(datadist="object.name")} in effect.
}
\item{object}{
the result returned from \code{nomogram}
}
\item{which}{
a character string giving the name of a variable for which to draw a
legend with abbreviations of factor levels
}
\item{x, y}{
coordinates to pass to the \code{legend} function.  This is the upper left
corner of the legend box.  You can omit \code{y} if \code{x} is a list with
named elements \code{x} and \code{y}.  To use the mouse to locate the legend,
specify \code{locator(1)} for \code{x}.  For \code{print}, \code{x} is
the result of \code{nomogram}.
}
\item{\dots}{
settings of variables to use in constructing axes.  If \code{datadist}
was in effect, the default is to use \code{pretty(total range, nint)}
for continuous variables, and the class levels for discrete ones. 
For \code{legend.nomabbrev}, \code{\dots} specifies optional parameters to pass
to \code{legend}.  Common ones are \code{bty="n"} to suppress drawing the
box.  You may want to specify a non-proportionally spaced font
(e.g., courier) number if abbreviations are more than one letter long.
This will make the abbreviation definitions line up (e.g., specify
\code{font=2}, the default for courier).  Ignored for \code{print}.
}
\item{adj.to}{
If you didn't define \code{datadist} for all predictors, you will have to
define adjustment settings for the undefined ones, e.g.
\code{adj.to=list(age=50, sex="female")}.
}
\item{interact}{
When a continuous variable interacts with a discrete one, axes are
constructed so that the continuous variable moves within the axis, and
separate axes represent levels of interacting factors.  For interactions
between two continuous variables, all but the axis variable must have
discrete levels defined in \code{interact}.  
For discrete interacting factors, you may specify levels to use in
constructing the multiple axes.  For continuous interacting factors,
you must do this.  Examples: \code{interact=list(age=seq(10,70,by=10),
treat=c("A","B","D"))}.
}
\item{lp}{
Set to \code{FALSE} to suppress creation of an axis for scoring
\eqn{X\beta}{X beta}.
}
\item{lp.at}{
If \code{lp=TRUE}, \code{lp.at} may specify a vector of settings of
\eqn{X\beta}{X beta}.
Default is to use \code{pretty(range of linear predictors, nint)}.
}
\item{lplabel}{
label for linear predictor axis.  Default is \code{"Linear Predictor"}.
}
\item{fun}{
an optional function to transform the linear predictors, and to plot
on another axis.  If more than one transformation is plotted, put
them in a list, e.g. \code{list(function(x)x/2, function(x)2*x)}.
Any function values equal to NA will be ignored.  
}
\item{fun.at}{
function values to label on axis.  Default \code{fun} evaluated
at \code{lp.at}.   If more than one \code{fun} was specified, using a vector
for \code{fun.at} will cause all functions to be evaluated at the same
argument values.  To use different values, specify a list of vectors for
\code{fun.at}, with elements corresponding to the different functions
(lists of vectors also applies to \code{fun.lp.at} and \code{fun.side}).
}
\item{fun.lp.at}{
If you want to
evaluate one of the functions at a different set of linear predictor
values than may have been used in constructing the linear predictor axis,
specify a vector or list of vectors 
of linear predictor values at which to evaluate the function.  This is
especially useful for discrete functions.  The presence of this attribute
also does away with the need for \code{nomogram} to compute numerical approximations of 
the inverse of the function.  It also allows the user-supplied function
to return \code{factor} objects, which is useful when e.g. a single tick
mark position actually represents a range.
If the \code{fun.lp.at} parameter is present, the \code{fun.at}
vector for that function is ignored.
}
\item{fun.side}{
a vector or list of vectors of \code{side} parameters for the \code{axis} function
for labeling function values.
Values may be 1 to position a tick mark label below the axis (the default),
or 3 for above the axis.  If for example an axis has 5 tick mark labels
and the second and third will run into each other, specify
\code{fun.side=c(1,1,3,1,1)} (assuming only one function is specified as \code{fun}).
}
\item{funlabel}{
label for \code{fun} axis.  If more than one function was given but
funlabel is of length one, it will be duplicated as needed.  If \code{fun} is
a list of functions for which you specified names (see the final example
below), these names will be used as labels.
}
\item{conf.int}{
confidence levels to display for each scoring.  Default is \code{FALSE} to display
no confidence limits.  Setting \code{conf.int} to \code{TRUE} is the same as
setting it to \code{c(0.7, 0.9)},
with the line segment between the 0.7 and 0.9 levels shaded using
gray scale.  
}
\item{col.conf}{
colors corresponding to \code{conf.int}.  Use fractions for gray scale
(for UNIX S-PLUS).
}
\item{conf.space}{
a 2-element vector with the vertical range within which to draw
confidence bars, in units of 1=spacing between main bars.  Four heights
are used within this range (8 for the linear predictor if more than
16 unique values were evaluated), cycling them among separate confidence
intervals to reduce overlapping.
}
\item{conf.lp}{
default is \code{"representative"} to group all linear predictors evaluated
into deciles, and to show, for the linear predictor confidence intervals,
only the mean linear predictor within the deciles along with the median
standard error within the deciles.  Set \code{conf.lp="none"} to suppress
confidence limits for the linear predictors, and to \code{"all"} to show
all confidence limits.
}
\item{intercept}{
for models such as the ordinal logistic model with multiple intercepts,
specifies which one to use in evaluating the linear predictor.
}
\item{est.all}{
To plot axes for only the subset of variables named in \code{\dots}, set
\code{est.all=FALSE}.  Note: This option only works when zero has a special
meaning for the variables that are omitted from the graph.
}
\item{abbrev}{
Set to \code{TRUE} to use the \code{abbreviate} function to abbreviate levels of
categorical factors, both for labeling tick marks and for axis titles.
If you only want to abbreviate certain predictor variables, set \code{abbrev}
to a vector of character strings containing their names.
}
\item{minlength}{
applies if \code{abbrev=TRUE}.  Is the minimum abbreviation length passed to the
\code{abbreviate} function.  If you set \code{minlength=1}, the letters of the
alphabet are used to label tick marks for categorical predictors, and
all letters are drawn no matter how close together they are.  For
labeling axes (interaction settings), \code{minlength=1} causes
\code{minlength=4} to be used.
}
\item{maxscale}{
default maximum point score is 100
}
\item{nint}{
number of intervals to label for axes representing continuous variables.
See \code{pretty}.
}
\item{label.every}{
Specify \code{label.every=i} to label on every \code{i}th tick mark.
}
\item{force.label}{
set to \code{TRUE} to force every tick mark intended to be labeled to have
a label plotted (whether the labels run into each other or not)
}
\item{xfrac}{
fraction of horizontal plot to set aside for axis titles
}
\item{cex.axis}{
character size for tick mark labels
}
\item{cex.var}{
character size for axis titles (variable names)
}
\item{col.grid}{
If \code{col.grid=1}, no gray scale is used, but an ordinary line is drawn.  
If \code{0<col.grid<1},
a \code{col} (gray scale) of \code{col.grid} is used to draw vertical reference
lines for major axis divisions and \code{col.grid/2} for minor divisions.
The default is \code{col.grid=FALSE}, i.e., reference lines are omitted.
Specifying \code{col.grid=TRUE} is the same as specifying a gray scale level
of \code{col.grid=.2} (5 for Windows S-PLUS).
}
\item{vnames}{
By default, variable labels are used to label axes.  Set \code{vnames="names"}
to instead use variable names.
}
\item{varname.label}{
In constructing axis titles for interactions, the default is to add
\code{"(interacting.varname=level)"} on the right.  Specify \code{varname.label=FALSE}
to instead use \code{"(level)"}.
}
\item{varname.label.sep}{
If \code{varname.label=TRUE}, you can change the separator to something other than
\code{=} by specifying this parameter.
}
\item{ia.space}{
When multiple axes are drawn for levels of interacting factors, the
default is to group combinations related to a main effect.  This is
done by spacing the axes for the second to last of these 
within a group only
0.7 (by default) of the way down as compared with normal space of 1 unit.
}
\item{tck}{
see \code{tck} under \code{par}
}
\item{lmgp}{
spacing between numeric axis labels and axis (see \code{par} for \code{mgp})
}
\item{omit}{
vector of character strings containing names of variables for which to
suppress drawing axes.  Default is to show all variables.
}
\item{naxes}{
maximum number of axes to allow on one plot.  If the nomogram requires more
than one "page", the "Points" axis will be repeated at the top of
each page when necessary.
}
\item{points.label}{
a character string giving the axis label for the points scale
}
\item{total.points.label}{
a character string giving the axis label for the total points scale
}
\item{total.sep.page}{
set to \code{TRUE} to force the total points and later axes to be placed on a
separate page
}
\item{total.fun}{
a user-provided function that will be executed before the total points
axis is drawn.  Default is not to execute a function.  This is useful e.g.
when \code{total.sep.page=TRUE} and you wish to use \code{locator} to find the
coordinates for positioning an abbreviation legend before it's too late
and a new page is started (i.e., \code{total.fun=function()print(locator(1))}).
}
\item{verbose}{
set to \code{TRUE} to get printed output detailing how tick marks are chosen
and labeled for function axes.  This is useful in seeing how certain
linear predictor values cannot be solved for using inverse linear
interpolation on the (requested linear predictor values, function values at 
these lp values).  When this happens you will see \code{NA}s in the \code{verbose}
output, and the corresponding tick marks will not appear in the nomogram.
}
\item{dec}{
number of digits to the right of the decimal point, for rounding
point scores in \code{print.nomogram}.  Default is to round to the nearest
whole number of points.
}
\item{ncol}{
the number of columns to form in drawing the legend.
}}
\value{
a list of class \code{"nomogram"} that contains information used in plotting
the axes.  If you specified \code{abbrev=TRUE}, a list called \code{abbrev} is also
returned that gives the abbreviations used for tick mark labels, if any.  
This list is useful for
making legends and is used by \code{legend.nomabbrev} (see the last example).
The returned list also has components called \code{total.points}, \code{lp},
and the function axis names.  These components have components
\code{x} (\code{at} argument vector given to \code{axis}), \code{y} (\code{pos} for \code{axis}),
and \code{x.real}, the x-coordinates appearing on tick mark labels.
An often useful result is stored in the list of data for each axis variable,
namely the exact number of points that correspond to each tick mark on
that variable's axis.
}
\details{
A variable is considered to be discrete if it is categorical or ordered
or if \code{datadist} stored \code{values} for it (meaning it had \code{<11} unique
values).
A variable is said to be indirectly related to another variable if
the two are related by some interaction.  For example, if a model
has variables a, b, c, d, and the interactions are a:c and c:d,
variable d is indirectly related to variable a.  The complete list
of variables related to a is c, d.  If an axis is made for variable a,
several axes will actually be drawn, one for each combination of c
and d specified in \code{interact}.


Note that with a caliper, it is easy to continually add point scores
for individual predictors, and then to place the caliper on the upper
\code{Points} axis (with extrapolation if needed).  Then transfer these
points to the
\code{Total Points} axis.  In this way, points can be added without
writing them down.


Confidence limits for an individual predictor score are really confidence
limits for the entire linear predictor, with other predictors set to
adjustment values.  If \code{lp=TRUE}, all confidence bars for all linear
predictor values evaluated are drawn.  The extent to which multiple
confidence bars of differing widths appear at the same linear predictor
value means that precision depended on how the linear predictor was
arrived at (e.g., a certain value may be realized from a setting of
a certain predictor that was associated with a large standard error
on the regression coefficients for that predictor).


On occasion, you may want to reverse the regression coefficients of a model
to make the "points" scales reverse direction.  For parametric survival
models, which are stated in terms of increasing regression effects meaning
longer survival (the opposite of a Cox model), just do something like
\code{fit$coefficients <- -fit$coefficients} before invoking \code{nomogram}, 
and if you add function axes, negate the function
arguments.  For the Cox model, you also need to negate \code{fit$center}.
If you omit \code{lp.at}, also negate \code{fit$linear.predictors}.
}
\author{
Frank Harrell\cr
Department of Biostatistics\cr
Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\references{
Banks J: Nomograms. Encyclopedia of Statistical Sciences, Vol 6.
Editors: S Kotz and NL Johnson.  New York: Wiley; 1985.


Lubsen J, Pool J, van der Does E: A practical device for the application
of a diagnostic or prognostic function.  Meth. Inform. Med. 17:127--129; 1978.
}
\seealso{
\code{\link{Design}}, \code{\link{plot.Design}}, \code{\link{plot.summary.Design}}, \code{\link{axis}}, \code{\link{pretty}}, \code{\link{approx}},
\code{\link{latex.Design}}, \code{\link{Design.Misc}}
}
\examples{
n <- 1000    # define sample size
set.seed(17) # so can reproduce the results
age            <- rnorm(n, 50, 10)
blood.pressure <- rnorm(n, 120, 15)
cholesterol    <- rnorm(n, 200, 25)
sex            <- factor(sample(c('female','male'), n,TRUE))


# Specify population model for log odds that Y=1
L <- .4*(sex=='male') + .045*(age-50) +
  (log(cholesterol - 10)-5.2)*(-2*(sex=='female') + 2*(sex=='male'))
# Simulate binary y to have Prob(y=1) = 1/[1+exp(-L)]
y <- ifelse(runif(n) < plogis(L), 1, 0)


ddist <- datadist(age, blood.pressure, cholesterol, sex)
options(datadist='ddist')


f <- lrm(y ~ lsp(age,50)+sex*rcs(cholesterol,4)+blood.pressure)
nomogram(f, fun=function(x)1/(1+exp(-x)),  # or fun=plogis
    fun.at=c(.001,.01,.05,seq(.1,.9,by=.1),.95,.99,.999),
    funlabel="Risk of Death", xfrac=.45)
#Instead of fun.at, could have specified fun.lp.at=logit of
#sequence above - faster and slightly more accurate
nomogram(f, age=seq(10,90,by=10), xfrac=.45)
g <- lrm(y ~ sex + rcs(age,3)*rcs(cholesterol,3))
nomogram(g, interact=list(age=c(20,40,60)), 
         conf.int=c(.7,.9,.95), col.conf=c(1,.5,.2))


cens <- 15*runif(n)
h <- .02*exp(.04*(age-50)+.8*(sex=='female'))
d.time <- -log(runif(n))/h
death <- ifelse(d.time <= cens,1,0)
d.time <- pmin(d.time, cens)


f <- psm(Surv(d.time,death) ~ sex*age, dist=if(.R.)'lognormal' else 'gaussian')
med  <- Quantile(f)
surv <- Survival(f)  # This would also work if f was from cph
nomogram(f, fun=function(x) med(lp=x), funlabel="Median Survival Time")
nomogram(f, fun=list(function(x) surv(3, x), function(x) surv(6, x)),
            funlabel=c("3-Month Survival Probability", 
                       "6-month Survival Probability"), xfrac=.5)


\dontrun{
nom <- nomogram(fit.with.categorical.predictors, abbrev=TRUE, minlength=1)
nom$x1$points   # print points assigned to each level of x1 for its axis
#Add legend for abbreviations for category levels
abb <- nom$abbrev$treatment
legend(locator(1), abb$full, pch=paste(abb$abbrev,collapse=''), 
       ncol=2, bty='n')  # this only works for 1-letter abbreviations
#Or use the legend.nomabbrev function:
legend.nomabbrev(nom, 'treatment', locator(1), ncol=2, bty='n')
}


#Make a nomogram with axes predicting probabilities Y>=j for all j=1-3
#in an ordinal logistic model, where Y=0,1,2,3
Y <- ifelse(y==0, 0, sample(1:3, length(y), TRUE))
g <- lrm(Y ~ age+rcs(cholesterol,4)*sex)
fun2 <- function(x) plogis(x-g$coef[1]+g$coef[2])
fun3 <- function(x) plogis(x-g$coef[1]+g$coef[3])
f <- Newlabels(g, c(age='Age in Years'))  
#see Design.Misc, which also has Newlevels to change 
#labels for levels of categorical variables
nomogram(f, fun=list('Prob Y>=1'=plogis, 'Prob Y>=2'=fun2, 
                     'Prob Y=3'=fun3), 
         fun.at=c(.01,.05,seq(.1,.9,by=.1),.95,.99),
         lmgp=.2, cex.axis=.6)
options(datadist=NULL)
}
\keyword{models}
\keyword{regression}
\keyword{hplot}
% Converted by Sd2Rd version 1.21.



\eof
\name{ols}
\alias{ols}
\title{
Linear Model Estimation Using Ordinary Least Squares
}
\description{
Fits the usual weighted or unweighted linear regression model using the
same fitting routines used by \code{lm}, but also storing the variance-covariance
matrix \code{var} and using traditional dummy-variable coding for categorical
factors.  
Also fits unweighted models using penalized least squares, with the same
penalization options as in the \code{lrm} function.  For penalized estimation,
there is a fitter function called \code{lm.pfit}.
}
\usage{
ols(formula, data, weights, subset, na.action=na.delete, 
    method="qr", model=FALSE,
    x=FALSE, y=FALSE, se.fit=FALSE, linear.predictors=TRUE,
    penalty=0, penalty.matrix, tol=1e-7, sigma,
    var.penalty=c('simple','sandwich'), \dots)
}
\arguments{
\item{formula}{
an S formula object, e.g. 
\cr
    Y ~ rcs(x1,5)*lsp(x2,c(10,20))
}
\item{data}{
name of an S data frame containing all needed variables.  Omit this to use a
data frame already in the S ``search list''.
}
\item{weights}{an optional vector of weights to be used in the fitting
          process. If specified, weighted least squares is used with
          weights \code{weights} (that is, minimizing \eqn{sum(w*e^2)});
          otherwise ordinary least squares is used.}
\item{subset}{
an expression defining a subset of the observations to use in the fit.  The default
is to use all observations.  Specify for example \code{age>50 & sex=="male"} or
\code{c(1:100,200:300)}
respectively to use the observations satisfying a logical expression or those having
row numbers in the given vector.
}
\item{na.action}{
specifies an S function to handle missing data.  The default is the function \code{na.delete},
which causes observations with any variable missing to be deleted.  The main difference
between \code{na.delete} and the S-supplied function \code{na.omit} is that 
\code{na.delete} makes a list
of the number of observations that are missing on each variable in the model.
The \code{na.action} is usually specified by e.g. \code{options(na.action="na.delete")}.
}
\item{method}{
specifies a particular fitting method, or \code{"model.frame"} instead to return the model frame
of the predictor and response variables satisfying any subset or missing value
checks.
}
\item{model}{
default is \code{FALSE}.  Set to \code{TRUE} to return the model frame
as element \code{model} of the fit object.
}
\item{x}{
default is \code{FALSE}.  Set to \code{TRUE} to return the expanded design matrix as element \code{x}
(without intercept indicators) of the
returned fit object.  Set \code{x=TRUE} if you are going to use
the \code{residuals} function later to return anything other than ordinary residuals.
}
\item{y}{
default is \code{FALSE}.  Set to \code{TRUE} to return the vector of response values 
as element \code{y} of the fit.
}
\item{se.fit}{
default is \code{FALSE}.  Set to \code{TRUE} to compute the estimated standard errors of
the estimate of \eqn{X\beta}{X beta} and store them in element \code{se.fit}
of the fit. 
}
\item{linear.predictors}{
set to \code{FALSE} to cause predicted values not to be stored
}
\item{penalty, penalty.matrix}{
see \code{lrm}
}
\item{tol}{tolerance for information matrix singularity}
\item{sigma}{
If \code{sigma} is given, it is taken as the actual root mean squared error parameter for the model.  Otherwise \code{sigma} is estimated from the data using the usual formulas (except for penalized models).  It is often convenient to specify
\code{sigma=1} for models with no error, when using \code{fastbw} to find an
approximate model that predicts predicted values from the full model with
a given accuracy.
}
\item{var.penalty}{
the type of variance-covariance matrix to be stored in the \code{var}
component of the fit when penalization is used.  The default is the
inverse of the penalized information matrix.  Specify
\code{var.penalty="sandwich"} to use the sandwich estimator (see below
under \code{var}), which limited simulation studies have shown yields
variance estimates that are too low.
}
\item{\dots}{arguments to pass to \code{\link{lm.wfit}} or
  \code{\link{lm.fit}}}
}
\value{
the same objects returned from \code{lm} (unless \code{penalty} or \code{penalty.matrix}
are given - then an
abbreviated list is returned since \code{lm.pfit} is used as a fitter)
plus the design attributes
(see \code{Design}).
Predicted values are always returned, in the element \code{linear.predictors}.
The vectors or matrix stored if \code{y=TRUE} or \code{x=TRUE} have rows deleted according to \code{subset} and
to missing data, and have names or row names that come from the
data frame used as input data.  If \code{penalty} or \code{penalty.matrix} is given, 
the \code{var} matrix
returned is an improved variance-covariance matrix
for the penalized regression coefficient estimates.  If
\code{var.penalty="sandwich"} (not the default, as limited simulation
studies have found it provides variance estimates that are too low) it
is defined as 
\eqn{\sigma^{2} (X'X + P)^{-1} X'X (X'X + P)^{-1}}, where \eqn{P} is 
\code{penalty factors * penalty.matrix}, with a column and row of zeros
added for the
intercept.  When \code{var.penalty="simple"} (the default), \code{var} is
\eqn{\sigma^{2} (X'X + P)^{-1}}.
The returned list has a vector \code{stats} with named elements
\code{n, Model L.R., d.f., R2, Sigma}.  \code{Model L.R.} is the model
likelihood ratio \eqn{\chi^2}{chi-square} statistic, and \code{R2} is
\eqn{R^2}.  For penalized estimation, \code{d.f.} is the 
effective degrees of freedom, which is the sum of the elements of another
vector returned, \code{effective.df.diagonal}, minus one for the intercept.
\code{Sigma} is the penalized maximum likelihood estimate (see below).
}
\details{
For penalized estimation, the penalty factor on the log likelihood is
\eqn{-0.5 \beta' P \beta / \sigma^2}, where \eqn{P} is defined above.
The penalized maximum likelihood estimate (penalized least squares
or ridge estimate) of \eqn{\beta}{beta} is \eqn{(X'X + P)^{-1} X'Y}.
The maximum likelihood estimate of \eqn{\sigma^2} is \eqn{(sse + \beta'
  P \beta) / n}, where
\code{sse} is the sum of squared errors (residuals).
The \code{effective.df.diagonal} vector is the
diagonal of the matrix \eqn{X'X/(sse/n) \sigma^{2} (X'X + P)^{-1}}.
}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{Design}}, \code{\link{Design.trans}}, \code{\link{anova.Design}}, \code{\link{summary.Design}}, \code{\link{predict.Design}},
\code{\link{fastbw}}, \code{\link{validate}}, \code{\link{calibrate}}, \code{\link{plot.Design}}, 
\code{\link{specs.Design}}, \code{\link{cph}}, \code{\link{lrm}}, \code{\link{which.influence}}, \code{\link{lm}}, \code{\link{summary.lm}},
\code{\link{print.ols}}, \code{\link{residuals.ols}}, \code{\link{latex.ols}}, \code{\link[Hmisc]{na.delete}}, \code{\link[Hmisc]{na.detail.response}},
\code{\link{naresid}}, \code{\link{datadist}}, \code{\link{pentrace}}, \code{\link{vif}}, \code{\link[Hmisc]{abs.error.pred}}
}
\examples{
set.seed(1)
x1 <- runif(200)
x2 <- sample(0:3, 200, TRUE)
distance <- (x1 + x2/3 + rnorm(200))^2
d <- datadist(x1,x2)
options(datadist="d")   # No d -> no summary, plot without giving all details


f <- ols(sqrt(distance) ~ rcs(x1,4) + scored(x2), x=TRUE)
# could use d <- datadist(f); options(datadist="d") at this point,
# but predictor summaries would not be stored in the fit object for
# use with plot.Design, summary.Design.  In that case, the original
# dataset or d would need to be accessed later, or all variable values
# would have to be specified to summary, plot
anova(f)
which.influence(f)
summary(f)
summary.lm(f)    # will only work if penalty and penalty.matrix not used


# Fit a complex model and approximate it with a simple one
x1 <- runif(200)
x2 <- runif(200)
x3 <- runif(200)
x4 <- runif(200)
y <- x1 + x2 + rnorm(200)
f    <- ols(y ~ rcs(x1,4) + x2 + x3 + x4)
pred <- fitted(f)   # or predict(f) or f$linear.predictors
f2   <- ols(pred ~ rcs(x1,4) + x2 + x3 + x4, sigma=1)
# sigma=1 prevents numerical problems resulting from R2=1
fastbw(f2, aics=100000)
# This will find the best 1-variable model, best 2-variable model, etc.
# in predicting the predicted values from the original model
options(datadist=NULL)
}
\keyword{models}
\keyword{regression}
% Converted by Sd2Rd version 1.21.

\eof
\name{pentrace}
\alias{pentrace}
\alias{plot.pentrace}
\alias{print.pentrace}
\alias{effective.df}
\title{
Trace AIC and BIC vs. Penalty
}
\description{
For an ordinary unpenalized fit from \code{lrm} or \code{ols} and for a vector or list of penalties, 
fits a series of logistic or linear models using penalized maximum likelihood
estimation, and saves the effective degrees of freedom, Akaike Information
Criterion (\eqn{AIC}), Schwarz Bayesian Information Criterion (\eqn{BIC}), and
Hurvich and Tsai's corrected \eqn{AIC} (\eqn{AIC_c}).  Optionally
\code{pentrace} can 
use the \code{nlminb} function to solve for the optimum penalty factor or
combination of factors penalizing different kinds of terms in the model.
The \code{effective.df} function prints the original and effective
degrees of freedom for a penalized fit or for an unpenalized fit and
the best penalization determined from a previous invocation of
\code{pentrace} if \code{method="grid"} (the default).
The effective d.f. is computed separately for each class of terms in
the model (e.g., interaction, nonlinear).
A \code{plot} method exists to plot the results, and a \code{print} method exists
to print the most pertinent components.  Both \eqn{AIC} and \eqn{BIC}
may be plotted if 
there is only one penalty factor type specified in \code{penalty}.  Otherwise,
the first two types of penalty factors are plotted, showing only the \eqn{AIC}.
}
\usage{
pentrace(fit, penalty, penalty.matrix, 
         method=c('grid','optimize'),
         which=c('aic.c','aic','bic'), target.df,
         fitter, pr=FALSE, tol=1e-7,
         keep.coef=FALSE, complex.more=TRUE, verbose=FALSE, maxit=12, subset)

effective.df(fit, object)

\method{print}{pentrace}(x, \dots)

\method{plot}{pentrace}(x, method=c('points','image'), 
     which=c('effective.df','aic','aic.c','bic'), pch=2, add=FALSE, 
     ylim, \dots)
}
\arguments{
\item{fit}{
a result from \code{lrm} or \code{ols} with \code{x=TRUE, y=TRUE} and without using \code{penalty} or
\code{penalty.matrix}
(or optionally using penalization in the case of \code{effective.df})
}
\item{penalty}{
can be a vector or a list.  If it is a vector, all types of terms in
the model will be penalized by the same amount, specified by elements in
\code{penalty}, with a penalty of zero automatically added.  \code{penalty} can
also be a list in the format documented in the \code{lrm} function, except that
elements of the list can be vectors.  The \code{expand.grid} function is
invoked by \code{pentrace} to generate all possible combinations of
penalties.  For example, specifying 
\code{penalty=list(simple=1:2, nonlinear=1:3)} will generate 6 combinations
to try, so that the analyst can attempt to determine whether penalizing
more complex terms in the model more than the linear or categorical
variable terms will be beneficial.  If \code{complex.more=TRUE}, it is assumed
that the variables given in \code{penalty} are listed in order from less
complex to more complex.  With \code{method="optimize"} \code{penalty} specifies
an initial guess for the penalty or penalties.  If all term types are
to be equally penalized, \code{penalty} should be a single number,
otherwise it should be a list containing single numbers as elements,
e.g., \code{penalty=list(simple=1, nonlinear=2)}.  Experience has shown that the optimization algorithm is more likely to find a reasonable solution when the starting value specified in \code{penalty} is too large rather than too small.
}
\item{object}{
an object returned by \code{pentrace}.  For \code{effective.df}, \code{object} can be
omitted if the \code{fit} was penalized.
}
\item{penalty.matrix}{
see \code{lrm}
}
\item{method}{
The default is \code{method="grid"} to print various indexes for all
combinations of penalty parameters given by the user.  Specify
\code{method="optimize"} to have \code{pentrace} use \code{nlminb} to solve for the
combination of penalty parameters that gives the maximum value of the
objective named in \code{which}, or, if \code{target.df} is given, to find the
combination that yields \code{target.df} effective total degrees of freedom
for the model.  When \code{target.df} is specified, \code{method} is set to
\code{"optimize"} automatically.
For \code{plot.pentrace} this parameter applies only if more than one
penalty term-type was used.  The default is to use open triangles
whose sizes are proportional to the ranks of the AICs, plotting the
first two penalty factors respectively on the x and y  axes.  Use
\code{method="image"} to plot an image plot. 
}
\item{which}{
the objective to maximize for either \code{method}.  Default is \code{"aic.c"} (corrected
AIC).
For \code{plot.pentrace}, \code{which} is a vector of names of criteria to show;
default is to plot all 4 types, with effective d.f. in its own separate plot
}
\item{target.df}{
applies only to \code{method="optimize"}.  See \code{method}.  \code{target.df} makes
sense mainly when a single type of penalty factor is specified.
}
\item{fitter}{
a fitting function.  Default is \code{lrm.fit} (\code{lm.pfit} is always used for \code{ols}).
}
\item{pr}{
set to \code{TRUE} to print intermediate results
}
\item{tol}{
tolerance for declaring a matrix singular (see \code{lrm.fit, solvet})
}
\item{keep.coef}{
set to \code{TRUE} to store matrix of regression  coefficients for all the fits (corresponding
to increasing values of \code{penalty}) in object \code{Coefficients} in the
returned list.  Rows correspond to penalties, columns to regression
parameters.
}
\item{complex.more}{
By default if \code{penalty} is a list, combinations of penalties for which
complex terms are penalized less than less complex terms will be
dropped after \code{expand.grid} is invoked.  Set \code{complex.more=FALSE} to
allow more complex terms to be penalized less.  Currently this option
is ignored for \code{method="optimize"}.
}
\item{verbose}{set to \code{TRUE} to print number of intercepts and sum
  of effective degrees of freedom}
\item{maxit}{
maximum number of iterations to allow in a model fit (default=12).
This is passed to the appropriate fitter function with the correct
argument name.  Increase \code{maxit} if you had to when fitting the
original unpenalized model.
}
\item{subset}{
a logical or integer vector specifying rows of the design and response
matrices to subset in fitting models.  This is most useful for
bootstrapping \code{pentrace} to see if the best penalty can be estimated
with little error so that variation due to selecting the optimal
penalty can
be safely ignored when bootstrapping standard errors of regression
coefficients and measures of predictive accuracy.  See an example below.
}
\item{x}{a result from \code{pentrace}}
\item{pch}{used for \code{method="points"}}
\item{add}{
set to \code{TRUE} to add to an existing plot.  In that case, the effective
d.f. plot is not re-drawn, but the AIC/BIC plot is added to.
}
\item{ylim}{
2-vector of y-axis limits for plots other than effective d.f.
}
\item{...}{
other arguments passed to \code{plot}, \code{lines}, or \code{image}
}}
\value{
a list of class \code{"pentrace"}
with elements \code{penalty, df, objective, fit, var.adj, diag, results.all}, and
optionally \code{Coefficients}.
The first 6 elements correspond to the fit that had the best objective
as named in the \code{which} argument, from the sequence of fits tried.
Here \code{fit} is the fit object from \code{fitter} which was a penalized fit,
\code{diag} is the diagonal of the matrix used to compute the effective
d.f., and \code{var.adj} is Gray (1992) Equation 2.9, which is an improved
covariance matrix for the penalized beta. \code{results.all} is a data
frame whose first few variables are the components of \code{penalty} and
whose other columns are \code{df, aic, bic, aic.c}.  \code{results.all} thus
contains a summary of results for all fits attempted.  When
\code{method="optimize"}, only two components are returned: \code{penalty} and
\code{objective}, and the object does not have a class.
}
\author{
Frank Harrell\cr
Department of Biostatistics\cr
Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\references{
Gray RJ: Flexible methods for analyzing survival data using splines,
with applications to breast cancer prognosis.  JASA 87:942--951, 1992.


Hurvich CM, Tsai CL: Regression and time series model selection in small
samples.  Biometrika 76:297--307, 1989.
}
\seealso{
\code{\link{lrm}}, \code{\link{ols}}, \code{\link[Hmisc]{solvet}}, \code{\link{Design.Misc}}, \code{\link{image}}
}
\examples{
n <- 1000    # define sample size
set.seed(17) # so can reproduce the results
age            <- rnorm(n, 50, 10)
blood.pressure <- rnorm(n, 120, 15)
cholesterol    <- rnorm(n, 200, 25)
sex            <- factor(sample(c('female','male'), n,TRUE))
# Specify population model for log odds that Y=1
L <- .4*(sex=='male') + .045*(age-50) +
  (log(cholesterol - 10)-5.2)*(-2*(sex=='female') + 2*(sex=='male'))
# Simulate binary y to have Prob(y=1) = 1/[1+exp(-L)]
y <- ifelse(runif(n) < plogis(L), 1, 0)


f <- lrm(y ~ blood.pressure + sex * (age + rcs(cholesterol,4)),
         x=TRUE, y=TRUE)
p <- pentrace(f, seq(.2,1,by=.05))
plot(p)
p$diag      # may learn something about fractional effective d.f. 
            # for each original parameter
pentrace(f, list(simple=c(0,.2,.4), nonlinear=c(0,.2,.4,.8,1)))


# Bootstrap pentrace 5 times, overlaying the corrected AIC curves from the 5 reps
n <- nrow(f$x)
plot(pentrace(f, seq(.2,1,by=.05)), which='aic.c', 
     col=1, ylim=c(30,120)) #original in black
for(j in 1:5)
  plot(pentrace(f, seq(.2,1,by=.05), subset=sample(n,n,TRUE)), 
       which='aic.c', col=j+1, add=TRUE)


# Find penalty giving optimum corrected AIC.  Initial guess is 1.0
if(!.R.) pentrace(f, 1, method='optimize')


# Find penalty reducing total regression d.f. effectively to 5
if(!.R.) pentrace(f, 1, target.df=5)


# Re-fit with penalty giving best aic.c without differential penalization
f <- update(f, penalty=p$penalty)
effective.df(f)
}
\keyword{models}
\keyword{regression}
\concept{logistic regression model}
\concept{penalized MLE}
\concept{ridge regression}
\concept{shrinkage}

\eof
% $Id: plot.Design.Rd,v 1.2 2004/05/21 19:54:06 harrelfe Exp $
\name{plot.Design}
\alias{plot.Design}
\alias{print.plot.Design}
\alias{perimeter}
\alias{lines.perimeter}
\alias{Legend}
\alias{Legend.default}
\alias{Legend.plot.Design}
\alias{datadensity}
\alias{datadensity.plot.Design}
\title{
Plot Effects of Variables
}
\description{
Plots the effect of one or two predictors on the linear
predictor or X beta scale, or on some transformation of that scale.
The predictor is always plotted in its original coding on the \eqn{x} or \eqn{y}-axis.
\code{perimeter} is a function used to generate the boundary of data to plot
when a 3-d plot is made.  It finds the area where there are sufficient
data to generate believable interaction fits.

\code{Legend} is a generic function for adding legends to an existing graph
according to the specific plot made by \code{plot.Design}.  The specific
\code{Legend} method for \code{plot.Design} is \code{Legend.plot.Design}.  It handles
legends for \code{image} plots.  For other plots with one or more curves,
make legends using the \code{label.curves} parameter.

\code{datadensity} is a function for showing the data density (raw data) on
each curve generated for curve-type plots.  This is a rug plot showing
the location/density of data values for the \eqn{x}-axis variable.  If
there was a second variable specified to \code{plot} that generated separate
curves, the data density specific to each class of points is shown.
This assumes that the second variable was categorical.  The rug plots
are drawn by \code{scat1d}.

To plot effects instead of estimates (e.g., treatment differences as a
function of interacting factors) see \code{contrast.Design} and \code{summary.Design}.
}
\usage{
perimeter(x, y, xinc=diff(range(x))/10, 
          n=10, lowess.=TRUE)

\method{plot}{Design}(x, \dots, xlim, ylim, fun, xlab, ylab, 
     conf.int=.95, conf.type=c('mean','individual'),
     add=FALSE, label.curves=TRUE,
     eye, theta=0, phi=15, perspArgs=NULL,
     lty, col=1, lwd=par('lwd'), lwd.conf=1, pch=1, 
     adj.zero=FALSE, ref.zero=FALSE, adj.subtitle, cex.adj,
     non.slopes, time=NULL, loglog=FALSE, val.lev=FALSE,
     digits=4, log="", perim,
     method=c("persp","contour","image","dotchart","default"),
     sortdot=c('neither','ascending','descending'),
     nlevels=10, name, zlim=range(zmat,na.rm=TRUE),
     vnames=c('labels','names'), abbrev=FALSE)
# Just say plot(fit, ...)

\method{print}{plot.Design}(x, \dots)

\method{lines}{perimeter}(x, \dots)

Legend(object, \dots)
\method{Legend}{plot.Design}(object, x, y, size=c(1,1), horizontal=TRUE,
       nint=50, fun, at, zlab, \dots)

\method{datadensity}{plot.Design}(object, x1, x2, \dots)
}
\arguments{
\item{fit}{
a fit object created with \code{Design()} in effect.  \code{options(datadist="d")}
must have been specified (where \code{d} was created by \code{datadist}), or
it must have been in effect when \code{fit} was created.
}
\item{\dots}{
The first variable in this list is displayed on the \eqn{x}-axis. Specify
\code{x=NA} to use the 
default display range, or any range you choose (e.g. 
\code{seq(0,100,by=2),c(2,3,7,14)}). 
The default list of values for which predictions are made
is taken as the list of unique values of the variable if they number fewer
than 11. For variables with \eqn{>10} unique values, 100 equally spaced
values in the range are used for plotting if the range is not specified.
If there is a second variable listed, and its range is \code{NA} or a single value,
that variable is displayed on the y-axis. If the second variable's range
has fewer than 40 levels, separate curves are generated for each value
of the variable. Otherwise, a three dimensional perspective plot is drawn
using 40 equally-spaced values of \code{y}. Names may be abbreviated.
\code{plot} senses that a variable is not to be displayed by checking if the list of values for
the variable is a scalar instead of a vector. Variables not specified are set to the default
adjustment value \code{limits[2]}, i.e. the median for continuous variables and a reference category for
non-continuous ones.  Due to a bug in S, the first variable mentioned
may not be named \code{x}.  This would cause the general scatterplot function
\code{plot} to be invoked by mistake.
Variables after the first or second specified to \code{plot} define adjustment settings.
For categorical variables, specify the class labels in quotes when specifying variable values.  If the levels of a categorical variable are numeric,
you may omit the quotes.  For variables not described using \code{datadist},
you must specify explicit ranges and adjustment settings for predictors
that were in the model.  Note that you can omit \code{variables} entirely.  In
that case, all non-interaction effects will be plotted automatically as
if you said \code{plot(fit, age=NA); plot(fit, sex=NA); \dots}.  In this case
you have no control over the settings of the variables for the x-axis,
i.e., \code{NA} is always assumed.

For a plot made up of multiple curves, these extra graphical arguments
will be passed to \code{key} from \code{Legend}.  For \code{image} plots, these
arguments are passed to \code{par} and have temporary effect.
For \code{datadensity} these extra arguments are passed along to \code{scat1d}.
}
\item{x}{
first variable of a pair of predictors forming a 3-d plot, to specify
to \code{perim}.  For \code{Legend}, is either a vector of 1 or 2
\eqn{x}-coordinates or a list with elements \code{x} and \code{y} each
with 1 or 2 coordinates. For \code{method="image"} plots, 1 or 2
coordinates may be given, and for other plot types, 1 coordinate is
given.  A single coordinate represents the upper left corner of the
legend box.  For \code{Legend}, \code{x} and \code{y} are optional.  If
omitted, \code{locator} is used to position legends with the mouse.
For \code{lines.perimeter}, \code{x} is the result of \code{perimeter}.
For \code{print.plot.Design}, \code{x} is the result of
\code{plot.Design}.
}
\item{y}{
second variable of the pair for \code{perim}, or \eqn{y}-coordinates for
\code{Legend}.  If omitted, \code{x} is assumed to be a list with both
\code{x} and \code{y} components.
}
\item{xinc}{
increment in \code{x} over which to examine the density of \code{y} in \code{perimeter}
}
\item{n}{
within intervals of \code{x} for \code{perimeter}, takes the informative range of \code{y} to be
the \eqn{n}th smallest to the \eqn{n}th largest values of \code{y}.  If there aren't
at least 2\eqn{n} \code{y} values in the \code{x} interval, no \code{y} ranges are used
for that interval.
}
\item{lowess.}{
set to \code{FALSE} to not have \code{lowess} smooth the data perimeters
}
\item{xlim}{
This parameter is seldom used, as limits are usually controlled with the
\code{variables} specifications.  One reason to use \code{xlim} is to plot a
\code{factor} variable on the x-axis that was created with the \code{cut2} function
with the \code{levels.mean} option, with \code{val.lev=TRUE} specified to \code{plot.Design}. 
In this case you may want the axis to
have the range of the original variable values given to \code{cut2} rather
than the range of the means within quantile groups.
}
\item{ylim}{
Range for plotting on response variable axis. Computed by default.
}
\item{fun}{
Function used to transform \eqn{X\beta}{X beta} and its confidence interval before plotting.
For example, to transform from a logit to a probability scale, use
\code{fun=function(x)1/(1+exp(-x))} or \code{fun=plogis}, 
and to take the anti-log, specify \code{fun=exp}.
For \code{Legend}, \code{fun} is
a function for transforming tick mark labels for color or gray scale
legends for \code{method="image"}.   For example, if \code{plot.Design} is used
to make an image plot of log odds ratios, specifying \code{fun=plogis} will
cause the color legend to be labeled with probability values rather
than log odds.
}
\item{xlab}{
Label for \code{x}-axis. Default is one given to \code{asis, rcs}, etc., which may have been
the \code{"label"} attribute of the variable.
}
\item{ylab}{
Label for \code{y}-axis (\code{z}-axis if perspective plot). If \code{fun} is not given,
default is \code{"log Odds"} for
\code{lrm}, \code{"log Relative Hazard"} for \code{cph}, name of the response
variable for \code{ols}, \code{"T"} or \code{"log(T)"} for \code{psm}, or \code{"X * Beta"} otherwise.
If \code{fun} is given, the default is \code{""}.
If \code{time} is given, the default is
\code{"(time) (units) Survival Probability"} or
\code{"log[-log S(time)]"} depending on the \code{loglog} parameter.
}
\item{conf.int}{
Default is \code{.95}.  Specify \code{FALSE} to suppress confidence bands.
}
\item{conf.type}{
specifies the type of confidence interval.  Default is for the mean.
For \code{ols} fits there is the option of obtaining confidence limits for
individual predicted values by specifying \code{conf.type="individual"}.
}
\item{add}{
Set to \code{TRUE} to add to an existing plot without drawing new axes. Default is \code{FALSE}.
See the warning note under \code{sortdot}.
}
\item{label.curves}{
Set to \code{FALSE} to suppress labeling of separate curves.
Default is \code{TRUE}, which
causes \code{labcurve} to be invoked to place labels at positions where the
curves are most separated, labeling each curve with the full curve label.
Set \code{label.curves} to a \code{list} to specify options to
\code{labcurve}, e.g., \code{label.curves=} \code{list(method="arrow", cex=.8)}.
These option names may be abbreviated in the usual way arguments
are abbreviated.  Use for example \code{label.curves=list(keys=letters[1:5])}
to draw single lower case letters on 5 curves where they are most
separated, and automatically position a legend
in the most empty part of the plot.  The \code{col}, \code{lty}, and \code{lwd} parameters
are passed automatically to \code{labcurve} although they may be overridden
here.
}
\item{eye}{
Argument to S \code{persp} function for defining perspective in 3-d plots.
Default is \code{(-6, -6, 9)}.  This is for S-Plus only.
}
\item{theta}{}
\item{phi}{}
\item{perspArgs}{a list containing other named arguments to be passed to
  \code{persp}}
\item{lty}{
Vector of line types to use in plotting separate curves. Default is 1,2, \dots
}
\item{lwd}{
Vector of line widths corresponding to separate curves, 
default is \code{par("lwd")}.
}
\item{lwd.conf}{
scalar width of lines for confidence bands.  Default is 1.
}
\item{pch}{
symbol to use when plotting unconnected points when a categorical
variable is on the x-axis or when \code{method="dotchart"}.  Default is 1
(open circle).  See \code{points} for other values, or use the \code{show.pch} function in Hmisc.
}
\item{col}{
S color number for displaying curves.  Default is \code{1} (black).  Specify
a vector of integers to assign different colors to different curves.
}
\item{adj.subtitle}{
Set to \code{FALSE} to suppress subtitling the graph with the list of settings of non-graphed adjustment values. Default is \code{TRUE} if there are 6 or fewer non-plotted factors.
}
\item{cex.adj}{
\code{cex} parameter for size of adjustment settings in subtitles.  Default is
0.75 times \code{par("cex")}.
}
\item{adj.zero}{
Set to \code{TRUE} to adjust all non-plotted variables to 0 (or reference cell for
categorical variables) and to omit intercept(s) from consideration. Default
is \code{FALSE}.
}
\item{ref.zero}{
Subtract a constant from \eqn{X\beta}{X beta} before plotting so that
the reference value of the \code{x}-variable yields \code{y=0}.  This is
done before applying function \code{fun}.
}
\item{non.slopes}{
This is only useful in a multiple intercept model such as the ordinal
logistic model. There, to use the second of three intercepts, for example,
specify \code{non.slopes=c(0,1,0)}. The default is \code{non.slopes=rep(0,k)}
if \code{adj.zero=TRUE}, where \code{k} is the number of intercepts in the model.
If \code{adj.zero=FALSE}, the default is \code{(1,0,0,\dots,0)}.
}
\item{time}{
Specify a single time \code{u} to cause function \code{survest} to be invoked
to plot the probability of surviving until time \code{u} when the fit
is from \code{cph} or \code{psm}.
}
\item{loglog}{
Specify \code{loglog=TRUE} to plot \code{log[-log(survival)]} instead of survival,
when \code{time} is given.
}
\item{val.lev}{
When plotting a categorical or strata factor with category labels that
are strings of legal numeric values, set to \code{TRUE} to use these values in
plotting.  An ordinary axis with uniform spacing will be used rather than
spacing dictated by the value labels.  When \code{val.lev=FALSE}, category
labels dictate how axis tick marks are made.  \code{val.lev} is used
typically when the variable being plotted is a categorical variable
that was collapsed into intervals, with the value label for a category
representing interval means or midpoints.  Such variables are created
for example by the \code{cut2} function, specifying \code{levels.mean=TRUE}.  For
plotting a discrete numeric variable you can specify \code{val.lev=TRUE} to
force plotting of the variable as if it were continuous.
}
\item{digits}{
Controls how ``adjust-to'' values are plotted.  The default is 4 significant
digits.
}
\item{log}{
Set \code{log="x", "y"} or \code{"xy"} to plot log scales on one or both axes.
}
\item{perim}{
names a matrix created by \code{perimeter} when used for 3-d plots of
two continuous predictors.  When the combination of variables is outside
the range in \code{perim}, that section of the plot is suppressed.  If \code{perim}
is omitted, 3-d plotting will use the marginal distributions of the
two predictors to determine the plotting region, when the grid is
not specified explicitly in \code{variables}.  When instead a series of
curves is being plotted, \code{perim} specifies a function having two
arguments.  The first is the vector of values of the first variable that
is about to be plotted on the x-axis.  The second argument is the single
value of the variable representing different curves, for the current
curve being plotted.  The function's returned value must be a logical
vector whose length is the same as that of the first argument, with
values \code{TRUE} if the corresponding point should be plotted for the
current curve, \code{FALSE} otherwise.  See one of the later examples.
}
\item{method}{
For 3-d plots, use \code{method="persp"} for perspective plots (\code{persp()}, 
the default), \code{method="contour"} to use \code{contour()}, or \code{method="image"}
to use \code{image()}. Specify \code{method="dotchart"} to make a horizontal dot
chart to represent predicted values associated with categorical predictors.
The \code{log} argument does not apply to these plot types.
You can specify \code{method="default"} to get the default behaviour.
For \code{"dotchart"}, the \code{dotchart2} function in the Hmisc library is used.
}
\item{sortdot}{
applies when \code{method="dotchart"}.  The default is to plot the points in
the order requested for predictions.  Specify \code{sortdot="ascending"} or
an abbreviation such as \code{sortdot="a"} to sort in ascending order before
plotting the dot chart.  You may also specify \code{sortdot="descending"}.
Unless \code{sortdot="neither"}, specifying \code{add=TRUE} may not work properly.
}
\item{nlevels}{
number of contour levels if \code{method="contour"}
}
\item{name}{
Instead of specifying the variable to plot on the x-axis in the
\code{variables} list, you can specify one or more variables to plot by
specifying a vector of character string variable names in the
\code{name} argument.  Using this mode you cannot specify a list of
variable values to use; plotting is done as if you had said e.g.
\code{age=NA}.  Also, interacting factors can only be set to their reference values
using this notation.
}
\item{zlim}{
If \code{method="persp"}, controls the range for plotting on the
z-axis. Computed by default.
}
\item{vnames}{
applies when no x-variable is specified (i.e., when all predictors are
being plotted).  To override the default x-axis label in that case
(variable \code{"label"} attributes) to instead use variable names, specify
\code{vnames="names"}.
}
\item{object}{an object created by \code{plot.Design}}
\item{abbrev}{
Set to \code{TRUE} to use the \code{abbreviate} function to abbreviate levels of
categorical factors for labeling tick marks on the x-axis.
}
\item{size}{
}
\item{horizontal}{
}
\item{nint}{
see \code{image.legend}
}
\item{at}{
If \code{fun} is specified to \code{Legend}, \code{at} may be given.  \code{at} is a vector
of values at which to evaluate \code{fun} for drawing tick marks in the
color legend.  For example, if you want to show the median survival time
for a log-normal survival model whereas the linear predictor (log median)
was used in constructing the image plot, and if you want to place tick
marks at nice median values, specify \code{fun=exp, at=log(c(1,10,100,1000))}.
}
\item{zlab}{
label for \code{image} color axis legend.  Default is from the model
(e.g., \code{"Log Odds"}), but \code{zlab} will often be specified if
\code{fun} was specified to \code{plot.Design} or \code{Legend}.
}
\item{x1}{
data vector for first variable in \code{plot} (\code{x}-axis variable)
}
\item{x2}{
data vector for second variable in \code{plot} if it was not constant
(curve-generating variable)
}
}
\value{
\code{perimeter} returns a matrix of class \code{perimeter}.  This outline can be
conveniently plotted by \code{lines.perimeter}.
\code{Legend.plot.Design} invisibly returns the position of the legend.
\code{plot.Design} invisibly returns an object of class \code{"plot.Design"}
with the following components (use \code{print.plot.Design}
to get a nice printout of the object):

\item{x.xbeta}{
data frame of values plotted. First column is sequence of \code{x}-values. If a
second variable was plotted, second column is sequence of \code{y}-values.
Next column is estimated \eqn{X\beta}{X beta}, followed by a column of lower confidence
limits and upper confidence limits. If fun was specified, these last three
columns are transformed by the specified function.
}
\item{adjust}{
character string of the form \code{"sex=male age=50"} containing settings of
non-plotted factors.
}
\item{curve.labels}{
character vector containing values of \code{y}-variable if it determined different
curves on the plot. E.g., \code{c("female","male")} or \code{c("10","20","30")}.
This vector is useful as an argument to the S \code{key} function if
\code{label.curves=FALSE}.
}
\item{plot.type}{
\code{"curves"} or \code{"3d"}
}
\item{method}{
from \code{plot.Design} call
}
\item{lty}{
vector of line types used for curves (if the plot used a few curves to
represent a second variable plotted)
}
\item{lwd}{
vector of line widths used
}
\item{col}{
vector of color codes
}}
\details{
When there are no intercepts in the fitted model, plot subtracts adjustment values from
each factor while computing variances for confidence limits.


\code{perimeter} is a kind of generalization of \code{datadist} for 2 continuous
variables.  First, the \code{n} smallest and largest \code{x} values are determined.
These form the lowest and highest possible \code{x}s to display.  Then \code{x}
is grouped into intervals bounded by these two numbers, with the interval
widths defined by \code{xinc}.  Within each interval, \code{y} is sorted and the
\eqn{n}th smallest and largest \code{y} are taken as the interval containing
sufficient data density to plot interaction surfaces.  The interval
is ignored when there are insufficient \code{y} values.  When \code{plot.Design}
readies the data for \code{persp}, it uses the \code{approx} function to do
linear interpolation of the \code{y}-boundaries as a function of the
\code{x} values actually used in forming the grid (the values of the
first variable specified to \code{plot}).  To make the perimeter smooth,
specify \code{lowess.=TRUE} to \code{perimeter}.


Specifying \code{time} will not work for Cox models with time-dependent
covariables.  Use \code{survest} or \code{survfit} for that purpose.


Use \code{ps.slide}, \code{win.slide}, \code{gs.slide} to set up nice defaults for
plotting.  These also set a system option \code{mgp.axis.labels} to allow x
and y-axes to have differing \code{mgp} graphical parameters (see \code{par}).
This is important when labels for y-axis tick marks are to be written
horizontally (\code{par(las=1)}), as a larger gap between the labels and
the tick marks is needed.  You can set the axis-specific 2nd
components of \code{mgp} using \code{mgp.axis.labels(c(xvalue,yvalue))}.

Note that because the generic \code{plot} method has the variable
\code{x} as its first argument, you cannot explicitly specify that you
want to plot the effect of a predictor named \code{x}.
}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{datadist}}, \code{\link{predict.Design}}, \code{\link{contrast.Design}}, \code{\link{summary.Design}}, 
\code{\link{persp}}, \code{\link{Design}}, 
\code{\link{Design.trans}}, \code{\link{survest}}, \code{\link{survplot}}, \code{\link{Design.Misc}}, 
\code{\link{contour}}, \code{\link{image}}, \code{\link[Hmisc]{labcurve}}, \code{\link[Hmisc]{scat1d}}, \code{\link[Hmisc]{dotchart2}}, 
\code{\link[Hmisc]{mgp.axis.labels}}, \code{\link[Hmisc]{Overview}}, \code{\link{par}},
\code{\link[Hmisc]{ps.slide}}, \code{\link[Hmisc]{xYplot}}, \code{\link[Hmisc]{smearingEst}}
}
\examples{
n <- 1000    # define sample size
set.seed(17) # so can reproduce the results
age            <- rnorm(n, 50, 10)
blood.pressure <- rnorm(n, 120, 15)
cholesterol    <- rnorm(n, 200, 25)
sex            <- factor(sample(c('female','male'), n,TRUE))
label(age)            <- 'Age'      # label is in Hmisc
label(cholesterol)    <- 'Total Cholesterol'
label(blood.pressure) <- 'Systolic Blood Pressure'
label(sex)            <- 'Sex'
units(cholesterol)    <- 'mg/dl'   # uses units.default in Hmisc
units(blood.pressure) <- 'mmHg'


# Specify population model for log odds that Y=1
L <- .4*(sex=='male') + .045*(age-50) +
  (log(cholesterol - 10)-5.2)*(-2*(sex=='female') + 2*(sex=='male'))
# Simulate binary y to have Prob(y=1) = 1/[1+exp(-L)]
y <- ifelse(runif(n) < plogis(L), 1, 0)


ddist <- datadist(age, blood.pressure, cholesterol, sex)
options(datadist='ddist')


fit <- lrm(y ~ blood.pressure + sex * (age + rcs(cholesterol,4)),
               x=TRUE, y=TRUE)


par(mfrow=c(2,2))
plot(fit)                # Plot effects of all 4 predictors
par(mfrow=c(1,2))
plot(fit, name=c('age','cholesterol'))   # Make 2 plots
par(mfrow=c(1,1))
plot(fit, age=seq(20,80,length=100), sex=NA, conf.int=FALSE)
                         # Plot relationship between age and log
                         # odds, separate curve for each sex,
                         # no C.I.
z <- plot(fit, age=NA, sex=NA, label.curves=FALSE)
                         # use label.curves=list(keys=c('a','b'))
                         # to use 1-letter abbreviations
datadensity(z, age, sex) # rug plots (1-dimensional scatterplots)
                         # on each treatment curve, with treatment-
                         # specific density of age
plot(fit, age=seq(20,80,length=100), sex='male')  # works if datadist not used
plot(fit, age=NA, cholesterol=NA)# 3-dimensional perspective plot for age,
                         # cholesterol, and log odds using default
                         # ranges for both variables
boundaries <- perimeter(age, cholesterol, lowess=TRUE)
plot(age, cholesterol)   # show bivariate data density
lines(boundaries)        # and perimeter that will be used for 3-D plot
z <- plot(fit, age=NA, cholesterol=NA, perim=boundaries, method='image')
                         # draws image() plot
                         # don't show estimates where data are sparse
                         # doesn't make sense here since vars don't interact
if(!.R.)Legend(z, fun=plogis, at=qlogis(c(.01,.05,.1,.2,.3,.4,.5)),
               zlab='Probability')   # gray scale or color legend for prob.
plot(fit, age=NA, fun=function(x) 1/(1+exp(-x)) , # or fun=plogis
     ylab="Prob", conf.int=.9)    # Plot estimated probabilities instead of
                                  # log odds


# Plot the age effect as an odds ratio
# comparing the age shown on the x-axis to age=30 years


ddist$limits$age[2] <- 30    # make 30 the reference value for age
# Could also do: ddist$limits["Adjust to","age"] <- 30
fit <- update(fit)   # make new reference value take effect
plot(fit, age=NA, ref.zero=TRUE, fun=exp, ylab='Age=x:Age=30 Odds Ratio')
abline(h=1, lty=2, col=2); abline(v=30, lty=2, col=2)


# Make two curves, and plot the predicted curves as two trellis panels
w <- plot(fit, age=NA, sex=NA)   # Would be nice if a pl=FALSE option was avail.
z <- data.frame(w$x.xbeta)     # Makes variable names legal
if(.R.) library(lattice)
xyplot(log.odds ~ age | sex, data=z, type='l')
# To add confidence bands we need to use the Hmisc xYplot function in
# place of xyplot
xYplot(Cbind(log.odds,lower,upper) ~ age | sex, data=z, 
       method='bands', type='l')
# If non-displayed variables were in the model, add a subtitle to show
# their settings using title(sub=paste('Adjusted to',w$adjust),adj=0)
# See predict.Design for an example using predict and xYplot without plot()




# Plots for a parametric survival model
n <- 1000
set.seed(731)
age <- 50 + 12*rnorm(n)
label(age) <- "Age"
sex <- factor(sample(c('Male','Female'), n, 
              rep=TRUE, prob=c(.6, .4)))
cens <- 15*runif(n)
h <- .02*exp(.04*(age-50)+.8*(sex=='Female'))
t <- -log(runif(n))/h
label(t) <- 'Follow-up Time'
e <- ifelse(t<=cens,1,0)
t <- pmin(t, cens)
units(t) <- "Year"
ddist <- datadist(age, sex)
Srv <- Surv(t,e)


# Fit log-normal survival model and plot median survival time vs. age
f <- psm(Surv(t, e) ~ rcs(age), dist=if(.R.)'lognormal' else 'gaussian')
med <- Quantile(f)       # Creates function to compute quantiles
                         # (median by default)
plot(f, age=NA, fun=function(x)med(lp=x), ylab="Median Survival Time")
# Note: This works because med() expects the linear predictor (X*beta)
#       as an argument.  Would not work if use 
#       plot(\dots, ref.zero=TRUE or adj.zero=TRUE)
# Also, confidence intervals from this method are approximate since
# they don't take into account estimation of scale parameter


# Fit an ols model to log(y) and plot the relationship between x1
# and the predicted mean(y) on the original scale without assuming
# normality of residuals; use the smearing estimator
set.seed(1)
x1 <- runif(300)
x2 <- runif(300)
ddist <- datadist(x1,x2)
y  <- exp(x1+x2-1+rnorm(300))
f <- ols(log(y) ~ pol(x1,2)+x2)
r <- resid(f)
smean <- function(yhat)smearingEst(yhat, exp, res, statistic='mean')
formals(smean) <- list(yhat=numeric(0), res=r[!is.na(r)])
#smean$res <- r[!is.na(r)]   # define default res argument to function
plot(f, x1=NA, fun=smean, ylab='Predicted Mean on y-scale')


options(datadist=NULL)


\dontrun{
# Example in which separate curves are shown for 4 income values
# For each curve the estimated percentage of voters voting for
# the democratic party is plotted against the percent of voters
# who graduated from college.  scat1d is used to indicate
# the income-interval-specific data density for college.  For
# this purpose show the distribution of percent in college for
# those having an income level within +/- the half-width of
# the income interval.  scat1d shows the rug plot superimposed
# on the estimated curve.  Data are county-level percents.
# This can't be done automatically using datadensity on the object
# returned by plot.Design, as the variable representing different
# curves (income) is a continuous variable.


incomes <- seq(22900, 32800, length=4)  
# equally spaced to outer quintiles
pl <- plot(f, college=NA, income=incomes, 
           conf.int=FALSE, xlim=c(0,35), ylim=c(30,55),
           lty=1, lwd=c(.25,1.5,3.5,6), col=c(1,1,2,2))
graph.points <- pl$x.xbeta
for(i in 1:4) {
  college.in.income.group <- college[abs(income-incomes[i]) < 1650]
  this.income <- graph.points[,'income']==incomes[i]
  scat1d(college.in.income.group,
         curve=list(x=graph.points[this.income,'college'],
           y=graph.points[this.income,'democrat']))
}


# Instead of showing a rug plot on each curve, erase end portions
# of each curve where there are fewer than 10 counties having
# \% college graduates to the left of the x-coordinate being plotted,
# for the subset of counties having median family income within 1650
# of the target income for the curve


show.pts <- function(college.pts, income.pt) {
  s <- abs(income - income.pt) < 1650  #assumes income known to top frame
  x <- college[s]
  x <- sort(x[!is.na(x)])
  n <- length(x)
  low <- x[10]; high <- x[n-9]
  college.pts >= low & college.pts <= high
}


plot(f, college=NA, income=incomes,
     conf.int=FALSE, xlim=c(0,35), ylim=c(30,55),
     lty=1, lwd=c(.25,1.5,3.5,6), col=c(1,1,2,2),
     perim=show.pts)
}
}
\keyword{models}
\keyword{hplot}
\keyword{htest}
% Converted by Sd2Rd version 1.21.



\eof
\name{plot.xmean.ordinaly}
\alias{plot.xmean.ordinaly}
\title{
Plot Mean X vs. Ordinal Y
}
\description{
Separately for each predictor variable \eqn{X} in a formula, plots the mean of
\eqn{X} vs. levels of \eqn{Y}.  Then under the proportional odds assumption,
the expected value of the predictor for each \eqn{Y} value is also plotted (as
a dotted line).  This plot is useful for assessing the ordinality assumption 
for \eqn{Y} separately for each \eqn{X}, and for assessing the proportional odds
assumption in a simple univariable way.  If several predictors do not
distinguish adjacent categories of \eqn{Y}, those levels may need to be 
pooled.  This display assumes
that each predictor is linearly related to the log odds of each event in
the proportional odds model.  There is also an option to plot the
expected means assuming a forward continuation ratio model.
}
\usage{
plot.xmean.ordinaly(x, data, subset, na.action, subn=TRUE, cr=FALSE, \dots)
}
\arguments{
\item{x}{
an S formula.  Response variable is treated as ordinal.  Predictor
variables must be binary or continuous.  Interactions or non-linear
effects are not allowed.
}
\item{data}{
a data frame or frame number
}
\item{subset}{
vector of subscripts or logical vector describing subset of data to
analyze
}
\item{na.action}{
defaults to \code{na.keep} so all NAs are initially retained.  Then NAs
are deleted only for each predictor currently being plotted.
Specify \code{na.action=na.delete} to remove observations that are missing
on any of the predictors (or the response).
}
\item{subn}{
set to \code{FALSE} to suppress a left bottom subtitle specifying the sample size
used in constructing each plot
}
\item{cr}{
set to \code{TRUE} to plot expected values by levels of the response, assuming a
forward continuation ratio model holds.  The function is fairly slow
when this option is specified.
}
\item{\dots}{
other arguments passed to \code{plot} and \code{lines}
}}
\section{Side Effects}{
plots
}
\author{
Frank Harrell\cr
Department of Biostatistics\cr
Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\references{
Harrell FE et al. (1998): Development of a clinical prediction model for
an ordinal outcome. Stat in Med 17:909--44.
}
\seealso{
\code{\link{lrm}}, \code{\link{residuals.lrm}}, \code{\link{cr.setup}}, \code{\link[Hmisc]{cumcategory}}
}
\examples{
# Simulate data from a population proportional odds model
set.seed(1)
n <- 400
age <- rnorm(n, 50, 10)
blood.pressure <- rnorm(n, 120, 15)
L <- .2*(age-50) + .1*(blood.pressure-120)
p12 <- plogis(L)    # Pr(Y>=1)
p2  <- plogis(L-1)  # Pr(Y=2)
p   <- cbind(1-p12, p12-p2, p2)   # individual class probabilities
# Cumulative probabilities:
cp  <- matrix(cumsum(t(p)) - rep(0:(n-1), rep(3,n)), byrow=TRUE, ncol=3)
y   <- (cp < runif(n)) \%*\% rep(1,3)
# Thanks to Dave Krantz <dhk@paradox.psych.columbia.edu> for this trick


par(mfrow=c(1,2))
plot.xmean.ordinaly(y ~ age + blood.pressure, cr=TRUE)
par(mfrow=c(1,1))
}
\keyword{category}
\keyword{models}
\keyword{regression}
\keyword{hplot}
\concept{model validation}
\concept{logistic regression model}

\eof
\name{pphsm}
\alias{pphsm}
\alias{print.pphsm}
\title{
Parametric Proportional Hazards form of AFT Models
}
\description{
Translates an accelerated failure time (AFT) model fitted by
\code{psm} to proportional hazards form, if the fitted model was
a Weibull or exponential model (extreme value distribution with
"log" link).
}
\usage{
pphsm(fit)
\method{print}{pphsm}(x, correlation=TRUE, \dots)
}
\arguments{
  \item{fit}{fit object created by \code{psm}}
  \item{x}{result of \code{pphsm}}
  \item{correlation}{set to \code{FALSE} to suppress printing of
	correlation matrix of parameter estimates}
\item{\dots}{ignored}
}
\value{
a new fit object with transformed parameter estimates
}
\author{
Frank Harrell\cr
Department of Biostatistics\cr
Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{psm}}, \code{\link{summary.Design}}, \code{\link{print.pphsm}}
}
\examples{
set.seed(1)
S <- Surv(runif(100))
x <- runif(100)
dd <- datadist(x); options(datadist='dd')
f <- psm(S ~ x, dist="exponential")
summary(f)        # effects on log(T) scale
f.ph <- pphsm(f)
summary(f.ph)     # effects on hazard ratio scale
options(datadist=NULL)
}
\keyword{models}
\keyword{survival}
\keyword{regression}
% Converted by Sd2Rd version 1.21.

\eof
\name{predab.resample}
\alias{predab.resample}
\title{
Predictive Ability using Resampling
}
\description{
\code{predab.resample} is a general-purpose
function that is used by functions for specific models.
It computes estimates of optimism of, and bias-corrected estimates of a vector
of indexes of predictive accuracy, for a model with a specified
design matrix, with or without fast backward step-down of predictors. If \code{bw=TRUE}, the design
matrix \code{x} must have been created by \code{ols}, \code{lrm}, or \code{cph}.
If \code{bw=TRUE}, \code{predab.resample} prints a matrix of asterisks showing which
factors were selected at each repetition, along with a frequency distribution
of the number of factors retained across re-samples.
}
\usage{
predab.resample(fit.orig, fit, measure, 
                method=c("boot","crossvalidation",".632","randomization"),
                bw=FALSE, B=50, pr=FALSE,
                rule="aic", type="residual", sls=.05, aics=0,
                strata=FALSE, tol=1e-12, non.slopes.in.x=TRUE, kint=1,
                cluster, subset, group=NULL, \dots)
}
\arguments{
\item{fit.orig}{
object containing the original full-sample fit, with the \code{x=TRUE} and
\code{y=TRUE} options specified to the model fitting function.  This model
should be the FULL model including all candidate variables ever excluded
because of poor associations with the response.
}
\item{fit}{
a function to fit the model, either the original model fit, or a fit in a
sample.  fit has as arguments \code{x},\code{y}, \code{iter}, \code{penalty}, \code{penalty.matrix},
\code{xcol}, and other arguments passed to \code{predab.resample}. 
If you don't want \code{iter}
as an argument inside the definition of \code{fit}, add \dots to the end of its
argument list. \code{iter} is passed to \code{fit} to inform the function of the
sampling repetition number (0=original sample).  If \code{bw=TRUE}, \code{fit} should
allow for the possibility of selecting no predictors, i.e., it should fit an
intercept-only model if the model has intercept(s). \code{fit} must return
objects \code{coef} and \code{fail} (\code{fail=TRUE} if \code{fit} failed due to singularity or
non-convergence - these cases are excluded from summary statistics). \code{fit}
must add design attributes to the returned object if \code{bw=TRUE}.  
The \code{penalty.matrix} parameter is not used if \code{penalty=0}.  The \code{xcol}
vector is a vector of columns of \code{X} to be used in the current model fit.
For \code{ols} and \code{psm} it includes a \code{1} for the intercept position.
\code{xcol} is not defined if \code{iter=0} unless the initial fit had been from
a backward step-down.  \code{xcol} is used to select the correct rows and columns
of \code{penalty.matrix} for the current variables selected, for example.
}
\item{measure}{
a function to compute a vector of indexes of predictive accuracy for a given fit.
For \code{method=".632"} or \code{method="crossval"}, it will make the most sense for
measure to compute only indexes that are independent of sample size. The
measure function should take the following arguments or use \dots: \code{xbeta} 
(X beta for
current fit), \code{y}, \code{evalfit}, \code{fit}, \code{iter}, and \code{fit.orig}. \code{iter} is as in \code{fit}.
\code{evalfit} is set to \code{TRUE}
by \code{predab.resample} if the fit is being evaluated on the sample used to make the
fit, \code{FALSE} otherwise; \code{fit.orig} is the fit object returned by the original fit on the whole
sample. Using \code{evalfit} will sometimes save computations. For example, in
bootstrapping the area under an ROC curve for a logistic regression model,
\code{lrm} already computes the area if the fit is on the training sample. 
\code{fit.orig}
is used to pass computed configuration parameters from the original fit such as
quantiles of predicted probabilities that are used as cut points in other samples.
The vector created by measure should have \code{names()} associated with it.
}
\item{method}{
The default is \code{"boot"} for ordinary bootstrapping (Efron, 1983, Eq. 2.10).  
Use \code{".632"} for Efron's \code{.632} method (Efron, 1983, Section 6 and Eq. 6.10),
\code{"crossvalidation"}
for grouped cross-validation, \code{"randomization"} for the randomization method. May
be abbreviated down to any level, e.g. \code{"b"}, \code{"."}, \code{"cross"}, \code{"rand"}.
}
\item{bw}{
Set to \code{TRUE} to do fast backward step-down for each training sample. Default is \code{FALSE}.
}
\item{B}{
Number of repetitions, default=50. For \code{method="crossvalidation"}, this is also
the number of groups the original sample is split into.
}
\item{pr}{
\code{TRUE} to print results for each sample. Default is \code{FALSE}.
}
\item{rule}{
Stopping rule for fastbw, \code{"aic"} or \code{"p"}. Default is \code{"aic"} to use Akaike's
information criterion.
}
\item{type}{
Type of statistic to use in stopping rule for fastbw, \code{"residual"} (the default) or
\code{"individual"}.
}
\item{sls}{
Significance level for stopping in fastbw if \code{rule="p"}. Default is \code{.05}.
}
\item{aics}{
Stopping criteria for \code{rule="aic"}. Stops deleting factors when
chi-square - 2 times d.f. falls below \code{aics}. Default is \code{0}.
}
\item{strata}{set to \code{TRUE} if \code{fit.orig} has an \code{x}
  element that contains a \code{"strata"} attribute which is a vector
  that should be sampled the same way as the observations in \code{x}
  and \code{y}}
\item{tol}{
Tolerance for singularity checking.  Is passed to \code{fit} and \code{fastbw}.
}
\item{non.slopes.in.x}{set to \code{FALSE} if the design matrix \code{x}
does not have columns for intercepts and these columns are needed}
\item{kint}{
For multiple intercept models such as the ordinal logistic model, you may
specify which intercept to use as \code{kint}.  This affects the linear
predictor that is passed to \code{measure}.
}
\item{cluster}{
Vector containing cluster identifiers.  This can be specified only if
\code{method="boot"}.  If it is present, the bootstrap is done using sampling
with replacement from the clusters rather than from the original records.
If this vector is not the same length as the number of rows in the data
matrix used in the fit, an attempt will be made to use \code{naresid} on 
\code{fit.orig} to conform \code{cluster} to the data.  
See \code{bootcov} for more about this.
}
\item{subset}{
specify a vector of positive or negative integers or a logical vector when
you want to have the \code{measure} function compute measures of accuracy on
a subset of the data.  The whole dataset is still used for all model development.
For example, you may want to \code{validate} or \code{calibrate} a model by
assessing the predictions on females when the fit was based on males and
females.  When you use \code{cr.setup} to build extra observations for fitting the
continuation ratio ordinal logistic model, you can use \code{subset} to specify
which \code{cohort} or observations to use for deriving indexes of predictive
accuracy.  For example, specify \code{subset=cohort=="all"} to validate the
model for the first layer of the continuation ratio model (Prob(Y=0)).
}
\item{group}{
a grouping variable used to stratify the sample upon bootstrapping.
This allows one to handle k-sample problems, i.e., each bootstrap
sample will be forced to select the same number of observations from
each level of group as the number appearing in the original dataset.
}
\item{\dots}{
The user may add other arguments here that are passed to \code{fit} and
\code{measure}.
}}
\value{
a matrix with rows corresponding
to indexes computed by \code{measure}, and the following columns:

\item{index.orig}{
indexes in original overall fit
}
\item{training}{
average indexes in training samples
}
\item{test}{
average indexes in test samples
}
\item{optimism}{
average \code{training-test} except for \code{method=".632"} - is .632 times
\code{(index.orig - test)}
}
\item{index.corrected}{
\code{index.orig-optimism}
}
\item{n}{
number of successful repetitions with the given index non-missing
}}
\section{Side Effects}{
prints a summary of the results
}
\details{
For \code{method=".632"}, the program stops with an error unless every observation
is omitted at least once from a bootstrap sample.  Efron's ".632" method
was developed for measures that are formulated in terms of per-observation
contributions.  In general, error measures (e.g., ROC areas) cannot be
written in this way, so this function uses a heuristic extension to
Efron's formulation in which it is assumed that the average error measure
omitting the \code{i}th observation is the same as the average error measure
omitting any other observation.  Then weights are derived
for each bootstrap repetition and weighted averages over the \code{B} repetitions
can easily be computed.
}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\references{
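Efron B (1983). Estimating the error rate of a prediction rule: Improvement on cross-validation.  JASA 78:316--331.

Efron B, Tibshirani R (1997). Improvements on cross-validation: The .632+ bootstrap method.  JASA 92:548--560.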
}
\seealso{
\code{\link{Design}}, \code{\link{validate}}, \code{\link{fastbw}}, \code{\link{lrm}}, \code{\link{ols}}, \code{\link{cph}}, \code{\link{bootcov}}, \code{\link[Hmisc]{naresid}}
}
\examples{
# See the code for validate.ols for an example of the use of
# predab.resample
}
\keyword{models}
\concept{model validation}
\concept{bootstrap}
\concept{predictive accuracy}

\eof
\name{predict.Design}
\alias{predict.Design}
\alias{predictDesign}
\alias{predict.bj}
\alias{predict.cph}
\alias{predict.glmD}
\alias{predict.glsD}
\alias{predict.ols}
\alias{predict.psm}

\title{Predicted Values from Model Fit}
\description{
The \code{predict} function is used to obtain a variety of values or
predicted values from either the data used to fit the model (if
\code{type="adjto"} or \code{"adjto.data.frame"} or if \code{x=TRUE} or
\code{linear.predictors=TRUE} were specified to the modeling function), or from
a new dataset. Parameters such as knots and factor levels used in creating 
the design
matrix in the original fit are "remembered".
See the \code{Function} function for another method for computing the
linear predictors.
}
\usage{
\method{predict}{bj}(object, newdata,
        type=c("lp", "x", "data.frame",
                 "terms", "adjto", "adjto.data.frame", "model.frame"),
        se.fit=FALSE, conf.int=FALSE, conf.type=c('mean','individual'),
        incl.non.slopes,
        non.slopes, kint=1, na.action=na.keep, expand.na=TRUE,
        center.terms=TRUE, \dots) # for bj

\method{predict}{cph}(object, newdata,
        type=c("lp", "x",
                 "data.frame", "terms", "adjto", "adjto.data.frame",
                 "model.frame"),
        se.fit=FALSE, conf.int=FALSE, conf.type=c('mean','individual'),
        incl.non.slopes=NULL,
        non.slopes=NULL, kint=1, na.action=na.keep, expand.na=TRUE,
        center.terms=TRUE, \dots) # cph

\method{predict}{glmD}(object, newdata,
        type= c("lp", "x", "data.frame",
                 "terms", "adjto", "adjto.data.frame", "model.frame"),
        se.fit=FALSE, conf.int=FALSE, conf.type=c('mean','individual'),
        incl.non.slopes,
        non.slopes, kint=1, na.action=na.keep, expand.na=TRUE,
        center.terms=TRUE, \dots) # glmD

\method{predict}{glsD}(object, newdata,
        type=c("lp", "x", "data.frame",
                 "terms", "adjto", "adjto.data.frame", "model.frame"),
        se.fit=FALSE, conf.int=FALSE, conf.type=c('mean','individual'),
        incl.non.slopes,
        non.slopes, kint=1, na.action=na.keep, expand.na=TRUE,
        center.terms=TRUE, \dots) # glsD

\method{predict}{ols}(object, newdata,
        type=c("lp", "x", "data.frame",
                 "terms", "adjto", "adjto.data.frame", "model.frame"),
        se.fit=FALSE, conf.int=FALSE, conf.type=c('mean','individual'),
        incl.non.slopes,
        non.slopes, kint=1, na.action=na.keep, expand.na=TRUE,
        center.terms=TRUE, \dots) # ols

\method{predict}{psm}(object, newdata,
        type=c("lp", "x", "data.frame",
                 "terms", "adjto", "adjto.data.frame", "model.frame"),
        se.fit=FALSE, conf.int=FALSE, conf.type=c('mean','individual'),
        incl.non.slopes,
        non.slopes, kint=1, na.action=na.keep, expand.na=TRUE,
        center.terms=TRUE, \dots) # psm
}
\arguments{
\item{object}{a fit object with a \code{Design} fitting function}
\item{newdata}{
An S data frame, list or a matrix specifying new data for which predictions
are desired.  If \code{newdata} is a list, it is converted to a matrix first.
A matrix is converted to a data frame.  For the matrix form, categorical
variables (\code{catg} or \code{strat}) must be coded as integer category
numbers corresponding to the order in which value labels were stored.
For list or matrix forms, \code{matrx} factors must be given a single
value.  If this single value is the S missing value \code{NA}, the adjustment
values of matrx (the column medians) will later replace this value.
If the single value is not \code{NA}, it is propagated throughout the columns
of the \code{matrx} factor.  For \code{factor} variables having numeric levels,
you can specify the numeric values in \code{newdata} without first converting
the variables to factors.  These numeric values are checked to make sure
they match a level, then the variable is converted internally to a \code{factor}.
It is most typical to use a data frame
for newdata, and the S function \code{expand.grid} is very handy here.
For example, one may specify 
\cr
\code{newdata=expand.grid(age=c(10,20,30),}
\cr
   \code{race=c("black","white","other"),}
\cr
   \code{chol=seq(100,300,by=25))}.
}
\item{type}{
Type of output desired.  The default is \code{"lp"} to get the linear predictors -
predicted \eqn{X\beta}{X beta}.  For Cox models, these predictions are centered.
You may specify \code{"x"} to get an expanded design matrix
at the desired combinations of values, \code{"data.frame"} to get an
S data frame of the combinations, \code{"model.frame"} to get a data frame
of the transformed predictors, \code{"terms"} to get a matrix with
each column being the linear combination of variables making up
a factor, \code{"adjto"} to return a vector of \code{limits[2]} (see \code{datadist}) in coded
form, and \code{"adjto.data.frame"} to return a data frame version of these
central adjustment values.  If \code{newdata} is not given, \code{predict}
will attempt to return information stored with the fit object if the
appropriate options were used with the modeling function (e.g., \code{x, y, linear.predictors, se.fit}).
}
\item{se.fit}{
Defaults to \code{FALSE}.  If \code{type="lp"}, set \code{se.fit=TRUE} to return
a list with components \code{linear.predictors} and \code{se.fit} instead of just
a vector of fitted values.
}
\item{conf.int}{
Specify \code{conf.int} as a positive fraction to obtain upper and lower
confidence intervals (e.g., \code{conf.int=0.95}).  The \eqn{t}-distribution is
used in the calculation for \code{ols} fits.  Otherwise, the normal
critical value is used.
}
\item{conf.type}{
specifies the type of confidence interval.  Default is for the mean.
For \code{ols} fits there is the option of obtaining confidence limits for
individual predicted values by specifying \code{conf.type="individual"}.
}
\item{incl.non.slopes}{
Default is \code{TRUE} if \code{non.slopes} or \code{kint} is specified, the model has a scale
parameter (e.g., a parametric survival model), or \code{type!="x"}.
Otherwise the default is \code{FALSE}.
Set to \code{TRUE} to use an intercept in the prediction if the model has
any intercepts (except for \code{type="terms"} which doesn't need
intercepts).  Set to \code{FALSE} to get predicted \eqn{X\beta}{X beta} ignoring intercepts.
}
\item{non.slopes}{
For models such as the ordinal logistic models containing more than
one intercept, this specifies dummy variable values to pick off intercept(s)
to use in computing predictions.  For example, if there are 3 intercepts,
use \code{non.slopes=c(0,1,0)} to use the second.  Default is
\code{c(1,0,\dots,0)}.  You may alternatively specify \code{kint}.
}
\item{kint}{
a single integer specifying the number of the intercept to use in
multiple-intercept models
}
\item{na.action}{
Function to handle missing values in \code{newdata}.  For predictions
"in data", the same \code{na.action} that was used during model fitting is
used to define an \code{naresid} function to possibly restore rows of the data matrix
that were deleted due to NAs.  For predictions "out of data", the default
\code{na.action} is \code{na.keep}, resulting in NA predictions when a row of
\code{newdata} has an NA.  Whatever \code{na.action} is in effect at the time
for "out of data" predictions, the corresponding \code{naresid} is used also.
}
\item{expand.na}{
set to \code{FALSE} to keep the \code{naresid} from having any effect, i.e., to keep
from adding back observations removed because of NAs in the returned object.
If \code{expand.na=FALSE}, the \code{na.action} attribute will be added to the returned
object.
}
\item{center.terms}{
set to \code{FALSE} to suppress subtracting the mean from columns of the design
matrix before computing terms with \code{type="terms"}.
}
\item{\dots}{ignored}
}
\details{
\code{datadist} and \code{options(datadist=)} should be run before \code{predict.Design}
if using \code{type="adjto"}, \code{type="adjto.data.frame"}, or \code{type="terms"},
or if the fit is a Cox model fit and you are requesting \code{se.fit=TRUE}.
For these cases, the adjustment values are needed (either for the
returned result or for the correct covariance matrix computation).
}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{plot.Design}}, \code{\link{summary.Design}}, \code{\link{Design}}, \code{\link{Design.trans}}, \code{\link{predict.lrm}},
\code{\link{residuals.cph}}, \code{\link{naresid}}, \code{\link{datadist}}, \code{\link{gendata}}, \code{\link{Function.Design}}, \code{\link[Hmisc]{reShape}},
\code{\link[Hmisc]{xYplot}}, \code{\link{contrast.Design}}
}
\examples{
n <- 1000    # define sample size
set.seed(17) # so can reproduce the results
# Simulate three continuous and two categorical predictors
age            <- rnorm(n, 50, 10)
blood.pressure <- rnorm(n, 120, 15)
cholesterol    <- rnorm(n, 200, 25)
sex            <- factor(sample(c('female','male'), n,TRUE))
treat          <- factor(sample(c('a','b','c'), n,TRUE))


# Specify population model for log odds that Y=1
L <- .4*(sex=='male') + .045*(age-50) +
  (log(cholesterol - 10)-5.2)*(-2*(sex=='female') + 2*(sex=='male')) +
  .3*sqrt(blood.pressure-60)-2.3 + 1*(treat=='b')
# Simulate binary y to have Prob(y=1) = 1/[1+exp(-L)]
y <- ifelse(runif(n) < plogis(L), 1, 0)


# datadist must be in effect before predict is used with type="terms"
# (done below), because adjustment values are needed in that case
ddist <- datadist(age, blood.pressure, cholesterol, sex, treat)
options(datadist='ddist')


# Fit a binary logistic model with spline terms and interactions
fit <- lrm(y ~ rcs(blood.pressure,4) + 
           sex * (age + rcs(cholesterol,4)) + sex*treat*age)


# Use xYplot to display predictions in 6 panels (2 sexes x 3 ages),
# with error bars, with superposition of the three treatments


dat <- expand.grid(treat=levels(treat),sex=levels(sex),
                   age=c(20,40,60),blood.pressure=120,
                   cholesterol=seq(100,300,length=10))
# Add variables linear.predictors and se.fit to dat
dat <- cbind(dat, predict(fit, dat, se.fit=TRUE))
# xYplot in Hmisc extends xyplot to allow error bars
# (here: linear predictor +/- 1.96 standard errors)
xYplot(Cbind(linear.predictors,linear.predictors-1.96*se.fit,
             linear.predictors+1.96*se.fit) ~ cholesterol | sex*age,
       groups=treat, data=dat, type='b')




# Since blood.pressure doesn't interact with anything, we can quickly and
# interactively try various transformations of blood.pressure, taking
# the fitted spline function as the gold standard. We are seeking a
# linearizing transformation even though this may lead to falsely
# narrow confidence intervals if we use this data-dredging-based transformation


bp <- 70:160
logit <- predict(fit, expand.grid(treat="a", sex='male', age=median(age),
                 cholesterol=median(cholesterol),
                 blood.pressure=bp), type="terms")[,"blood.pressure"]
#Note: if blood.pressure interacted with anything, this would be the
#      blood.pressure "main effect" ignoring interaction terms
#Could also use
#   logit <- plot(fit, blood.pressure=bp, \dots)$x.xbeta[,2]
#which allows evaluation of the shape for any level of interacting
#factors.  When blood.pressure does not interact with anything, the
#result from predict(fit, \dots, type="terms") would equal the result from
#plot if all other terms were ignored


plot(bp^.5, logit)               # try square root vs. spline transform.
plot(bp^1.5, logit)              # try 1.5 power
plot(sqrt(bp-60), logit)         # try the form used in the population model


#Some approaches to making a plot showing how predicted values
#vary with a continuous predictor on the x-axis, with two other
#predictors varying


combos <- gendata(fit, age=seq(10,100,by=10), cholesterol=c(170,200,230),
                  blood.pressure=c(80,120,160))
#treat, sex not specified -> set to mode
#can also use expand.grid


combos$pred <- predict(fit, combos)
xyplot(pred ~ age | cholesterol*blood.pressure, data=combos, type='l')
xYplot(pred ~ age | cholesterol, groups=blood.pressure, data=combos, type='l')
Key()   # Key created by xYplot
# One curve per cholesterol x blood.pressure combination (3 x 3 = 9 curves)
xYplot(pred ~ age, groups=interaction(cholesterol,blood.pressure),
       data=combos, type='l', lty=1:9)
Key()


#Add lower and upper 0.95 confidence limits for each row of combos
#(the default conf.type gives limits for the mean, not for individual
# predicted values)
combos <- cbind(combos, predict(fit, combos, conf.int=.95))
xYplot(Cbind(linear.predictors, lower, upper) ~ age | cholesterol,
       groups=blood.pressure, data=combos, type='b')
Key()


# Plot effects of treatments (all pairwise comparisons) vs.
# levels of interacting factors (age, sex)


# Settings for every treat x sex x age combination; other predictors
# are left at the values gendata chooses
d <- gendata(fit, treat=levels(treat), sex=levels(sex), age=seq(30,80,by=10))
x <- predict(fit, d, type="x")   # design matrix for the settings in d
betas <- fit$coef                # estimated regression coefficients
cov   <- fit$var                 # their estimated covariance matrix


# Split the design matrix by treatment.  Differences such as xb - xa
# are used below as treatment contrasts; this assumes the sex/age
# settings appear in the same row order within each treatment.
# Sex and Age record the settings for the rows of one treatment.
i <- d$treat=="a"; xa <- x[i,]; Sex <- d$sex[i]; Age <- d$age[i]
i <- d$treat=="b"; xb <- x[i,]
i <- d$treat=="c"; xc <- x[i,]


doit <- function(xd, lab) {
  # Plot odds ratios with 0.99 confidence limits against age, one plot
  # per level of Sex.
  #   xd:  difference between the design matrices of two treatments
  #   lab: label of the contrast, used in the y-axis title
  # Uses betas, cov, Sex and Age from the calling environment.
  zcrit <- qnorm(1-.01/2)                  # critical value, 0.99 limits
  contr <- xd \%*\% betas                    # contrast on the log odds scale
  se    <- apply((xd \%*\% cov) * xd, 1, sum)^.5
  lo    <- contr - zcrit * se
  hi    <- contr + zcrit * se
  # Exponentiate to get odds ratios instead of linear effects
  oddsr <- exp(contr)
  lo    <- exp(lo)
  hi    <- exp(hi)
  # First elements of these agree with
  # summary(fit, age=30, sex='female', conf.int=.99)
  for(lev in levels(Sex)) {
    sel <- Sex == lev
    errbar(Age[sel], oddsr[sel], hi[sel], lo[sel], xlab="Age",
           ylab=paste(lab,"Odds Ratio"), ylim=c(.1,20), log='y')
    title(paste("Sex:",lev))
    abline(h=1, lty=2)     # reference line: odds ratio of 1
  }
}


# Three pairwise treatment contrasts (rows) by two sexes (columns)
par(mfrow=c(3,2), oma=c(3,0,3,0))
doit(xb - xa, "b:a")
doit(xc - xa, "c:a")
doit(xc - xb, "c:b")   # c vs. b contrast


# NOTE: This is much easier to do using contrast.Design


\dontrun{
#Suppose a fit (not the one above) uses a variable state.code
#that has levels "1", "5","13"
#Get predictions with or without converting variable in newdata to factor
predict(fit, data.frame(state.code=c(5,13)))
predict(fit, data.frame(state.code=factor(c(5,13))))


#Use gendata function (gendata.Design) for interactive specification of
#predictor variable settings (for 10 observations)
df <- gendata(fit, nobs=10, viewvals=TRUE)
df$predicted <- predict(fit, df)  # add variable to data frame
df                                # print settings with their predictions


df <- gendata(fit, age=c(10,20,30))  # leave other variables at ref. vals.
predict(fit, df, type="fitted")


# See reShape (in Hmisc) for an example where predictions corresponding to 
# values of one of the varying predictors are reformatted into multiple
# columns of a matrix
}
options(datadist=NULL)
}
\keyword{models}
\keyword{regression}



\eof
\name{predict.lrm}
\alias{predict.lrm}
\title{
Predicted Values for Binary and Ordinal Logistic Models
}
\description{
Computes a variety of types of predicted values for fits from
\code{lrm}, either from the original dataset or for new observations.
}
\usage{
\method{predict}{lrm}(object, \dots, type=c("lp", "fitted", "fitted.ind", "mean", "x", 
            "data.frame", "terms", "adjto","adjto.data.frame", 
            "model.frame"), se.fit=FALSE, codes=FALSE)
}
\arguments{
\item{object}{
an object created by \code{lrm}
}
\item{...}{
arguments passed to \code{predict.Design}, such as \code{kint} and \code{newdata}
(which is used if you are predicting \code{out of data}).  See
\code{predict.Design} to see how NAs are handled.
}
\item{type}{
See \code{predict.Design} for \code{"x", "data.frame", "terms", "adjto", 
"adjto.data.frame"} and \code{"model.frame"}. \code{type="lp"} is used to get
linear predictors (always using the first intercept). \code{type="fitted"}
is used to get all the cumulative probabilities \eqn{Prob(Y\geq j)}{Prob(Y >= j)}.
\code{type="fitted.ind"} gets all the individual probabilities
\eqn{Prob(Y=j)}. For an ordinal response variable, \code{type="mean"} computes
the estimated mean \eqn{Y} by summing values of \eqn{Y} 
multiplied by the estimated \eqn{Prob(Y=j)}. If \eqn{Y} was a character or
\code{factor} object, the levels are the character values or factor levels,
so these must be translatable to numeric, unless \code{codes=TRUE}.
See the Hannah and Quigley reference below for the method of estimating
(and presenting) the mean score.  If you specify \code{type="fitted"},
\code{"fitted.ind"}, or \code{"mean"} you may not specify \code{kint}.
}
\item{se.fit}{
applies only to \code{type="lp"}, to get standard errors.
}
\item{codes}{
if \code{TRUE}, \code{type="mean"} uses the integer codes
\eqn{1,2,\ldots,k} for the \eqn{k}-level response in computing the
predicted mean response.
}
}
\value{
a vector (\code{type="lp"} with \code{se.fit=FALSE}, or \code{type="mean"} or only one
observation being predicted), a list (with elements \code{linear.predictors}
and \code{se.fit} if \code{se.fit=TRUE}), a matrix (\code{type="fitted"} or \code{type="fitted.ind"}),
a data frame, or a design matrix.  
}
\author{
Frank Harrell\cr
Department of Biostatistics\cr
Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\references{
Hannah M, Quigley P: Presentation of ordinal regression analysis on the original
scale.  Biometrics 52:771--5; 1996.
}
\seealso{
\code{\link{lrm}}, \code{\link{predict.Design}}, \code{\link{naresid}}, \code{\link{contrast.Design}}
}
\examples{
# See help for predict.Design for several binary logistic
# regression examples


# Examples of predictions from ordinal models
set.seed(1)
# 3-level ordinal response, generated independently of x1 and x2
y <- factor(sample(1:3, 400, TRUE), 1:3, c('good','better','best'))
x1 <- runif(400)
x2 <- runif(400)
f <- lrm(y ~ rcs(x1,4)*x2)
predict(f, type="fitted.ind")[1:10,]   #gets Prob(better) and all others
d <- data.frame(x1=.5,x2=.5)           # one new observation
predict(f, d, type="fitted")        # Prob(Y>=j) for new observation
predict(f, d, type="fitted.ind")    # Prob(Y=j)
predict(f, d, type='mean', codes=TRUE) # predicts mean(y) using codes 1,2,3
}
\keyword{models}
\keyword{regression}
\concept{logistic regression model}

\eof
\name{print.cph}
\alias{print.cph}
\title{
Print cph Results
}
\description{
Formatted printing of an object of class \code{cph}. Prints strata
frequencies, parameter estimates, standard errors, z-statistics, numbers
of missing values, VIFs.
}
\usage{
\method{print}{cph}(x, long=FALSE, digits=3, conf.int=FALSE, table=TRUE, \dots)
}
\arguments{
\item{x}{
fit object
}
\item{long}{
set to \code{TRUE} to print the centering constant
}
\item{digits}{
number of significant digits to print
}
\item{conf.int}{
set to e.g. .95 to print 0.95 confidence intervals on simple hazard ratios
}
\item{table}{
set to \code{FALSE} to suppress event frequency statistics
}
\item{...}{
arguments passed to \code{print.cph.fit}: \code{coef}, \code{scale}
}}
\seealso{
\code{\link[survival]{print.coxph}}
}
\keyword{print}


\eof
\name{print.cph.fit}
\alias{print.cph.fit}
\title{
Print cph.fit
}
\description{
Formatted printing of an object of class \code{cph.fit} created
by \code{cph.fit} (which is usually called by \code{cph}). Most of the
logic for \code{print.cph.fit} came from Therneau's \code{print.coxreg}.
}
\usage{
\method{print}{cph.fit}(x, table=TRUE, coef=TRUE, conf.int=FALSE, scale=1, digits=NULL,\dots)
}
\arguments{
\item{x}{
object created by \code{cph.fit}
}
\item{table}{
print table of frequencies of events, by strata (if any)
}
\item{coef}{
print coefficient estimates, standard errors, and z-statistics
}
\item{conf.int}{
set to e.g. .95 to print 0.95 confidence intervals. Default is \code{FALSE}
to suppress confidence intervals
}
\item{scale}{
constant by which to multiply coefficients and standard errors if
printing confidence intervals.
}
\item{digits}{
number of significant digits to print
}
\item{\dots}{ignored}
}
\seealso{
\code{\link[survival]{print.coxph}}
}
\keyword{print}

\eof
\name{print.lrm}
\alias{print.lrm}
\title{
Print lrm
}
\description{
Formatted printing of an object of class \code{lrm}
}
\usage{
\method{print}{lrm}(x, digits=4, strata.coefs=FALSE, \dots)
}
\arguments{
\item{x}{fit object}
\item{digits}{number of significant digits to use}
\item{strata.coefs}{set to \code{TRUE} to print the (experimental)
  strata coefficients}
\item{\dots}{ignored}
}
\keyword{print}

\eof
\name{print.ols}
\alias{print.ols}
\title{Print ols}
\description{
Formatted printing of an object of class \code{ols} using methods taken from
\code{print.lm} and \code{summary.lm}. Prints R-squared, adjusted R-squared,
parameter estimates,
standard errors, and t-statistics (Z statistics if penalized estimation was used).
For penalized
estimation, prints the maximum penalized likelihood estimate of the residual
standard deviation (\code{Sigma}) instead of the usual root mean squared error.
}
\usage{
\method{print}{ols}(x, digits=4, long=FALSE, \dots)
}
\arguments{
\item{x}{fit object}
\item{digits}{number of significant digits to print}
\item{long}{set to \code{TRUE} to print the correlation matrix of
  parameter estimates}
\item{\dots}{other parameters to pass to \code{print} or \code{format}}
}
\seealso{
\code{\link{ols}}, \code{\link{print.lm}}, \code{\link{summary.lm}}
}
\keyword{print}

\eof
\name{psm}
\alias{psm}
\alias{print.psm}
\alias{Hazard}
\alias{Survival}
\alias{Hazard.psm}
\alias{Mean.psm}
\alias{Quantile.psm}
\alias{Survival.psm}
\alias{residuals.psm}
\alias{lines.residuals.psm.censored.normalized}
\alias{survplot.residuals.psm.censored.normalized}
\title{
Parametric Survival Model
}
\description{

  \code{psm} is a modification of Therneau's \code{survreg} function for
  fitting the accelerated failure time family of parametric survival
  models.  \code{psm} uses the \code{Design} class for automatic
  \code{anova}, \code{fastbw}, \code{calibrate}, \code{validate}, and
  other functions.  \code{Hazard.psm}, \code{Survival.psm},
  \code{Quantile.psm}, and \code{Mean.psm} create S functions that
  evaluate the hazard, survival, quantile, and mean (expected value)
  functions analytically, as functions of time or probabilities and the
  linear predictor values.

The \code{residuals.psm} function exists mainly to compute normalized
(standardized) residuals and to censor them (i.e., return them as
\code{Surv} objects) just as the original failure time variable was
censored.  These residuals are useful for checking the underlying
distributional assumption (see the examples).  To get these residuals,
the fit must have specified \code{y=TRUE}.  A \code{lines} method for these
residuals automatically draws a curve with the assumed standardized
survival distribution.  A \code{survplot} method runs the standardized
censored residuals through \code{survfit} to get Kaplan-Meier estimates,
with optional stratification (automatically grouping a continuous
variable into quantiles) and then through \code{survplot.survfit} to plot
them.  Then \code{lines} is invoked to show the theoretical curve.  Other
types of residuals are computed by \code{residuals} using
\code{residuals.survreg}.

Older versions of \code{survreg} used by \code{psm} (e.g., on S-Plus
2000) had the following additional arguments \code{method, link, parms,
  fixed}.  See \code{\link{survreg}} on such systems for details.
\code{psm} passes those arguments to \code{survreg}.
}
\usage{
psm(formula=formula(data),
    data=if (.R.) parent.frame() else sys.parent(), weights,
    subset, na.action=na.delete, dist="weibull",
    init=NULL, scale=0, 
    control=if(!.R.) survReg.control() else survreg.control(),
    parms=NULL, 
    model=FALSE, x=FALSE, y=TRUE, time.inc, \dots)
# dist=c("extreme", "logistic", "gaussian", "exponential", 
#        "rayleigh", "t")      for S-Plus before 5.0
# dist=c("extreme", "logistic", "gaussian", "weibull",
#        "exponential", "rayleigh", "lognormal",
#        "loglogistic", "t")   for R, S-Plus 5,6
# Older versions had arguments method, link, parms, fixed

\method{print}{psm}(x, correlation=FALSE, \dots)

Hazard(object, \dots)
\method{Hazard}{psm}(object, \dots)   # for psm fit
# E.g. lambda <- Hazard(fit)

Survival(object, \dots)
\method{Survival}{psm}(object, \dots) # for psm
# E.g. survival <- Survival(fit)

\method{Quantile}{psm}(object, \dots) # for psm
# E.g. quantsurv <- Quantile(fit)

\method{Mean}{psm}(object, \dots)     # for psm
# E.g. meant   <- Mean(fit)

# lambda(times, lp)   # get hazard function at t=times, xbeta=lp
# survival(times, lp) # survival function at t=times, lp
# quantsurv(q, lp)    # quantiles of survival time
# meant(lp)           # mean survival time

\method{residuals}{psm}(object, type="censored.normalized", \dots)

\method{survplot}{residuals.psm.censored.normalized}(fit, x, g=4, col, main, \dots)

\method{lines}{residuals.psm.censored.normalized}(x, n=100, lty=1, xlim,
lwd=3, \dots)
# for type="censored.normalized"
}
\arguments{
\item{formula}{
an S statistical model formula. Interactions up to third order are
supported. The left hand side must be a \code{Surv} object.
}
\item{object}{a fit created by \code{psm}}
\item{fit}{for the \code{survplot} method for residuals from \code{psm},
  \code{fit} is the result of \code{residuals.psm} with
  \code{type="censored.normalized"}
}
\item{data}{}
\item{subset}{}
\item{weights}{}
\item{dist}{}
\item{scale}{}
\item{init}{}
\item{na.action}{}
\item{control}{see \code{survreg} (\code{survReg} for S-Plus 5. or 6.).
  \code{fixed} is used for S-Plus before 5., \code{parms} is used for
  S-Plus 5, 6, and \R.  See \code{cph} for \code{na.action}. 
}
\item{parms}{a list of fixed parameters.  For the \eqn{t}-distribution
  this is the degrees of freedom; most of the distributions have no
  parameters.}
\item{model}{
set to \code{TRUE} to include the model frame in the returned object
}
\item{x}{
set to \code{TRUE} to include the design matrix in the object produced
by \code{psm}.  For the \code{survplot} method, \code{x} is an optional
stratification variable (character, numeric, or categorical).  For
\code{lines.residuals.psm.censored.normalized}, \code{x} is the result
of \code{residuals.psm}.  For \code{print} it is the result of \code{psm}.
}
\item{y}{
set to \code{TRUE} to include the \code{Surv()} matrix
}
\item{time.inc}{
setting for default time spacing. Used in constructing time axis
in \code{survplot}, and also in making confidence bars. Default is 30
if time variable has \code{units="Day"}, 1 otherwise, unless
maximum follow-up time \eqn{< 1}. Then max time/10 is used as \code{time.inc}.
If \code{time.inc} is not given and max time/default \code{time.inc} is
\eqn{> 25}, \code{time.inc} is increased.
}
\item{correlation}{set to \code{TRUE} to print the correlation matrix
  for parameter estimates}
\item{\dots}{
other arguments to fitting routines, or to pass to \code{survplot} from
\cr
\code{survplot.residuals.psm.censored.normalized}.  Ignored for
\code{lines}.}
\item{times}{
a scalar or vector of times for which to evaluate survival probability
or hazard
}
\item{lp}{
a scalar or vector of linear predictor values at which to evaluate
survival probability or hazard.  If both \code{times} and \code{lp} are
vectors, they must be of the same length.
}
\item{q}{
a scalar or vector of probabilities.  The default is .5, so just the
median survival time is returned.  If \code{q} and \code{lp} are both vectors,
a matrix of quantiles is returned, with rows corresponding to \code{lp}
and columns to \code{q}.
}
\item{type}{
type of residual desired.  Default is censored normalized residuals,
defined as (link(Y) - linear.predictors)/scale parameter, where the
link function was usually the log function.  See \code{survreg} for other
types (\code{survReg} for S-Plus 6).
}
\item{n}{
number of points to evaluate theoretical standardized survival
function for 
\cr
\code{lines.residuals.psm.censored.normalized}
}
\item{lty}{
line type for \code{lines}, default is 1
}
\item{xlim}{
range of times (or transformed times) for which to evaluate the standardized
survival function.  Default is range in normalized residuals.
}
\item{lwd}{
line width for theoretical distribution, default is 3
}
\item{g}{
number of quantile groups to use for stratifying continuous variables
having more than 5 levels
}
\item{col}{
vector of colors for \code{survplot} method, corresponding to levels of \code{x}
(must be a scalar if there is no \code{x})
}
\item{main}{
main plot title for \code{survplot}.  If omitted, is the name or label of
\code{x} if \code{x} is given.  Use \code{main=""} to suppress a title when you
specify \code{x}.
}}
\value{
\code{psm} returns a fit object with all the information \code{survreg} would store as 
well as what \code{Design} stores and \code{units} and \code{time.inc}.
\code{Hazard}, \code{Survival}, \code{Quantile}, and \code{Mean} return S functions.
\code{residuals.psm} with \code{type="censored.normalized"} returns a \code{Surv} object
which has a special attribute \code{"theoretical"} which is used by the \code{lines}
routine.  This is the assumed standardized survival function as a function
of time or transformed time.
}
\details{
The object \code{survreg.distributions} contains definitions of properties
of the various survival distributions. 
\cr
\code{psm} does not trap singularity errors due to the way \code{survreg.fit}
does matrix inversion.  It will trap non-convergence (thus returning
\code{fit$fail=TRUE}) if you give the argument \code{failure=2} inside the
\code{control} list which is passed to \code{survreg.fit}.  For example, use
\code{f <- psm(S ~ x, control=list(failure=2, maxiter=20))} to allow up to
20 iterations and to set \code{f$fail=TRUE} in case of non-convergence.
This is especially useful in simulation work.
}
\author{
Frank Harrell\cr
Department of Biostatistics\cr
Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{Design}}, \code{\link{survreg}}, \code{\link{survReg}}, \code{\link{residuals.survreg}}, \code{\link{survreg.object}}, 
\code{\link{survreg.distributions}},
\code{\link{pphsm}}, \code{\link{survplot}}, \code{\link{survest}}, \code{\link[survival]{Surv}},
\code{\link[Hmisc]{na.delete}}, \code{\link[Hmisc]{na.detail.response}}, \code{\link{datadist}}, \code{\link{latex.psm}}
}
\examples{
n <- 400
set.seed(1)
age <- rnorm(n, 50, 12)
sex <- factor(sample(c('Female','Male'),n,TRUE))
dd <- datadist(age,sex)
options(datadist='dd')
# Population hazard function:
h <- .02*exp(.06*(age-50)+.8*(sex=='Female'))
d.time <- -log(runif(n))/h            # exponential failure times, hazard h
cens <- 15*runif(n)                   # uniform censoring times on (0,15)
death <- ifelse(d.time <= cens,1,0)   # 1 if failure observed, 0 if censored
d.time <- pmin(d.time, cens)          # observed follow-up time


f <- psm(Surv(d.time,death) ~ sex*pol(age,2), 
         dist=if(.R.)'lognormal' else 'gaussian')
# Log-normal model is a bad fit for proportional hazards data
# Log-normal model is a bad fit for proportional hazards data


anova(f)
fastbw(f)  # if deletes sex while keeping age*sex ignore the result
f <- update(f, x=TRUE,y=TRUE)       # so can validate, compute certain resids
validate(f, dxy=TRUE, B=10)      # ordinarily use B=150 or more
plot(f, age=NA, sex=NA)       # needs datadist since no explicit age, sex values
survplot(f, age=c(20,60))     # needs datadist since sex not set here
# latex(f)


# Survival(f) returns a function of time(s) and linear predictor value(s)
S <- Survival(f)
plot(f$linear.predictors, S(6, f$linear.predictors),
     xlab=if(.R.)expression(X*hat(beta)) else 'X*Beta',
     ylab=if(.R.)expression(S(6,X*hat(beta))) else 'S(6|X*Beta)')
# plots survival probability at time 6 as a function of the
# linear predictor (X*Beta hat)


times <- seq(0,24,by=.25)
plot(times, S(times,0), type='l')   # plots survival curve at X*Beta hat=0
lam <- Hazard(f)
plot(times, lam(times,0), type='l') # similarly for hazard function


med <- Quantile(f)        # new function defaults to computing median only
lp <- seq(-3, 5, by=.1)
plot(lp, med(lp=lp), ylab="Median Survival Time")
med(c(.25,.5), f$linear.predictors)
                          # prints matrix with 2 columns (one per
                          # quantile); rows correspond to lp


# fit a model with no predictors
f <- psm(Surv(d.time,death) ~ 1, dist=if(.R.)"weibull" else "extreme")
f
pphsm(f)          # print proportional hazards form
g <- survest(f)
plot(g$time, g$surv, xlab='Time', type='l',
     ylab=if(.R.)expression(S(t)) else 'S(t)')


# y=TRUE is needed to compute censored normalized residuals
f <- psm(Surv(d.time,death) ~ age, 
         dist=if(.R.)"loglogistic" else "logistic", y=TRUE)
r <- resid(f, 'cens') # 'cens' abbreviates "censored.normalized"
survplot(survfit(r), conf='none') 
                      # plot Kaplan-Meier estimate of 
                      # survival function of standardized residuals
survplot(survfit(r ~ cut2(age, g=2)), conf='none')  
                      # both strata should follow the assumed
                      # standardized distribution
lines(r)              # add theoretical survival function
#More simply:
survplot(r, age, g=2)


options(datadist=NULL)
}
\keyword{models}
\keyword{survival}

\eof
\name{residuals.cph}
\alias{residuals.cph}
\title{
Residuals for a cph Fit
}
\description{
Calculates martingale, deviance, score or Schoenfeld residuals 
(scaled or unscaled) or influence statistics for a
Cox proportional hazards model. This is a slightly modified version
of Therneau's \code{residuals.coxph} function. It assumes that \code{x=TRUE} and
\code{y=TRUE} were specified to \code{cph}, except for martingale residuals, which
are stored with the fit by default.
}
\usage{
\method{residuals}{cph}(object,
      type=c("martingale", "deviance", "score", "schoenfeld", 
             "dfbeta", "dfbetas", "scaledsch"), collapse, weighted, \dots)
}
\arguments{
\item{object}{a \code{cph} object}
\item{type}{
character string indicating the type of residual desired;
the default is martingale.
Only enough of the string to determine a unique match is required.
Instead of the usual residuals, \code{type="dfbeta"} may be specified
to obtain approximate leave-out-one \eqn{\Delta \beta}s.  Use
\code{type="dfbetas"} to normalize the \eqn{\Delta \beta}s for
the standard errors of the regression coefficient estimates.
Scaled Schoenfeld residuals (\code{type="scaledsch"}, Grambsch and
Therneau, 1993) better 
reflect the log hazard ratio function than ordinary Schoenfeld
residuals, and they are on the regression coefficient scale.  
The weights use Grambsch and Therneau's "average variance" method.
}
\item{collapse}{
Vector indicating which rows to collapse (sum) over.  In time-dependent
models more than one row of data can pertain to a single individual.
If there were 4 individuals represented by 3, 1, 2 and 4 rows of data
respectively, then \code{collapse=c(1,1,1, 2, 3,3, 4,4,4,4)} could be used to
obtain per subject rather than per observation residuals.
}
\item{weighted}{ignored; only accepts \code{FALSE}}
\item{\dots}{unused}
}
\value{
The object returned will be a vector for martingale and deviance 
residuals and matrices for score and schoenfeld residuals, dfbeta, or dfbetas.
There will
be one row of residuals for each row in the input data (without \code{collapse}).
One column of score and Schoenfeld
residuals will be returned for each column in the model.matrix.
The scaled Schoenfeld residuals are used in the \code{cox.zph} function.


The score residuals are each individual's contribution to the score
vector.  Two transformations of this are often more useful: \code{dfbeta} is
the approximate change in the coefficient vector if that observation
were dropped, and \code{dfbetas} is the approximate change in the coefficients,
scaled by the standard error for the coefficients.
}
\references{
T. Therneau, P. Grambsch, and T. Fleming. "Martingale based residuals
for survival models", Biometrika, March 1990.


P. Grambsch, T. Therneau. "Proportional hazards tests and diagnostics
based on weighted residuals", unpublished manuscript, Feb 1993.
}
\seealso{
\code{\link{cph}}, \code{\link[survival]{coxph}}, \code{\link[survival]{residuals.coxph}}, \code{\link{cox.zph}}, \code{\link{naresid}}
}
\examples{
# fit <- cph(Surv(start, stop, event) ~ (age + surgery)* transplant, 
#            data=jasa1)
# mresid <- resid(fit, collapse=jasa1$id)


# Get unadjusted relationships for several variables
# Pick one variable that's not missing too much, for fit


n <- 1000    # define sample size
set.seed(17) # so can reproduce the results
age            <- rnorm(n, 50, 10)
blood.pressure <- rnorm(n, 120, 15)
cholesterol    <- rnorm(n, 200, 25)
sex            <- factor(sample(c('female','male'), n,TRUE))
cens   <- 15*runif(n)                  # uniform censoring times on (0,15)
# Population hazard function.  The comparison must use the level name
# exactly as defined above ('female'); otherwise sex has no effect.
h      <- .02*exp(.04*(age-50)+.8*(sex=='female'))
d.time <- -log(runif(n))/h             # exponential failure times, hazard h
death  <- ifelse(d.time <= cens,1,0)   # 1 if failure observed, 0 if censored
d.time <- pmin(d.time, cens)           # observed follow-up time


# iter.max=0: no iterations are performed when fitting
f <- cph(Surv(d.time, death) ~ age + blood.pressure + cholesterol, iter.max=0)
# resid() re-inserts rows for NAs, unlike f$resid
res <- resid(f)
# Scale all plots from 10th smallest to 10th largest residual
yl <- quantile(res, c(0, 1) + c(1, -1)*10/length(res), na.rm=TRUE)
par(mfrow=c(2,2), oma=c(3,0,3,0))
# Plot a lowess smooth of the residuals against one predictor
p <- function(x) {
  ok <- !is.na(x+res)      # drop pairs in which either value is missing
  smooth <- lowess(x[ok], res[ok], iter=0)
  plot(smooth, xlab=label(x), ylab="Residual", ylim=yl, type="l")
}
p(age)
p(blood.pressure)
p(cholesterol)
mtext("Smoothed Martingale Residuals", outer=TRUE)


# Assess PH by estimating log relative hazard over time
# x=TRUE, y=TRUE are required for residuals other than martingale
f <- cph(Surv(d.time,death) ~ age + sex + blood.pressure, x=TRUE, y=TRUE)
r <- resid(f, "scaledsch")          # one column per model coefficient
# Row names of r are taken to be the event times -- confirm
tt <- as.numeric(dimnames(r)[[1]])
par(mfrow=c(3,2))
for(i in 1:3) {
  # Smooth the i-th scaled Schoenfeld residual against time
  g <- areg.boot(I(r[,i]) ~ tt, B=20)
  plot(g, boot=FALSE)  # shows bootstrap CIs
}                  # Focus on 3 graphs on right
# Easier approach:
plot(cox.zph(f))    # invokes plot.cox.zph
par(mfrow=c(1,1))   # restore single-plot layout
}
\keyword{survival}
\concept{model validation}

\eof
\name{residuals.lrm}
\alias{residuals.lrm}
\alias{plot.lrm.partial}
\title{Residuals from a Logistic Regression Model Fit}
\description{
For a binary logistic model fit, computes the following residuals, letting
\eqn{P} denote the predicted probability of the higher category of \eqn{Y},
\eqn{X} denote the design matrix (with a column of 1s for the intercept), and
\eqn{L} denote the logit or linear predictors: ordinary (\eqn{Y-P}), score
(\eqn{X (Y-P)}), pearson (\eqn{(Y-P)/\sqrt{P(1-P)}}), deviance (for \eqn{Y=0} is
\eqn{-\sqrt{2|\log(1-P)|}}, for \eqn{Y=1} is \eqn{\sqrt{2|\log(P)|}}),
pseudo dependent variable used in influence statistics 
(\eqn{L + (Y-P)/(P(1-P))}), and partial (\eqn{X_{i}\beta_{i} +
  (Y-P)/(P(1-P))}). 


Will compute all these residuals for an ordinal logistic model, using
as temporary binary responses dichotomizations of \eqn{Y}, along with the corresponding
\eqn{P}, the probability that \eqn{Y \geq} cutoff.  For
\code{type="partial"}, all 
possible dichotomizations are used, and for \code{type="score"}, the actual
components of the first derivative of the log likelihood are used for
an ordinal model.  Alternatively, specify \code{type="score.binary"}
to use binary model score residuals but for all cutpoints of \eqn{Y}
(plotted only, not returned). The \code{score.binary}, 
\code{partial}, and perhaps \code{score} residuals are useful for checking the proportional odds assumption.
If the option \code{pl=TRUE} is used to plot the \code{score} or \code{score.binary} residuals, 
a score residual plot is
made for each column of the design (predictor) matrix, with \code{Y} cutoffs on the
x-axis and the mean +- 1.96 standard errors of the score residuals on
the y-axis.  You can instead use a box plot to display these residuals,
for both \code{score.binary} and \code{score}.
Proportional odds dictates a horizontal \code{score.binary} plot.  Partial
residual plots use smooth nonparametric estimates, separately for each
cutoff of \eqn{Y}.  One examines that plot for parallelism of the curves
to check the proportional odds assumption, as
well as to see if the predictor behaves linearly.


Also computes a variety of influence statistics and the 
le Cessie - van Houwelingen - Copas - Hosmer unweighted sum of squares test
for global goodness of fit, done separately for each cutoff of \eqn{Y} in the
case of an ordinal model.


The \code{plot.lrm.partial} function computes partial residuals for a series
of binary logistic model fits that all used the same predictors and that
specified \code{x=TRUE, y=TRUE}.  It then computes smoothed partial residual
relationships (using \code{lowess} with \code{iter=0}) and plots them separately
for each predictor, with residual plots from all model fits shown on the
same plot for that predictor.
}
\usage{
\method{residuals}{lrm}(object, type=c("ordinary", "score", "score.binary",
                  "pearson", "deviance", "pseudo.dep", "partial",
                  "dfbeta","dfbetas","dffit","dffits","hat","gof","lp1"),
           pl=FALSE, xlim, ylim, kint, label.curves=TRUE, which, \dots)

plot.lrm.partial(\dots, labels, center=FALSE)
}
\arguments{
\item{object}{
object created by \code{lrm}
}
\item{\dots}{
for \code{residuals}, applies to \code{type="partial"}  when \code{pl}
is not \code{FALSE}.  These are extra arguments passed to the smoothing
function.  Can also be used to pass extra arguments to \code{boxplot}
for \code{type="score"} or \code{"score.binary"}. 
For \code{plot.lrm.partial} this specifies a series of binary model fit
objects.
}
\item{type}{
type of residual desired.  Use \code{type="lp1"} to get approximate leave-out-1
linear predictors, derived by subtracting the \code{dffit} from the original
linear predictor values.
}
\item{pl}{
applies only to \code{type="partial"}, \code{"score"}, and \code{"score.binary"}.  
For score residuals in an ordinal model, set \code{pl=TRUE} to get means and 
approximate 0.95 confidence bars
vs. \eqn{Y}, separately for each \eqn{X}.  Alternatively, specify
\code{pl="boxplot"} to use \code{boxplot} to
draw the plot, with notches and with width proportional to the square
root of the cell sizes.
For partial residuals, set \code{pl=TRUE} (which uses \code{lowess}) or \code{pl="supsmu"}
to get smoothed partial
residual plots for all columns of \eqn{X} using \code{supsmu}.
Use \code{pl="loess"} to use \code{loess} and get confidence bands (\code{"loess"} is not
implemented for ordinal responses).  Under R, \code{pl="loess"} uses
\code{lowess} and does not provide confidence bands.
If there is more than one \eqn{X},
you should probably use \code{par(mfrow=c( , ))} before calling \code{resid}.
Note that \code{pl="loess"} results in \code{plot.loess} being called, which
requires a large memory allocation.
}
\item{xlim}{
plotting range for x-axis (default = whole range of predictor)
}
\item{ylim}{
plotting range for y-axis (default = whole range of residuals,
range of all confidence intervals for \code{score} or \code{score.binary} or range
of all smoothed curves for \code{partial} if
\code{pl=TRUE}, or 0.1 and 0.9 quantiles of the residuals for \code{pl="boxplot"}.)
}
\item{kint}{
for an ordinal model for residuals other than \code{partial}, \code{score}, or
\code{score.binary}, specifies
the intercept (and the cutoff of \eqn{Y}) to use for the calculations.
Specifying \code{kint=2}, for example, means to use \eqn{Y \geq} 3rd level.
}
\item{label.curves}{
set to \code{FALSE} to suppress curve labels when \code{type="partial"}.  The default,
\code{TRUE}, causes \code{labcurve} to be invoked to label curves where they are most
separated.  \code{label.curves} can be a list containing the \code{opts} parameter
for \code{labcurve}, to send options to \code{labcurve}, such as \code{tilt}.  The
default for \code{tilt} here is \code{TRUE}.
}
\item{which}{
a vector of integers specifying column numbers of the design matrix for which to compute or plot residuals, for \code{type="partial","score","score.binary"}.
}
\item{labels}{
for \code{plot.lrm.partial} this specifies a vector of character strings 
providing labels for the list of binary fits.  By default, the names of
the fit objects are used as labels.  The \code{labcurve} function is used
to label the curve with the \code{labels}.
}
\item{center}{
for \code{plot.lrm.partial} this causes partial residuals for every model to have a mean of zero before smoothing and plotting
}}
\value{
a matrix (\code{type="partial","dfbeta","dfbetas","score"}), 
test statistic (\code{type="gof"}), or a vector otherwise.  
For partial residuals from an ordinal
model, the returned object is a 3-way array (rows of \eqn{X} by columns
of \eqn{X} by cutoffs of \eqn{Y}), and NAs deleted during the fit
are not re-inserted into the residuals.  For \code{score.binary}, nothing
is returned.
}
\details{
For the goodness-of-fit test, the le Cessie-van Houwelingen normal test
statistic for the unweighted sum of squared errors (Brier score times \eqn{n})
is used.  For an ordinal response variable, the test 
for predicting the probability that \eqn{Y\geq j} is done separately for
all \eqn{j} (except the first).  Note that the test statistic can have strange behavior 
(i.e., it is far too large) if the model has no predictive value.


For most of the values of \code{type}, you must have specified \code{x=TRUE, y=TRUE} to
\code{lrm}.


There is as yet no literature on interpreting score residual plots for the
ordinal model.  Simulations when proportional odds is satisfied have
still shown a U-shaped residual plot.  The series of binary model score
residuals for all cutoffs of \eqn{Y} seems to better check the assumptions.
See the last example.
}
\author{
Frank Harrell\cr
Department of Biostatistics\cr
Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\references{
Landwehr, Pregibon, Shoemaker. JASA 79:61--83, 1984.


le Cessie S, van Houwelingen JC. Biometrics 47:1267--1282, 1991.


Hosmer DW, Hosmer T, le Cessie S, Lemeshow S.  A
comparison of goodness-of-fit tests for the logistic regression model.
Stat in Med 16:965--980, 1997.


Copas JB.  Applied Statistics 38:71--80, 1989.
}
\seealso{
\code{\link{lrm}}, \code{\link{naresid}}, \code{\link{which.influence}},
\code{\link[stats]{loess}}, \code{\link{supsmu}}, \code{\link{lowess}},
\code{\link{boxplot}}, \code{\link[Hmisc]{labcurve}}
}
\examples{
set.seed(1)
x1 <- runif(200, -1, 1)
x2 <- runif(200, -1, 1)
L  <- x1^2 - .5 + x2
y  <- ifelse(runif(200) <= plogis(L), 1, 0)
f <- lrm(y ~ x1 + x2, x=TRUE, y=TRUE)
resid(f)            #add rows for NAs back to data
resid(f, "score")   #also adds back rows
r <- resid(f, "partial")  #for checking transformations of X's
par(mfrow=c(1,2))
for(i in 1:2) {
  xx <- if(i==1)x1 else x2
  if(.R.) {
    plot(xx, r[,i], xlab=c('x1','x2')[i])
    lines(lowess(xx,r[,i]))
  } else {
    g <- loess(r[,i] ~ xx)
    plot(g, coverage=0.95, confidence=7)
    points(xx, r[,i])
  }
}
resid(f, "partial", pl="loess")  #same as last 3 lines
resid(f, "partial", pl=TRUE) #plots for all columns of X using lowess
resid(f, "gof")           #global test of goodness of fit
lp1 <- resid(f, "lp1")    #approx. leave-out-1 linear predictors
-2*sum(y*lp1 + log(1-plogis(lp1)))  #approx leave-out-1 deviance
                                    #formula assumes y is binary


# Simulate data from a population proportional odds model
set.seed(1)
n   <- 400
age <- rnorm(n, 50, 10)
blood.pressure <- rnorm(n, 120, 15)
L <- .05*(age-50) + .03*(blood.pressure-120)
p12 <- plogis(L)    # Pr(Y>=1)
p2  <- plogis(L-1)  # Pr(Y=2)
p   <- cbind(1-p12, p12-p2, p2)   # individual class probabilities
# Cumulative probabilities:
cp  <- matrix(cumsum(t(p)) - rep(0:(n-1), rep(3,n)), byrow=TRUE, ncol=3)
# simulate multinomial with varying probs:
y   <- (cp < runif(n)) \%*\% rep(1,3)   
# Thanks to Dave Krantz for this trick
f <- lrm(y ~ age + blood.pressure, x=TRUE, y=TRUE)
par(mfrow=c(2,2))
resid(f, 'score.binary',   pl=TRUE)              #plot score residuals
resid(f, 'partial', pl=TRUE)                     #plot partial residuals
resid(f, 'gof')           #test GOF for each level separately




# Make a series of binary fits and draw 2 partial residual plots
#
f1 <- lrm(y>=1 ~ age + blood.pressure, x=TRUE, y=TRUE)
f2  <- update(f1, y==2 ~.)
par(mfrow=c(2,1))
plot.lrm.partial(f1, f2)




# Simulate data from both a proportional odds and a non-proportional
# odds population model.  Check how 3 kinds of residuals detect
# non-prop. odds
set.seed(71)
n <- 400
x <- rnorm(n)


par(mfrow=c(2,3))
for(j in 1:2) {     # 1: prop.odds   2: non-prop. odds
  if(j==1) 
    L <- matrix(c(1.4,.4,-.1,-.5,-.9),nrow=n,ncol=5,byrow=TRUE) + x/2 else {
	  # Slopes and intercepts for cutoffs of 1:5 :
	  slopes <- c(.7,.5,.3,.3,0)
	  ints   <- c(2.5,1.2,0,-1.2,-2.5)
      L <- matrix(ints,nrow=n,ncol=5,byrow=TRUE)+
           matrix(slopes,nrow=n,ncol=5,byrow=TRUE)*x
	}
  p <- plogis(L)
  if(!.R.) dim(p) <- dim(L)
  # Cell probabilities
  p <- cbind(1-p[,1],p[,1]-p[,2],p[,2]-p[,3],p[,3]-p[,4],p[,4]-p[,5],p[,5])
  # Cumulative probabilities from left to right
  cp  <- matrix(cumsum(t(p)) - rep(0:(n-1), rep(6,n)), byrow=TRUE, ncol=6)
  y   <- (cp < runif(n)) \%*\% rep(1,6)


  f <- lrm(y ~ x, x=TRUE, y=TRUE)
  for(cutoff in 1:5)print(lrm(y>=cutoff ~ x)$coef)


  print(resid(f,'gof'))
  resid(f, 'score', pl=TRUE)
  # Note that full ordinal model score residuals exhibit a
  # U-shaped pattern even under prop. odds
  ti <- if(j==2) 'Non-Proportional Odds\nSlopes=.7 .5 .3 .3 0' else
    'True Proportional Odds\nOrdinal Model Score Residuals'
  title(ti)
  resid(f, 'score.binary', pl=TRUE)
  if(j==1) ti <- 'True Proportional Odds\nBinary Score Residuals'
  title(ti)
  resid(f, 'partial', pl=TRUE)
  if(j==1) ti <- 'True Proportional Odds\nPartial Residuals'
  title(ti)
}
par(mfrow=c(1,1))


# Get data used in Hosmer et al. paper and reproduce their calculations
if(FALSE && .R.) {
v <- Cs(id, low, age, lwt, race, smoke, ptl, ht, ui, ftv, bwt)
d <- read.table("http://www-unix.oit.umass.edu/~statdata/data/lowbwt.dat",
                skip=6, col.names=v)
d <- upData(d, race=factor(race,1:3,c('white','black','other')))
f <- lrm(low ~ age + lwt + race + smoke, data=d, x=TRUE,y=TRUE)
f
resid(f, 'gof')
# Their Table 7 Line 2 found sum of squared errors=36.91, expected
# value under H0=36.45, variance=.065, P=.071
# We got 36.90, 36.45, SD=.26055 (var=.068), P=.085
# Note that two logistic regression coefficients differed a bit
# from their Table 1
}
}
\keyword{models}
\keyword{regression}
\concept{logistic regression model}
\concept{model validation}

\eof
\name{residuals.ols}
\alias{residuals.ols}
\title{
Residuals for ols
}
\description{Computes various residuals and measures of influence for a
  fit from \code{ols}.}
\usage{
\method{residuals}{ols}(object, 
      type=c("ordinary", "score", "dfbeta", "dfbetas", 
             "dffit", "dffits", "hat", "hscore"), \dots)
}
\arguments{
\item{object}{
object created by \code{ols}.  Depending on \code{type}, you may have had to
specify \code{x=TRUE} to \code{ols}.
}
\item{type}{
type of residual desired.  \code{"ordinary"} refers to the usual residual.
\code{"score"} is the matrix of score residuals (contributions to first
derivative of log likelihood).
\code{dfbeta} and \code{dfbetas} mean respectively the raw and normalized matrix 
of changes in regression coefficients after
deleting in turn each observation.  The coefficients are normalized by their
standard errors.  \code{hat} contains the leverages --- diagonals of the ``hat'' matrix.
\code{dffit} and \code{dffits} contain respectively the difference and normalized
difference in predicted values when each observation is omitted. 
The S \code{lm.influence} function is used.  When \code{type="hscore"}, the
ordinary residuals are divided by one minus the corresponding hat
matrix diagonal element to make residuals have equal variance.
}
\item{\dots}{ignored}
}
\value{
a matrix or vector, with places for observations that were originally
deleted by \code{ols} held by \code{NA}s 
}
\author{
Frank Harrell\cr
Department of Biostatistics\cr
Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{lm.influence}}, \code{\link{ols}}, \code{\link{which.influence}}, \code{\link[Hmisc]{naresid}}
}
\examples{
set.seed(1)
x1 <- rnorm(100)
x2 <- rnorm(100)
x1[1] <- 100
y <- x1 + x2 + rnorm(100)
f <- ols(y ~ x1 + x2, x=TRUE, y=TRUE)
resid(f, "dfbetas")
which.influence(f)
}
\keyword{models}
\keyword{regression}
\concept{model validation}

\eof
\name{rm.impute}
\alias{rm.impute}
\alias{pbind}
\title{
Imputation of Repeated Measures
}
\description{
NOTE: This function is under development and is not correct at present.
Uses the method of Lavori, Dawson, and Shera (1995) to analyze
uniformly (over subjects) collected repeated measurement data subject
to non-random dropout.  Separately for each imputation and for each
time period, a binary logistic model is developed (using the Design
\code{lrm} function) to predict the probability that each subject remains
in the study at that time period.  Predictors for the first time
period are those listed in the \code{pformula} formula.  These are assumed
to be baseline variables that are never missing.  For later time
periods, predictors include the baseline predictors plus the matrix of
response (\code{y}) values for all earlier periods.  These "previous
responses" will have missing values imputed from the earlier steps.


Missing responses for time period i are imputed, for one of the
\code{n.impute} multiple imputations, as follows.  The period i fitted
propensity model described above is evaluated to obtain the predicted
probability that each subject remained in the study until at least
period i.  The estimated propensity is divided into \code{g} quantile
groups.  If for period i within a propensity quantile group there are
\code{a} subjects still in the study and \code{b} subjects who have dropped out,
Rubin's approximate Bayesian bootstrap is used to estimate the
predictive distribution of the response values for the \code{b} dropouts,
given that the propensity for remaining in the study is approximately
constant for all subjects (dropouts and non-dropouts) in the group.  A
sample of size \code{a} is selected with replacement from the \code{a} subjects
still in the study from the propensity group.  Then a sample of size
\code{b} with replacement is selected from this sample of size \code{a}.  These
\code{b} responses are used to fill-in the responses for the \code{b} dropouts
in the quantile group for the current imputation and current time
period.


If the right-hand-side of a formula is specified for a univariate
response summary (which may be the last response, mean, or area under
the time-response curve), \code{rm.impute} goes on to fit \code{rformula} to
this response summary for each of the multiple imputations using a
fitting function \code{fitter}.  After all \code{n.impute} imputations have been
done, the average "apparent" covariance matrix and the
between-imputation covariance matrix are computed to derive Rubin's
multiple-imputation-corrected covariance matrix for the average of
\code{n.impute} sets of regression coefficients.  See \code{fit.mult.impute} for
more details.


The response variable \code{y} may be an array to handle multiple responses
at each time period.  This array has number of rows equal to the
number of subjects, number of columns equal to the number of periods,
and number of "pages" equal to the number of different response
measurements.  A utility function \code{pbind} is supplied for creating
such arrays from a series of matrices.  When multiple responses are
present, all responses are used in the current propensity model, and
the \code{which}, \code{nk}, \code{rinteraction}, and \code{rint.with} arguments will
apply equally to all responses.
}
\usage{
rm.impute(pformula, y, last, 
          rformula, fitter=ols, which=c("last", "mean", "auc"), 
          data=sys.parent(1), n.impute=10, g=5, 
          nk=0, rinteraction, rint.with=c('all','recent'),
          pr=FALSE, pra=FALSE, npr, 
          keep.prop=FALSE, keep.pfits=FALSE)


pbind(\dots)
}
\arguments{
\item{pformula}{
right-hand-side propensity formula, e.g., ~ treatment + x1 + x2 +
x3*x4.  This formula (as well as \code{rformula} if \code{fitter} is one of the Design
library fitting functions) can contain any of the Design library's
transformation functions such as \code{rcs}, \code{pol}, etc.
}
\item{y}{
matrix of responses over time periods.  To use \code{which="auc"}, column
names of \code{y} must contain numeric measurement times.
}
\item{last}{
an integer vector whose value for the jth subject is the last period
before the subject dropped out.  A subject who never had a follow-up
response measured will have \code{last=0}.
}
\item{\dots}{
a series of matrices to "page bind" into an array.  New names may be
supplied using the notation \code{pbind(newname1=y1,newname2=y2)}.  The
\code{dimnames} of the first argument (which will be converted to a matrix
if it is a vector, for the unusual one-period case) will be used as
the first two \code{dimnames} of the resulting array, and the names of the
matrices will form the third vector of \code{dimnames}.
}
\item{rformula}{
right-hand-side response formula, e.g., ~ x1 + pol(x2) + treatment.
If omitted, \code{rm.impute} will return only the multiple response imputations.
}
\item{fitter}{
any S-Plus or Design library fitting function for a univariate
response summary.  The default is \code{ols}.  If there are multiple
response variables at each time period and you want to use a different
fitter for different response variables, specify a list of \code{nr}
fitting functions as this argument, where \code{nr} is the number of
response variables.
}
\item{which}{
which response summary is used if \code{rformula} is given.  The default is
the last column of the response matrix.
}
\item{data}{
usually a data frame, if the variables in \code{pformula} and \code{rformula}
are not already available via \code{attach()}
}
\item{n.impute}{
number of imputations.  The more missing data, the higher \code{n.impute}
should be.
}
\item{g}{
number of propensity quantile groups
}
\item{nk}{
number of knots to use in expanding each previous response into a
restricted cubic spline in the propensity model.  Default is zero
(assume linearity).
}
\item{rinteraction}{
a character vector specifying the names of baseline variables that
should be interacted with each response in the propensity model.
Default is no such interactions.
}
\item{rint.with}{
set to \code{"recent"} to allow the variables in \code{rinteraction} to only
interact with the response for the most recent time period, and not with
the most recent and all previous responses (the default)
}
\item{pr}{
set to \code{TRUE} to print each logistic propensity model fit.
}
\item{pra}{
if \code{pr=TRUE}, you can also set \code{pra=TRUE} to print the Design \code{anova()}
results for each propensity model fit.
}
\item{npr}{
if \code{pr=TRUE}, printing will be done for the first \code{npr} imputations
}
\item{keep.prop}{
set to \code{TRUE} to store the array \code{propensity} in the returned list.  The
dimensions for \code{propensity} are the same as \code{Y}.
}
\item{keep.pfits}{
set to \code{TRUE} to store all propensity model fits from \code{lrm} in the result
returned by \code{rm.impute}
}}
\value{
a list with elements \code{Y} and optionally \code{fit} (if \code{rformula} is given)
and \code{propensity} (if \code{keep.prop=TRUE}).  \code{Y} and \code{propensity} are arrays
whose last dimension
corresponds to the multiple imputations and whose first two dimensions
correspond to \code{y}.  \code{Y} is the multiply-imputed response
array and \code{fit} is the imputation-corrected fit object.  Note: Aside
from the regression coefficient vector and covariance matrix, this fit
object will have parameters from the fit of the response summary for
the last imputation.  If \code{keep.pfits=TRUE}, the returned list will also
have an array of propensity fit objects (\code{lrm} objects) for all
response periods and imputations.  If there is more than one response
variable at each time period, \code{fit} will be a list of \code{nr} fit objects
for \code{nr} response variables.
}
\section{Side Effects}{
prints, and creates variables such as y.1, y.2, \dots and in.period.i in
the session database (frame 0)
}
\details{
The algorithm used here will not correct for non-random dropout due to
variables that are not included in the propensity model.  A worst-case
would be having dropouts at period i due to unmeasured responses at
period i.


Ironically, there must be a sufficient number of dropouts for the
propensity score method to work, as the propensity models must have
adequate numbers of dropouts and non-dropouts at each time period.
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University School of Medicine
\cr
f.harrell@vanderbilt.edu
\cr
Much valuable input was received from Chris Barker (Roche
Pharmaceuticals) and Phil Lavori (Stanford University).
}
\references{
Lavori PW, Dawson R, Shera D: A multiple imputation strategy for
clinical trials with truncation of patient data.  Stat in Med
14:1913--1925, 1995. 


Rubin D, Shenker N: Multiple imputation in health-care data bases: An
overview and some applications.  Stat in Med 10:585--598, 1991.


Engels JM, Diehr P: Imputation of missing longitudinal data: a
comparison of methods.  J Clin Epi 56:968--976, 2003.
}
\seealso{
\code{\link[Hmisc]{transcan}}, \code{\link[Hmisc]{fit.mult.impute}}, \code{\link{lrm}}, \code{\link[Hmisc]{rm.boot}}, \code{\link[Hmisc]{reShape}}
}
\examples{
\dontrun{
# Generate multiple imputes of the response matrix for later use
Y <- rm.impute(~treatment + pol(age,2)*sex, responses, 
               last=lastvisit, data=mydata)$Y
# Do some analysis for each imputation
fits <- vector('list',10)
for(i in 1:10) {
  y <- Y[,,i]
  fits[[i]] <- my.analysis(X,y)
}


# Function to generate a 4-variate equal correlation pattern response
# with missing-at-random responses; missingness is a function of x and
# previous responses.
#
# pna is a function that computes the probability that a subject
# drops out at the current visit.  For visit 1 pna is a function
# of treatment and baseline covariable x.  For visits > 1 pna is
# a function of the matrix of responses for all previous visits.
#
# If second=TRUE we also generate a second response variable having
# NAs in the same positions as this first one.  y2 is generated
# so that its NAs are completely unrelated to any y2 values if
# y2.treat.effect=0, as the pna function is only given the first 
# response variable.
# y2 is N(0,1) for treat='A' and N(y2.treat.effect,1) for treat='B'.


testdf <- function(n=1500, seed=7, pna, second=FALSE, y2.treat.effect=0) {
# Simulate a repeated-measures data set with monotone dropout.
#
# Arguments:
#   n               number of subjects
#   seed            passed to set.seed so results are reproducible
#   pna             function(treat, x, yprev) giving the probability of
#                   dropping out at the current visit; yprev is omitted
#                   in the call made for the first visit
#   second          if TRUE, also simulate a second response variable and
#                   page-bind it to the first using pbind
#   y2.treat.effect amount added to the second response for treat=='B'
#
# Value: list with x, treat, y (n x 4 matrix, or the pbind result of two
# such matrices when second=TRUE) and last (last period with an observed
# response, 0 if none).  Summaries are printed as a side effect.


set.seed(seed)
treat <- sample(c('A','B'),n,TRUE)
x <- runif(n)
nt <- 4


# Draw n observations from a p-variate normal with mean u and
# covariance S using the Cholesky factor of S; returns an n x p matrix
mvrnorm <- function(n, p = 1, u = rep(0, p), S = diag(p)) {
  Z <- matrix(rnorm(n * p), p, n)
  t(u + t(chol(S)) \%*\% Z)
}


# Generate multivariate normal errors for n subjects at nt times
# Assume equal correlations of rho=.5, independent subjects


rho   <- .5
# Covariance matrix has 1 on the diagonal and rho elsewhere
y <- mvrnorm(n, p=nt, S=diag(rep(1-rho,nt))+rho)


# Treatment B shifts the response at every period up by 1
y[treat=='B',] <- y[treat=='B',] + 1


cat('\n\nTreatment-specific means for last period in response variable 1 before generating NAs:\n')
print(tapply(y[,4], treat, mean, na.rm=TRUE))


# Monotone dropout: a response is set to NA if the previous period was
# already NA or if a uniform draw falls below the pna probability
y[runif(n) < pna(treat, x), 1] <- NA
y[is.na(y[,1]) | runif(n) < pna(treat, x, y[,1]),   2] <- NA
y[is.na(y[,2]) | runif(n) < pna(treat, x, y[,1:2]), 3] <- NA
y[is.na(y[,3]) | runif(n) < pna(treat, x, y[,1:3]), 4] <- NA


# last = final period with an observed response.  The assignments run
# from period 4 back to period 1 so the earliest missing period wins.
last <- rep(4, n)
last[is.na(y[,4])] <- 3
last[is.na(y[,3])] <- 2
last[is.na(y[,2])] <- 1
last[is.na(y[,1])] <- 0


cat('\nNumber of NAs for each time period:\n')
print(apply(y, 2, function(x)sum(is.na(x))))
cat('\n\nTreatment-specific means for last period in response variable 1 after excluding NAs:\n')
print(tapply(y[,4], treat, mean, na.rm=TRUE))
cat('\n\nNaive complete-case analysis:\n\n')
prn(ols(y[,4] ~ pol(x,2) + treat))


if(second) {
  # Second response: independent N(0,1) values at each of the 4 periods,
  # shifted by y2.treat.effect for treatment B
  y2 <- matrix(rnorm(n*4),ncol=4)
  y2[treat=='B',] <- y2[treat=='B',] + y2.treat.effect
  cat('\n\nTreatment-specific means for last period in response variable 2 before generating NAs:\n')
  print(tapply(y2[,4], treat, mean, na.rm=TRUE))


  # Copy the missingness pattern of the first response onto the second
  y2[is.na(y[,1]),1] <- NA
  y2[is.na(y[,2]),2] <- NA
  y2[is.na(y[,3]),3] <- NA
  y2[is.na(y[,4]),4] <- NA
  cat('\n\nTreatment-specific means for last period in response variable 2 after excluding NAs:\n')
  print(tapply(y2[,4], treat, mean, na.rm=TRUE))


  # Page-bind the two response matrices, naming them y1 and y2
  y <- pbind(y1=y, y2=y2)
}


list(x=x, treat=treat, y=y, last=last)
}


pna <- function(treat, x, yprev) {
  # Probability of dropping out just before the current visit.
  # The baseline covariable x is accepted but plays no role.
  #   treat='B': constant probability 0.1
  #   treat='A': quadratic in the most recent response, held within 0 to 1

  # Without previous responses we are at the first follow-up visit
  if(missing(yprev)) return(0)

  # When given a matrix of earlier responses, use only its last column
  recent <- if(is.matrix(yprev)) yprev[, ncol(yprev)] else yprev

  ifelse(treat=='B', .1,
         pmax(0, pmin(1, .124 + .0835*recent + .020868*recent^2)))
}


df <- testdf(pna = pna, second=TRUE)


g <- rm.impute(~ pol(x,2) + treat, df$y, last=df$last, 
               rformula=~ pol(x,2) + treat,
               n.impute=10, g=4, nk=3, 
               rinteraction='treat', rint.with='all',
               pr=TRUE, pra=TRUE, data=df, keep.prop=TRUE, keep.pfits=TRUE)
# Base propensity model is in.study ~ pol(x,2) + treat
# for visits 2,3,4, filled-in y's from previous visits will also be
# used as predictors, and these interact with treat.  
# Restricted cubic spline with 3 knots is assumed for the propensity models
# To fit the multiply-imputed last (4th) response an additive model
# in quadratic x and treat is used


g$fit[[1]]       # shows response fit for first response variable
                 # (y1), with variances adj. for imputation
page(g$Y)        # show all 10 imputations for both responses x 4 periods


# Check for each of the first 4 imputations how well propensity matching
# achieved balance in baseline and period 3 filled-in responses for
# dropouts and non-dropouts.  For continuous variables show ECDFs
# using the Hmisc ecdf function.  Do this
# with and without stratifying on quintiles of propensity, and also
# show the estimated 3rd period response  vs. propensity stratified 
# by dropout status.  Use only first response (y1) for all of this.


for(imp in 1:4) {
  y3     <- g$Y[,3,1,imp]
  prop3  <- g$propensity[,3,imp]
  prop3g <- cut2(prop3,g=5)
  ti <- paste('Imputation',imp)
  print(ecdf(~ y3, groups=df$last >= 3, subset=unclass(prop3g)<5))
  title(ti)
  print(ecdf(~ y3 | prop3g, groups=df$last >= 3, 
             subset=unclass(prop3g)<5))
  # Not enough dropouts in highest quintile of propensity completing
  # visit 3
  title(ti)
  plsmo(prop3, y3, group=df$last >= 3, datadensity=TRUE, col=1:2)
  title(ti)
}


# Examine propensity fit for sixth imputation, 4th response
f <- g$pfits[4,6][[1]]
dfr <- as.data.frame(df)
# Edit names of dfr so that responses called y.1, y.2, etc.
# For this example, these are already OK
dd <- datadist(dfr)
options(datadist='dd')   
# datadist makes plot below work without specifying variable settings
plot(f, y.3=NA, treat=NA, conf.int=FALSE)


# Analyze multiple response variables.  Both systolic.bp and
# diastolic.bp are matrices (columns = time periods)


f <- rm.impute(~treatment + pol(age,2)*sex,
               pbind(systolic.bp, diastolic.bp),
               last=lastvisit, data=mydata)


# To deal with a continuous and a binary endpoint you can specify
# pbind(systolic.bp, stroke), fitter=list(ols, lrm)
}
}
\keyword{regression}
\keyword{htest}
\keyword{multivariate}
\keyword{array}
\concept{bootstrap}
\concept{repeated measures}
\concept{longitudinal data}

\eof
\name{robcov}
\alias{robcov}
\title{
Robust Covariance Matrix Estimates
}
\description{
Uses the Huber-White method to adjust the variance-covariance matrix of
a fit from maximum likelihood or least squares, to correct for
heteroscedasticity and for correlated responses from cluster samples.
The method uses the ordinary estimates of regression coefficients and
other parameters of the model, but involves correcting the covariance
matrix for model misspecification and sampling design. 
Models currently implemented are models that have a 
\code{residuals(fit,type="score")} function implemented, such as \code{lrm}, 
\code{cph}, \code{coxph}, and ordinary linear models (\code{ols}).
The fit must have specified the \code{x=TRUE} and \code{y=TRUE} options for certain models.
Observations in different clusters are assumed to be independent.
For the special case where every cluster contains one observation, the
corrected covariance matrix returned is the "sandwich" estimator
(see Lin and Wei). This is a consistent estimate of the covariance matrix
even if the model is misspecified (e.g. heteroscedasticity, underdispersion,
wrong covariate form).


For the special case of ols fits, \code{robcov} can compute the improved
(especially for small samples) Efron estimator that adjusts for
natural heterogeneity of residuals (see Long and Ervin (2000)
estimator HC3).
}
\usage{
robcov(fit, cluster, method=c('huber','efron'))
}
\arguments{
\item{fit}{
a fit object from the \code{Design()} series
}
\item{cluster}{
a variable indicating groupings. \code{cluster} may be any type of vector
(factor, character, integer).  NAs are not allowed.
Unique values of \code{cluster} indicate
possibly correlated groupings of observations. Note the data used in
the fit and stored in \code{fit$x} and \code{fit$y} may have had observations
containing missing values deleted. It is assumed that if any NAs were
removed during the original model fitting, an \code{naresid} function
exists to restore NAs so that the rows of the score matrix coincide
with \code{cluster}.
If \code{cluster} is omitted,
it defaults to the integers 1,2,\dots,n to obtain the "sandwich" robust
covariance matrix estimate.
}
\item{method}{
can be set to \code{"efron"} for ols fits (only).  Default is Huber-White
estimator of the covariance matrix.
}}
\value{
a new fit object with the same class as the original fit,
and with the element \code{orig.var} added. \code{orig.var} is
the covariance matrix of the original fit.  Also, the original \code{var}
component is replaced with the new Huberized estimates.
}
\author{
Frank Harrell\cr
Department of Biostatistics\cr
Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\references{
Huber, PJ. Proc Fifth Berkeley Symposium Math Stat 1:221--33, 1967.


White, H. Econometrica 50:1--25, 1982.


Lin, DY, Wei, LJ. JASA 84:1074--8, 1989.


Rogers, W.  Stata Technical Bulletin STB-8, p. 15--17, 1992.


Rogers, W.  Stata Release 3 Manual, \code{deff}, \code{loneway}, \code{huber}, \code{hreg}, \code{hlogit}
functions.


Long, JS, Ervin, LH.  The American Statistician 54:217--224, 2000.
}
\section{Warnings}{
Adjusted \code{ols} fits do not have the corrected standard errors printed with
\code{print.ols}. Use \code{sqrt(diag(adjfit$var))} to get this, where
\code{adjfit} is the result of \code{robcov}.
}
\seealso{
\code{\link{bootcov}}, \code{\link{naresid}}, \code{\link{residuals.cph}}
}
\examples{
# A dataset contains a variable number of observations per subject,
# and all observations are laid out in separate rows. The responses
# represent whether or not a given segment of the coronary arteries
# is occluded. Segments of arteries may not operate independently
# in the same patient.  We assume a "working independence model" to
# get estimates of the coefficients, i.e., that estimates assuming
# independence are reasonably efficient.  The job is then to get
# unbiased estimates of variances and covariances of these estimates.


set.seed(1)
n.subjects <- 30
ages <- rnorm(n.subjects, 50, 15)
sexes  <- factor(sample(c('female','male'), n.subjects, TRUE))
logit <- (ages-50)/5
prob <- plogis(logit)  # true prob not related to sex
id <- sample(1:n.subjects, 300, TRUE) # subjects sampled multiple times
table(table(id))  # frequencies of number of obs/subject
age <- ages[id]
sex <- sexes[id]
# In truth, observations within subject are independent:
y   <- ifelse(runif(300) <= prob[id], 1, 0)
f <- lrm(y ~ lsp(age,50)*sex, x=TRUE, y=TRUE)
g <- robcov(f, id)
diag(g$var)/diag(f$var)
# add ,group=w to re-sample from within each level of w
anova(g)            # cluster-adjusted Wald statistics
# fastbw(g)         # cluster-adjusted backward elimination
plot(g, age=30:70, sex='female')  # cluster-adjusted confidence bands


# Get design effects based on inflation of the variances when compared
# with Huber-White (sandwich) estimates which ignore clustering
g2 <- robcov(f)
diag(g$var)/diag(g2$var)


# Get design effects based on pooled tests of factors in model
anova(g2)[,1] / anova(g)[,1]




# A dataset contains one observation per subject, but there may be
# heteroscedasticity or other model misspecification. Obtain
# the robust sandwich estimator of the covariance matrix.


# f <- ols(y ~ pol(age,3), x=TRUE, y=TRUE)
# f.adj <- robcov(f)
}
\keyword{models}
\keyword{regression}
\keyword{robust}
\concept{cluster sampling}
\concept{intra-class correlation}

\eof
\name{sensuc}
\alias{sensuc}
\alias{plot.sensuc}
\title{
Sensitivity to Unmeasured Covariables
}
\description{
Performs an analysis of the sensitivity of a binary treatment (\eqn{X}) effect to an
unmeasured binary confounder (\eqn{U}) for a fitted binary logistic or an
unstratified non-time-dependent Cox survival model (the function works
well for the former, not so well for the latter).  This is done by
fitting a sequence of models with separately created \eqn{U} variables
added to the original model.  The sequence of models is formed 
by simultaneously varying \eqn{a} and \eqn{b},
where \eqn{a} measures the association between \eqn{U} and \eqn{X} and \eqn{b}
measures the association between \eqn{U} and \eqn{Y}, where \eqn{Y} is the outcome
of interest.  For Cox models, an
approximate solution is used by letting \eqn{Y} represent some binary
classification of the event/censoring time and the event indicator.
For example, \eqn{Y} could just be the event indicator, ignoring time of
the event or censoring, or it could be \eqn{1} if a subject failed before
one year and \eqn{0} otherwise.
When for each combination of \eqn{a}
and \eqn{b} the vector of binary values \eqn{U} is generated, one of two
methods is used to constrain the properties of \eqn{U}.  With either
method, the overall prevalence of \eqn{U} is constrained to be \code{prev.u}.
With the default
method (\code{or.method="x:u y:u"}), \eqn{U} is sampled so that the \eqn{X:U} odds
ratio is \eqn{a} and the \eqn{Y:U} odds ratio is \eqn{b}.  With the second method,
\eqn{U} is sampled according to the model 
\eqn{\mbox{logit} P(U=1 | X, Y) = \alpha + \beta Y + \gamma X}{logit P(U=1 | X, Y) = alpha + beta*Y + gamma*X}, where
\eqn{\beta=\log(b)} and \eqn{\gamma=\log(a)} and \eqn{\alpha} is
determined so that the prevalence of \eqn{U=1} is \code{prev.u}.  This
second method results in the adjusted odds ratio for \eqn{Y:U} given
\eqn{X} being \eqn{b} whereas the default method forces the
unconditional (marginal) \eqn{Y:U} odds ratio to be \eqn{b}.  Rosenbaum
uses the default method.

There is a \code{plot} method for plotting objects created by \code{sensuc}.
Values of \eqn{a} are placed on the x-axis and observed marginal odds or
hazards ratios for \eqn{U} (unadjusted ratios) appear on the y-axis.  For
Cox models, the hazard ratios will not agree exactly with \eqn{X}:event
indicator odds ratios but they can sometimes be made close through judicious choice
of the \code{event} function.  The default plot
uses four symbols which differentiate whether for the \eqn{a,b}
combination the effect of \eqn{X} adjusted for \eqn{U} (and for any other
covariables that were in the original model fit) is positive
(usually meaning an effect ratio greater than 1) and "significant",
merely positive, not positive and not significant, or not positive but
significant.  There is also an
option to draw the numeric value
of the \eqn{X} effect ratio at the \eqn{a},\eqn{b} combination along
with its \eqn{Z} statistic underneath in smaller letters, and an option
to draw the effect ratio in one of four colors depending on the
significance of the \eqn{Z} statistic.
}
\usage{
# fit <- lrm(formula=y ~ x + other.predictors, x=TRUE, y=TRUE)  #or
# fit <- cph(formula=Surv(event.time,event.indicator) ~ x + other.predictors,
#            x=TRUE, y=TRUE)

sensuc(fit,  
       or.xu=seq(1, 6, by = 0.5), or.u=or.xu, 
       prev.u=0.5, constrain.binary.sample=TRUE, 
       or.method=c("x:u y:u","u|x,y"),
       event=function(y) if(is.matrix(y))y[,ncol(y)] else 1*y)

\method{plot}{sensuc}(x,  ylim=c((1+trunc(min(x$effect.u)-.01))/
                   ifelse(type=='numbers',2,1),
                   1+trunc(max(x$effect.u)-.01)),
     xlab='Odds Ratio for X:U',
     ylab=if(x$type=='lrm')'Odds Ratio for Y:U' else
          'Hazard Ratio for Y:U',
     digits=2, cex.effect=.75, cex.z=.6*cex.effect,
     delta=diff(par('usr')[3:4])/40, 
     type=c('symbols','numbers','colors'),
     pch=c(15,18,5,0), col=c(2,3,1,4), alpha=.05,
     impressive.effect=function(x)x > 1,\dots)
}
\arguments{
\item{fit}{
result of \code{lrm} or \code{cph} with \code{x=TRUE, y=TRUE}.  The
first variable in the right hand side of the model formula must have
been the binary \eqn{X} variable, and it may not interact with other
predictors.
}
\item{x}{
result of \code{sensuc}
}
\item{or.xu}{
vector of possible odds ratios measuring the \eqn{X:U} association.
}
\item{or.u}{
vector of possible odds ratios measuring the \eqn{Y:U} association.
Default is \code{or.xu}.
}
\item{prev.u}{
desired prevalence of \eqn{U=1}.  Default is 0.5, which is usually a
"worst case" for sensitivity analyses.
}
\item{constrain.binary.sample}{
By default, the binary \eqn{U} values are sampled from the appropriate
distributions conditional on \eqn{Y} and \eqn{X} so that the proportions of
\eqn{U=1} in each sample are exactly the desired probabilities, to within
the closeness of \eqn{n\times}probability to an integer.  Specify
\code{constrain.binary.sample=FALSE} to sample from ordinary Bernoulli
distributions, to allow proportions of \eqn{U=1} to reflect sampling fluctuations.
}
\item{or.method}{
see above
}
\item{event}{
a function classifying the response variable into a binary event for the
purposes of constraining the association between \eqn{U} and \eqn{Y}.
For binary logistic models, \code{event} is left at its default value, which
is the identity function, i.e., the original \eqn{Y} values are taken as the
events (no other choice makes any sense here).  For Cox models, the 
default \code{event} function takes the last column of the \code{Surv} object
stored with the fit.  For rare events (high proportion of censored
observations), odds ratios approximate hazard ratios, so the default is OK.  
For other cases, the survival times should be considered (probably in
conjunction with the event indicators), although it may not be possible
to get a high enough hazard ratio between \eqn{U} and \eqn{Y} by sampling \eqn{U} by
temporarily making \eqn{Y} binary.  See the last example which is
for a 2-column \code{Surv} object (first column of response variable=event time, 
second=event indicator).  When
dichotomizing survival time at a given point, it is advantageous to choose
the cutpoint so that not many censored survival times precede the cutpoint.
Note that in fitting Cox models to examine sensitivity to \eqn{U}, the original
non-dichotomized failure times are used.
}
\item{ylim}{
y-axis limits for \code{plot}
}
\item{xlab}{
x-axis label
}
\item{ylab}{
y-axis label
}
\item{digits}{
number of digits to the right of the decimal point for drawing numbers
on the plot, for
\code{type="numbers"} or \code{type="colors"}.
}
\item{cex.effect}{
character size for drawing effect ratios
}
\item{cex.z}{
character size for drawing \eqn{Z} statistics
}
\item{delta}{
decrement in \eqn{y} value used to draw \eqn{Z} values below effect ratios
}
\item{type}{
specify \code{"symbols"} (the default), \code{"numbers"}, or \code{"colors"} (see above)
}
\item{pch}{
4 plotting characters corresponding to positive and significant
effects for \eqn{X}, positive and non-significant effects, not positive and
not significant, not positive but significant
}
\item{col}{
4 colors as for \code{pch}
}
\item{alpha}{
significance level
}
\item{impressive.effect}{
a function of the odds or hazard ratio for \eqn{X} returning \code{TRUE} for a
positive effect.  By default, a positive effect is taken to mean a
ratio exceeding one.
}
\item{\dots}{
optional arguments passed to \code{plot}
}}
\value{
\code{sensuc} returns an object of class \code{"sensuc"} with the following elements: \code{OR.xu}
(vector of desired \eqn{X:U} odds ratios or \eqn{a} values), \code{OOR.xu}
(observed marginal \eqn{X:U} odds ratios), \code{OR.u} (desired \eqn{Y:U} odds
ratios or \eqn{b} values), \code{effect.x} (adjusted odds or hazards ratio for
\eqn{X} in a model adjusted for \eqn{U} and all of the other predictors),
\code{effect.u} (unadjusted \eqn{Y:U} odds or hazards ratios), \code{effect.u.adj}
(adjusted \eqn{Y:U} odds or hazards ratios), \eqn{Z} (Z-statistics), \code{prev.u}
(input to \code{sensuc}), \code{cond.prev.u} (matrix with one row per \eqn{a},\eqn{b}
combination, specifying prevalences of \eqn{U} conditional on \eqn{Y} and \eqn{X}
combinations), and \code{type} (\code{"lrm"} or \code{"cph"}).
}
\author{
Frank Harrell\cr
Mark Conaway\cr
Department of Biostatistics\cr
Vanderbilt University School of Medicine\cr
f.harrell@vanderbilt.edu, mconaway@virginia.edu
}
\references{
Rosenbaum, Paul R (1995): Observational Studies.  New York: Springer-Verlag.


Rosenbaum P, Rubin D (1983): Assessing sensitivity to an unobserved binary
covariate in an observational study with binary outcome.  J Roy Statist Soc
B 45:212--218.
}
\seealso{
\code{\link{lrm}}, \code{\link{cph}}, \code{\link{sample}}
}
\examples{
set.seed(17)
x <- sample(0:1, 500,TRUE)
y <- sample(0:1, 500,TRUE)
y[1:100] <- x[1:100]  # induce an association between x and y
x2 <- rnorm(500)


f <- lrm(y ~ x + x2, x=TRUE, y=TRUE)


#Note: in absence of U odds ratio for x is exp(2nd coefficient)


g <- sensuc(f, c(1,3))


# Note: If the generated sample of U was typical, the odds ratio for
# x dropped had U been known, where U had an odds ratio
# with x of 3 and an odds ratio with y of 3


plot(g)


# Fit a Cox model and check sensitivity to an unmeasured confounder


# f <- cph(Surv(d.time,death) ~ treatment + pol(age,2)*sex, x=TRUE, y=TRUE)
# sensuc(f, event=function(y) y[,2] & y[,1] < 365.25 )
# Event = failed, with event time before 1 year
# Note: Analysis uses f$y which is a 2-column Surv object
}
\keyword{regression}
\keyword{htest}
\keyword{models}
\keyword{survival}
\concept{model validation}
\concept{sampling}
\concept{logistic regression model}
\concept{sensitivity analysis}


\eof
\name{specs.Design}
\alias{specs.Design}
\alias{specs}
\alias{print.specs.Design}
\title{
Design Specifications for Models
}
\description{
Prints the design specifications, e.g., number of parameters for each
factor, levels of categorical factors, knot locations in splines,
pre-transformations, etc. 
}
\usage{
specs(fit, \dots)
\method{specs}{Design}(fit, long=FALSE, \dots)

\method{print}{specs.Design}(x, \dots)
}
\arguments{
\item{fit}{
a fit object created with the \code{Design} library in effect
}
\item{x}{
an object returned by \code{specs}
}
\item{long}{
if  \code{TRUE}, causes the plotting and estimation limits to be printed for
each factor
}
\item{\dots}{ignored}
}
\value{
a list containing information about the fit and the predictors as elements
}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{Design}}, \code{\link{Design.trans}}, \code{\link{latex.Design}}, \code{\link{datadist}}
}
\examples{
set.seed(1)
blood.pressure <- rnorm(200, 120, 15)
dd <- datadist(blood.pressure)
options(datadist='dd')
L <- .03*(blood.pressure-120)
sick <- ifelse(runif(200) <= plogis(L), 1, 0)
f <- lrm(sick ~ rcs(blood.pressure,5))
specs(f)    # find out where 5 knots are placed
g <- glmD(sick ~ rcs(blood.pressure,5), family=binomial)
specs(g,long=TRUE)
options(datadist=NULL)
}
\keyword{models}
\keyword{regression}
\keyword{methods}

\eof
\name{summary.Design}
\alias{summary.Design}
\alias{print.summary.Design}
\alias{latex.summary.Design}
\alias{plot.summary.Design}
\title{
Summary of Effects in Model
}
\description{
\code{summary.Design} forms a summary of the effects of each
factor.  When \code{summary} is used to estimate odds or hazard ratios for
continuous variables, it allows the levels of interacting factors to be
easily set, as well as allowing the user to choose the interval for the
effect. This method of estimating effects allows for nonlinearity in
the predictor.  Factors requiring multiple parameters are handled, as
\code{summary} obtains predicted values at the needed points and takes
differences.  By default, inter-quartile range effects (odds ratios,
hazards ratios, etc.) are printed for continuous factors, and all
comparisons with the reference level are made for categorical factors.
\code{print.summary.Design} prints the results, \code{latex.summary.Design} typesets
the results, and \code{plot.summary.Design}
plots shaded confidence bars to display the results graphically.
The longest confidence bar on each page is labeled with confidence levels
(unless this bar has been ignored due to \code{clip}).  By default, the following
confidence levels are all shown: .7, .8, .9, .95, and .99, using 
levels of gray scale (colors for Windows).
}
\usage{
\method{summary}{Design}(object, \dots, est.all=TRUE, antilog, conf.int=.95, abbrev=FALSE)

\method{print}{summary.Design}(x, \dots)

\method{latex}{summary.Design}(object, title, \dots)

\method{plot}{summary.Design}(x, at, log=FALSE,
    q=c(0.7, 0.8, 0.9, 0.95, 0.99), xlim, nbar, cex=1, nint=10,
    cex.c=.5, cex.t=1, clip=c(-1e30,1e30), main, \dots)
}
\arguments{
\item{object}{
a \code{Design} fit object.  Either \code{options(datadist)} should have
been set before the fit, or \code{datadist()} and
\code{options(datadist)} run before \code{summary}.  For \code{latex},
\code{object} is the result of \code{summary}.
}
\item{\dots}{
For \code{summary}, omit list of variables to estimate effects for all
predictors. Use a list 
of variables of the form \code{age=NA, sex=NA} to estimate using default
ranges. Specify \code{age=50} for example to adjust age to 50 when testing
other factors (this will only matter for factors that interact with age).
Specify e.g. \code{age=c(40,60)} to estimate the effect of increasing age from
40 to 60. Specify \code{age=c(40,50,60)} to let age range from 40 to 60 and
be adjusted to 50 when testing other interacting factors. For category
factors, a single value specifies the reference cell and the adjustment value. For
example, if \code{treat} has levels \code{"a", "b"} and \code{"c"} and \code{treat="b"}
is given to \code{summary}, treatment \code{a} will be compared to \code{b}
and \code{c} will be compared to \code{b}. Treatment \code{b} will be used when estimating
the effect of other factors. Category variables can have category labels
listed (in quotes), or an unquoted number that is a legal level, if all levels 
are numeric.  You need only use the first few
letters of each variable name - enough for unique identification.
For variables not defined with \code{datadist}, you must specify 3 values, none
of which are \code{NA}.

Also represents other arguments to pass to \code{latex}, is ignored for
\code{print}, or other optional arguments passed to \code{confbar}.  The
most important of these is \code{q}, the vector of confidence levels,
and \code{col}, which is a vector corresponding to \code{q} specifying
the colors for the regions of the bars.  \code{q} defaults to
\code{c(.7,.8,.9,.95,.99)} and \code{col} to \code{c(1,.8,.5,.2,.065)}
for UNIX, so that lower confidence levels (inner regions of bars)
correspond to darker shades.  Specify for example \code{col=1:5} to
use actual colors.  For Windows, the default is \code{col=c(1,4,3,2,5)},
which by default represents black, blue, green, red, yellow.
}
\item{est.all}{
Set to \code{FALSE} to only estimate effects of variables listed. Default is \code{TRUE}.
}
\item{antilog}{
Set to \code{FALSE} to suppress printing of anti-logged effects. Default is \code{TRUE}
if the model was fitted by \code{lrm} or \code{cph}.
Antilogged effects will be odds ratios for logistic models and hazard ratios
for proportional hazards models.
}
\item{conf.int}{
Defaults to \code{.95} for \code{95\%} confidence intervals of effects.
}
\item{abbrev}{
Set to \code{TRUE} to use the \code{abbreviate} function to shorten factor levels
for categorical variables in the model.
}
\item{x}{result of \code{summary}}
\item{title}{
\code{title} to pass to \code{latex}.  Default is name of fit object passed to
\code{summary} prefixed with \code{"summary"}.
}

\item{at}{
vector of coordinates at which to put tick mark labels on the main axis.  If
\code{log=TRUE}, \code{at} should be in anti-log units.
}
\item{log}{
Set to \code{TRUE} to plot on \eqn{X\beta}{X beta} scale but labeled with
anti-logs. 
}
\item{q}{scalar or vector of confidence coefficients to depict}
\item{xlim}{
X-axis limits for \code{plot} in units of the linear predictors (log scale
if \code{log=TRUE}).  If \code{at} is specified and \code{xlim} is omitted, \code{xlim} is
derived from the range of \code{at}.
}
\item{nbar}{
Sets up plot to leave room for \code{nbar} horizontal bars.  Default is the
number of non-interaction factors in the model.  Set \code{nbar} to a larger
value to keep too much surrounding space from appearing around horizontal
bars.  If \code{nbar} is smaller than the number of bars, the plot is divided
into multiple pages with up to \code{nbar} bars on each page.
}
\item{cex}{
\code{cex} parameter for factor labels.
}
\item{nint}{
Number of tick mark numbers for \code{pretty}.
}
\item{cex.c}{
\code{cex} parameter for \code{confbar}, for quantile labels.
}
\item{cex.t}{
\code{cex} parameter for main title.  Set to \code{0} to suppress the title.
}
\item{clip}{
confidence limits outside the interval \code{c(clip[1], clip[2])} will be
ignored, and \code{clip} will also be respected when computing \code{xlim}
when \code{xlim} is not specified.  \code{clip} should be in the units of
\code{fun(x)}.  If \code{log=TRUE}, \code{clip} should be in \eqn{X\beta}{X
  beta} units. 
}
\item{main}{
main title.  Default is inferred from the model and value of \code{log},
e.g., \code{"log Odds Ratio"}.
}
}
\value{
For \code{summary.Design}, a matrix of class \code{summary.Design} 
with rows corresponding to factors in
the model and columns containing the low and high values for the effects,
the range for the effects, the effect point estimates (difference in
predicted values for high and low factor values), the standard error
of this effect estimate, and the lower and upper confidence limits.
If \code{fit$scale.pred} has a second level, two rows appear for each factor,
the second corresponding to anti-logged effects. Non-categorical factors
are stored first, and effects for any categorical factors are stored at
the end of the returned matrix.  Also returned are \code{scale.pred} and \code{adjust}.  \code{adjust}
is a character string containing levels of adjustment variables, if
there are any interactions.  Otherwise it is "".
\code{latex.summary.Design} returns an object of class \code{c("latex","file")}.
It requires the \code{latex} function in Hmisc.
}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{datadist}}, \code{\link{Design}}, \code{\link{Design.trans}}, \code{\link{Design.Misc}},
\code{\link[Hmisc]{confbar}}, \code{\link{pretty}}, \code{\link{contrast.Design}}
}
\examples{
n <- 1000    # define sample size
set.seed(17) # so can reproduce the results
age            <- rnorm(n, 50, 10)
blood.pressure <- rnorm(n, 120, 15)
cholesterol    <- rnorm(n, 200, 25)
sex            <- factor(sample(c('female','male'), n,TRUE))
label(age)            <- 'Age'      # label is in Hmisc
label(cholesterol)    <- 'Total Cholesterol'
label(blood.pressure) <- 'Systolic Blood Pressure'
label(sex)            <- 'Sex'
units(cholesterol)    <- 'mg/dl'   # uses units.default in Hmisc
units(blood.pressure) <- 'mmHg'


# Specify population model for log odds that Y=1
L <- .4*(sex=='male') + .045*(age-50) +
  (log(cholesterol - 10)-5.2)*(-2*(sex=='female') + 2*(sex=='male'))
# Simulate binary y to have Prob(y=1) = 1/[1+exp(-L)]
y <- ifelse(runif(n) < plogis(L), 1, 0)


ddist <- datadist(age, blood.pressure, cholesterol, sex)
options(datadist='ddist')


fit <- lrm(y ~ blood.pressure + sex * (age + rcs(cholesterol,4)))


s <- summary(fit)                # Estimate effects using default ranges
                                 # Gets odds ratio for age=3rd quartile
                                 # compared to 1st quartile
\dontrun{
latex(s)                         # Use LaTeX to print nice version
latex(s, file="")                # Just write LaTeX code to screen
}
summary(fit, sex='male', age=60) # Specify ref. cell and adjustment val
summary(fit, age=c(50,70))       # Estimate effect of increasing age from
                                 # 50 to 70
s <- summary(fit, age=c(50,60,70)) 
                                 # Increase age from 50 to 70, adjust to
                                 # 60 when estimating effects of other factors
#Could have omitted datadist if specified 3 values for all non-categorical
#variables (1 value for categorical ones - adjustment level)
plot(s, log=TRUE, at=c(.1,.5,1,1.5,2,4,8))


options(datadist=NULL)
}
\keyword{models}
\keyword{regression}
\keyword{htest}
\keyword{survival}
\keyword{hplot}
\keyword{interface}
\concept{logistic regression model}

\eof
\name{summary.survfit}
\alias{summary.survfit}
\title{Design version of survival Package summary.survfit}
\description{
This is a modified version of the survival package's
\code{summary.survfit} function.  It returns a list containing the
survival curve, confidence limits, and other information.
}
\usage{
\method{summary}{survfit}(object, times, censored = FALSE, scale = 1, \dots)
}
\arguments{
  \item{object}{result of \code{survfit}}
  \item{times}{}
  \item{censored}{}
  \item{scale}{}
  \item{\dots}{see \code{\link[survival]{summary.survfit}}}
}
\value{a list with components \code{time,surv,n.risk,n.event,std.err,conf.int,lower,upper,strata,call,na.action}}
\seealso{\code{\link[survival]{summary.survfit}}}
\examples{
}
\keyword{survival}

\eof
\name{survest.cph}
\alias{survest}
\alias{survest.cph}
\title{
Cox Survival Estimates
}
\description{
Compute survival probabilities and optional confidence limits for
Cox survival models.  If \code{x=TRUE, y=TRUE} were specified to \code{cph},
confidence limits use the correct formula for any combination of
predictors. Otherwise, if \code{surv=TRUE} was specified to \code{cph},
confidence limits are based only on standard errors of \code{log(-log
  S(t))} at the mean value of \eqn{X\beta}{X beta}. If the model
contained only stratification factors, or if predictions are being
requested near the mean of each covariable, this approximation will be
accurate. Unless \code{times} is given, at most one observation may be
predicted.
}
\usage{
survest(fit, \dots)
\method{survest}{cph}(fit, newdata, linear.predictors, x, times, 
        fun, loglog=FALSE, conf.int=0.95, type, vartype,
        conf.type=c("log-log", "log", "plain", "none"), se.fit=TRUE,
        what=c('survival','parallel'),
        individual=FALSE, ...)
}
\arguments{
\item{fit}{
a model fit from \code{cph}
}
\item{newdata}{
a data frame containing predictor variable combinations for which
predictions are desired
}
\item{linear.predictors}{
a vector of linear predictor values (centered) for which predictions
are desired. If the model is stratified, the "strata" attribute
must be attached to this vector (see example).
}
\item{x}{
 a design matrix at which to compute estimates, with any strata attached
 as a "strata" attribute. Only one of \code{newdata},
 \code{linear.predictors}, or \code{x} may be specified.  If none is
 specified, but \code{times} is specified, you will get survival
 predictions at all subjects' linear predictor and strata values.
}
\item{times}{
a vector of times at which to get predictions. If omitted, predictions
are made at all unique failure times in the original input data.
}
\item{loglog}{
set to \code{TRUE} to make the \code{log-log} transformation of survival estimates
and confidence limits.
}
\item{fun}{
any function to transform the estimates and confidence limits (\code{loglog}
is a special case)
}
\item{conf.int}{
set to \code{FALSE} or \code{0} to suppress confidence limits, or e.g. \code{.95} to 
cause 0.95 confidence limits to be computed
}
\item{type}{
see \code{survfit.coxph}
}
\item{vartype}{
see \code{survfit.coxph}
}
\item{conf.type}{
specifies the basis for computing confidence limits. \code{"log-log"},
the default, is the most natural basis.
}
\item{se.fit}{
set to \code{TRUE} to get standard errors of log-log predicted survival
(no matter what \code{conf.type} is).
If \code{FALSE}, confidence limits are suppressed.
}
\item{individual}{
set to \code{TRUE} to have \code{survfit} interpret \code{newdata} as
specifying a covariable path for a single individual (represented by
multiple records).
}
\item{what}{
Normally use \code{what="survival"} to estimate survival probabilities at
times that may not correspond to the subjects' own times.
\code{what="parallel"} assumes that the length of \code{times} is the number of
subjects (or one), and causes \code{survest} to estimate the ith subject's survival
probability at the ith value of \code{times} (or at the scalar value of \code{times}).
\code{what="parallel"} is used by \code{val.surv} for example.
}
\item{\dots}{unused}
}
\value{
  If \code{times} is omitted, returns a list with the elements
  \code{time}, \code{n.risk}, \code{n.event}, \code{surv}, \code{call}
  (calling statement), and optionally \code{std.err}, \code{upper},
  \code{lower}, \code{conf.type}, \code{conf.int}. The estimates in this
  case correspond to one subject. If \code{times} is specified, the
  returned list has possible components \code{time}, \code{surv},
  \code{std.err}, \code{lower}, and \code{upper}. These will be matrices
  (except for \code{time}) if more than one subject is being predicted,
  with rows representing subjects and columns representing \code{times}.
  If \code{times} has only one time, these are reduced to vectors with
  the number of elements equal to the number of subjects.  }
\details{
  The result is passed through \code{naresid} if \code{newdata},
  \code{linear.predictors}, and \code{x} are not specified, to restore
  placeholders for \code{NA}s.
  }
\author{
Frank Harrell\cr
Department of Biostatistics\cr
Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{cph}}, \code{\link{survfit.cph}}, \code{\link{survfit.coxph}}, \code{\link{predict.Design}}, \code{\link{survplot}}
}
\examples{
# Simulate data from a population model in which the log hazard
# function is linear in age and there is no age x sex interaction
# Proportional hazards holds for both variables but we
# unnecessarily stratify on sex to see what happens
n <- 1000
set.seed(731)
age <- 50 + 12*rnorm(n)
label(age) <- "Age"
sex <- factor(sample(c('Male','Female'), n, TRUE))
cens <- 15*runif(n)
h <- .02*exp(.04*(age-50)+.8*(sex=='Female'))
dt <- -log(runif(n))/h
label(dt) <- 'Follow-up Time'
e <- ifelse(dt <= cens,1,0)
dt <- pmin(dt, cens)
units(dt) <- "Year"
dd <- datadist(age, sex)
options(datadist='dd')
Srv <- Surv(dt,e)


f <- cph(Srv ~ age*strat(sex), x=TRUE, y=TRUE) #or surv=T
survest(f, expand.grid(age=c(20,40,60),sex=c("Male","Female")),
	    times=c(2,4,6), conf.int=.9)
f <- update(f, surv=TRUE)
lp <- c(0, .5, 1)
f$strata   # check strata names
attr(lp,'strata') <- rep(1,3)  # or rep('sex=Female',3)
survest(f, linear.predictors=lp, times=c(2,4,6))
options(datadist=NULL)
}
\keyword{models}
\keyword{survival}
\keyword{regression}

\eof
\name{survest.psm}
\alias{survest.psm}
\alias{print.survest.psm}
\title{Parametric Survival Estimates}
\description{
Computes predicted survival probabilities or hazards and optionally confidence
limits (for survival only) for parametric survival models fitted with \code{psm}.
If getting predictions for more than one observation, \code{times} must
be specified. For a model without predictors, no input data are
specified.
}
\usage{
\method{survest}{psm}(fit, newdata, linear.predictors, x, times, fun,
        loglog=FALSE, conf.int=0.95,
        what=c("survival","hazard","parallel"), \dots)

\method{print}{survest.psm}(x, \dots)
}
\arguments{
\item{fit}{
fit from \code{psm}
}
\item{newdata, linear.predictors, x, times, conf.int}{
see \code{survest.cph}. One of \code{newdata}, \code{linear.predictors}, \code{x} must be given.
\code{linear.predictors} includes the intercept.
If \code{times} is omitted, predictions are made at 200 equally spaced points
between 0 and the maximum failure/censoring time used to fit the model.

\code{x} can also be a result from \code{survest.psm}.
}
\item{what}{
The default is to compute survival probabilities.  Set \code{what="hazard"} or
some abbreviation of \code{"hazard"} to compute hazard rates.
\code{what="parallel"} assumes that the length of \code{times} is the number of
subjects (or one), and causes \code{survest} to estimate the
\eqn{i^{th}} subject's survival probability at the \eqn{i^{th}} value of
\code{times} (or at the scalar value of \code{times}). 
\code{what="parallel"} is used by \code{val.surv} for example.
}
\item{loglog}{
set to \code{TRUE} to transform survival estimates and confidence limits using
log-log
}
\item{fun}{
a function to transform estimates and optional confidence intervals
}
\item{\dots}{unused}
}
\value{
see \code{survest.cph}. If the model has no predictors, predictions are
made with respect to varying time only, and the returned object
is of class \code{"survfit"} so the survival curve can be plotted
with \code{survplot.survfit}. If \code{times} is omitted, the
entire survival curve or hazard from \code{t=0,\dots,fit$maxtime} is estimated, with
increments computed to yield 200 points where \code{fit$maxtime} is the
maximum survival time in the data used in model fitting. Otherwise,
the \code{times} vector controls the time points used.
}
\details{
Confidence intervals are based on asymptotic normality of the linear
predictors.
The intervals account for the fact that a scale parameter may have been
estimated jointly with beta.
}
\author{
Frank Harrell\cr
Department of Biostatistics\cr
Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{psm}}, \code{\link[survival]{survreg}}, \code{\link{Design}}, \code{\link[survival]{survfit}}, \code{\link{predict.Design}}, \code{\link{survplot}},
\code{\link[survival]{survreg.distributions}}
}
\examples{
# Simulate data from a proportional hazards population model
n <- 1000
set.seed(731)
age <- 50 + 12*rnorm(n)
label(age) <- "Age"
cens <- 15*runif(n)
h <- .02*exp(.04*(age-50))
dt <- -log(runif(n))/h
label(dt) <- 'Follow-up Time'
e <- ifelse(dt <= cens,1,0)
dt <- pmin(dt, cens)
units(dt) <- "Year"
S <- Surv(dt,e)


f <- psm(S ~ lsp(age,c(40,70)))
survest(f, data.frame(age=seq(20,80,by=5)), times=2)


#Get predicted survival curve for 40 year old
survest(f, data.frame(age=40))


#Get hazard function for 40 year old
survest(f, data.frame(age=40), what="hazard")$surv #still called surv
}
\keyword{survival}
\keyword{regression}
\keyword{models}
% Converted by Sd2Rd version 1.21.

\eof
\name{survfit}
\alias{survfit}
\title{Modified Version of survival Package survfit Function}
\description{
This modification of \code{survfit} keeps attributes of the \code{Surv}
object, uses \code{Hmisc}'s \code{interaction()} to form strata labels,
and uses a default confidence interval basis of log-log.
}
\usage{
survfit(formula, data, weights, subset, na.action = na.delete, conf.type = c("log-log", "log", "plain", "none"), ...)
}
\arguments{
  \item{formula}{}
  \item{data}{}
  \item{weights}{}
  \item{subset}{}
  \item{na.action}{}
  \item{conf.type}{}
  \item{\dots}{see \code{\link[survival]{survfit}}}
}
\value{see \code{\link[survival]{survfit}}}
\seealso{\code{\link[survival]{survfit}}}
\examples{
\dontrun{
#fit a Kaplan-Meier and print the results
data(aml)
survfit(Surv(time, status) ~ x, data=aml)
}
}
\keyword{survival}


\eof
\name{survfit.cph}
\alias{survfit.cph}
\title{
Cox Predicted Survival
}
\description{
This is a slightly modified version of Therneau's \code{survfit.coxph}
function. The difference is that \code{survfit.cph} assumes that \code{x=TRUE,y=TRUE}
were specified to the fit. This assures that the environment in effect
at the time of the fit (e.g., automatic knot estimation for spline functions)
is the same one used for basing predictions. Unlike \code{survfit.coxph},
the default basis for confidence intervals is \code{"log-log"}.
}
\usage{
survfit.cph(object, newdata, se.fit=TRUE, conf.int=0.95, 
            individual=FALSE, type, vartype,
            conf.type=c("log-log", "log", "plain", "none"))
}
\arguments{
\item{object}{
a fit object from \code{cph} or \code{coxph};
see \code{\link[survival]{survfit.coxph}}
}
\item{newdata}{}
\item{se.fit}{}
\item{conf.int}{}
\item{individual}{}
\item{type}{}
\item{vartype}{}
\item{conf.type}{see \code{\link[survival]{survfit}}}
}
\value{
see \code{survfit.coxph}
}
\seealso{
\code{\link{survest.cph}}
}
\keyword{survival}

\eof
\name{survplot}
\alias{survplot}
\alias{survplot.Design}
\alias{survplot.survfit}
\title{
Plot Survival Curves and Hazard Functions
}
\description{
Plot estimated survival curves, and for parametric survival models, plot
hazard functions.  There is an option to print the number of subjects
at risk at the start of each time interval.  Curves are automatically
labeled at the points of maximum separation (using the \code{labcurve}
function), and there are many other options for labeling that can be
specified with the \code{label.curves} parameter.  For example, different
plotting symbols can be placed at constant x-increments and a legend
linking the symbols with category labels can be automatically positioned on
the most empty portion of the plot.
}
\usage{
survplot(fit, \dots)
\method{survplot}{Design}(fit, \dots, xlim,
         ylim=if(loglog) c(-5, 1.5) else if
                 (what == "survival" & missing(fun)) c(0, 1),
         xlab, ylab, time.inc,
         what=c("survival","hazard"),
         type=c("tsiatis","kaplan-meier"),
         conf.type=c("log-log","log","plain","none"),
         conf.int=FALSE, conf=c("bars","bands"),
         add=FALSE, label.curves=TRUE,
         abbrev.label=FALSE, lty, lwd=par("lwd"), col=1,
         adj.subtitle, loglog=FALSE, fun,
         n.risk=FALSE, logt=FALSE, dots=FALSE, dotsize=.003,
         grid=FALSE, srt.n.risk=0, sep.n.risk=0.056, adj.n.risk=1, 
         y.n.risk, cex.n.risk=.6, pr=FALSE)
\method{survplot}{survfit}(fit, xlim, 
         ylim, xlab, ylab, time.inc,
         conf=c("bars","bands","none"), add=FALSE, 
         label.curves=TRUE, abbrev.label=FALSE,
         lty,lwd=par('lwd'),col=1,
         loglog=FALSE,fun,n.risk=FALSE,logt=FALSE,
         dots=FALSE,dotsize=.003,
         grid=FALSE,
         srt.n.risk=0,sep.n.risk=.056,adj.n.risk=1,
         y.n.risk,cex.n.risk=.6, pr=FALSE, \dots)
}
\arguments{
\item{fit}{
result of fit (\code{cph}, \code{psm}, \code{survfit}, \code{survest.psm})
}
\item{\dots}{
list of factors with names used in model. For fits from \code{survfit}, these
arguments do not appear - all strata are plotted. Otherwise the first factor 
listed is
the factor used to determine different survival curves.  Any other factors
are used to specify single constants to be adjusted to, when defaults given
to fitting routine (through \code{limits}) are not used.  
The value given to factors is the original
coding of data given to fit, except that for categorical or strata
factors the text string levels may be specified.  The form
of values given to the first factor are \code{NA} (use default range or list of
all values if variable is discrete), \code{"text"} if factor is categorical,
\code{c(value1, value2, \dots)}, or a function which returns a vector, such as 
\code{seq(low,high,by=increment)}.  \code{NA} may be specified only for the first factor.
In this case the \code{Low effect}, \code{Adjust to}, and \code{High effect} values will
be used from \code{datadist} if the variable is continuous.
For variables not defined to \code{datadist}, you must specify non-missing
constant settings (or a vector of settings for the one displayed variable).
Note that since \code{survfit} objects do not use the variable list in \code{\dots},
you can specify any extra arguments to \code{labcurve} by adding them at the
end of the list of arguments.
}
\item{xlim}{
a vector of two numbers specifying the x-axis range for follow-up time.
Default is \code{(0,maxtime)} where \code{maxtime} was the \code{pretty()}d version
of the maximum follow-up time
in any stratum, stored in \code{fit$maxtime}.  If \code{logt=TRUE},
default is \code{(1, log(maxtime))}.
}
\item{ylim}{
y-axis limits.  Default is \code{c(0,1)} for survival, and \code{c(-5,1.5)} if \code{loglog=TRUE}.
If \code{fun} or \code{loglog=TRUE} are given and \code{ylim} is not, 
the limits will be computed from the data.  For \code{what="hazard"}, default
limits are computed from the first hazard function plotted.
}
\item{xlab}{
x-axis label.  Default is \code{units} attribute of failure time variable given to
\code{Surv}.
}
\item{ylab}{
y-axis label.  Default is \code{"Survival Probability"} or 
\code{"log(-log Survival Probability)"}. If \code{fun} is given, the default
is \code{""}.  For \code{what="hazard"}, the default is \code{"Hazard Function"}.
}
\item{time.inc}{
time increment for labeling the x-axis and printing numbers at risk. 
If not specified, the value
of \code{time.inc} stored with the model fit will be used.
}
\item{type}{
specifies type of estimates, \code{"tsiatis"} (the default) or \code{"kaplan-meier"}.
\code{"tsiatis"} here corresponds to the Breslow
estimator. This is ignored if survival estimates stored with \code{surv=TRUE} are
being used. For fits from \code{survfit}, this argument is also ignored, since
it is specified as an argument to \code{survfit}.
}
\item{conf.type}{
specifies the basis for confidence limits. If estimates stored with \code{surv=TRUE}
are being used, always uses \code{"log-log"}, the default. This argument is
ignored for fits from \code{survfit}.
}
\item{conf.int}{
Default is \code{FALSE}.  Specify e.g. \code{.95} to plot 0.95 confidence bands.
For fits from parametric survival models, or Cox models with \code{x=TRUE} and \code{y=TRUE}
specified to the fit, the exact asymptotic formulas will be used to
compute standard errors, and confidence limits are based on \code{log(-log S(t))}.
If \code{x=TRUE} and \code{y=TRUE} were not specified to \code{cph} but \code{surv=TRUE} was, the
standard errors stored for the underlying survival curve(s) will be used.
These agree with the former if predictions are requested at the mean
value of X beta or if there are only stratification factors in the model.
This argument is ignored for fits from \code{survfit}, which must have previously
specified confidence interval specifications.
}
\item{conf}{
\code{"bars"} for confidence bars at each \code{time.inc} time point. If the fit
was from \code{cph(\dots, surv=TRUE)}, the \code{time.inc} used will be that stored
with the fit. Use \code{conf="bands"} for bands using
standard errors at each failure time. For \code{survfit} objects only,
\code{conf} may also be \code{"none"}, indicating that confidence interval
information stored with the \code{survfit} result should be ignored.
}
\item{what}{
defaults to \code{"survival"} to plot survival estimates.  Set to \code{"hazard"} or
an abbreviation to plot the hazard function (for \code{psm} fits only).
Confidence intervals are not available for \code{what="hazard"}.
}
\item{add}{
set to \code{TRUE} to add curves to an existing plot.
}
\item{label.curves}{
default is \code{TRUE} to use \code{labcurve} to label curves where they are farthest
apart.  Set \code{label.curves} to a \code{list} to specify options to
\code{labcurve}, e.g., \code{label.curves=list(method="arrow", cex=.8)}.
These option names may be abbreviated in the usual way arguments
are abbreviated.  Use for example \code{label.curves=list(keys=1:5)}
to draw symbols (as in \code{pch=1:5} - see \code{points})
on the curves and automatically position a legend
in the most empty part of the plot.  Set \code{label.curves=FALSE} to
suppress drawing curve labels.  The \code{col}, \code{lty}, \code{lwd}, and \code{type}
parameters are automatically passed to \code{labcurve}, although you
can override them here.  To distinguish curves by line types and
still have \code{labcurve} construct a legend, use for example
\code{label.curves=list(keys="lines")}.  The negative value for the
plotting symbol will suppress a plotting symbol from being drawn
either on the curves or in the legend.
}
\item{abbrev.label}{
set to \code{TRUE} to \code{abbreviate()} curve labels that are plotted
}
\item{lty}{
vector of line types to use for different factor levels.  Default is
\code{c(1,3,4,5,6,7,\dots)}.
}
\item{lwd}{
vector of line widths to use for different factor levels.  Default is
current \code{par} setting for \code{lwd}.
}
\item{col}{
color for curve, default is \code{1}.  Specify a vector to assign different
colors to different curves.
}
\item{adj.subtitle}{
set to \code{FALSE} to suppress plotting subtitle with levels of adjustment factors
not plotted. Defaults to \code{TRUE} if there are 4 or fewer adjustment factors.
This argument is ignored for \code{survfit}.
}
\item{loglog}{
set to \code{TRUE} to plot \code{log(-log Survival)} instead of \code{Survival}
}
\item{fun}{
specifies any function to translate estimates and confidence limits
before plotting
}
\item{logt}{
set to \code{TRUE} to plot \code{log(t)} instead of \code{t} on the x-axis
}
\item{n.risk}{
set to \code{TRUE} to add number of subjects at risk for each curve, using the
\code{surv.summary} created by \code{cph} or using the failure times used in
fitting the model if \code{y=TRUE} was specified to the fit or if the fit
was from \code{survfit}.
The numbers are placed at the bottom
of the graph unless \code{y.n.risk} is given. 
If the fit is from \code{survest.psm}, \code{n.risk} does not apply.
}
\item{srt.n.risk}{
angle of rotation for leftmost number of subjects at risk (since this number
may run into the second or into the y-axis).  Default is \code{0}.
}
\item{adj.n.risk}{
justification for leftmost number at risk. Default is \code{1} for right 
justification.
Use \code{0} for left justification, \code{.5} for centered.
}
\item{sep.n.risk}{
multiple of upper y limit - lower y limit for separating lines of text
containing number of subjects at risk.  Default is \code{.056*(ylim[2]-ylim[1])}.
}
\item{y.n.risk}{
When \code{n.risk=TRUE}, the default is to place numbers of patients at risk above
the x-axis.  You can specify a y-coordinate for the bottom line of the
numbers using \code{y.n.risk}.
}
\item{cex.n.risk}{
character size for number of subjects at risk (when \code{n.risk} is \code{TRUE})
}
\item{dots}{
set to \code{TRUE} to plot a grid of dots.  Will be plotted at every \code{time.inc} (see
\code{cph}) and at survival increments of .1 (if \code{d>.4}), .05 (if \code{.2 < d <= .4}), or .025
(if \code{d <= .2}), where \code{d} is the range of survival displayed.
}
\item{dotsize}{
size of dots in inches
}
\item{grid}{
defaults to \code{FALSE}. Set to a color shading to plot faint lines. Set to \code{1}
to plot solid lines.  Default is \code{.05} if \code{TRUE}.
}
\item{pr}{
set to \code{TRUE} to print survival curve coordinates used in the plots
}}
\value{
list with components \code{adjust} (text string specifying adjustment levels)
and \code{curve.labels} (vector of text strings corresponding to levels of factor
used to distinguish curves). For \code{survfit}, the returned value is the
vector of strata labels, or NULL if there are no strata.
}
\section{Side Effects}{
plots. If \code{par()$mar[4]<4}, issues \code{par(mar=)} to increment \code{mar[4]} by 2
if \code{n.risk=TRUE} and \code{add=FALSE}. The user may want to reset \code{par(mar)} in
this case to not leave such a wide right margin for plots. You usually
would issue \code{par(mar=c(5,4,4,2)+.1)}.
}
\details{
\code{survplot} will not work for Cox models with time-dependent covariables.
Use \code{survest} or \code{survfit} for that purpose.


Use \code{ps.slide}, \code{win.slide}, \code{gs.slide} to set up nice defaults for
plotting.  These also set a system option \code{mgp.axis.labels} to allow x
and y-axes to have differing \code{mgp} graphical parameters (see \code{par}).
This is important when labels for y-axis tick marks are to be written
horizontally (\code{par(las=1)}), as a larger gap between the labels and
the tick marks is needed.  You can set the axis-specific 2nd
component of \code{mgp} using \code{mgp.axis.labels(c(xvalue,yvalue))}.
}
\seealso{
\code{\link{datadist}}, \code{\link{Design}}, \code{\link{cph}},
\code{\link{psm}}, \code{\link{survest}}, \code{\link{predict.Design}},
\code{\link{plot.Design}}, 
\code{\link[Hmisc]{units}}, \code{\link[Hmisc]{errbar}},  
\code{\link{survfit}}, \code{\link[survival]{survreg.distributions}},
\code{\link[Hmisc]{labcurve}},
\code{\link[Hmisc]{mgp.axis.labels}}, \code{\link{par}},
\code{\link[Hmisc]{ps.slide}}
}
\examples{
# Simulate data from a population model in which the log hazard
# function is linear in age and there is no age x sex interaction
n <- 1000
set.seed(731)
age <- 50 + 12*rnorm(n)
label(age) <- "Age"
sex <- factor(sample(c('male','female'), n, TRUE))
cens <- 15*runif(n)
h <- .02*exp(.04*(age-50)+.8*(sex=='female'))
dt <- -log(runif(n))/h
label(dt) <- 'Follow-up Time'
e <- ifelse(dt <= cens,1,0)
dt <- pmin(dt, cens)
units(dt) <- "Year"
dd <- datadist(age, sex)
options(datadist='dd')
S <- Surv(dt,e)


#Plot stratified survival curves by sex, adj for quadratic age effect
# with age x sex interaction (2 d.f. interaction)


f <- cph(S ~ pol(age,2)*strat(sex), surv=TRUE)
#or f <- psm(S ~ pol(age,2)*sex)


survplot(f, sex=NA, n.risk=TRUE)           #Adjust age to median
survplot(f, sex=NA, logt=TRUE, loglog=TRUE)   #Check for Weibull-ness (linearity)
survplot(f, sex=c("male","female"), age=50)
                                        #Would have worked without datadist
                                        #or with an incomplete datadist
survplot(f, sex=NA, label.curves=list(keys=c(2,0), point.inc=2))
                                        #Identify curves with symbols


survplot(f, sex=NA, label.curves=list(keys=c('m','f')))
                                        #Identify curves with single letters


#Plots by quintiles of age, adjusting sex to male
options(digits=3)
survplot(f, age=quantile(age,seq(0,1,by=.2)), sex="male")


#Plot Kaplan-Meier survival estimates for males
f <- survfit(S, subset=sex=="male")
survplot(f)


#Plot survival for both sexes
f <- survfit(S ~ sex)
survplot(f)
#Check for log-normal and log-logistic fits
survplot(f, fun=qnorm, ylab="Inverse Normal Transform")
survplot(f, fun=function(y)log(y/(1-y)), ylab="Logit S(t)")


options(datadist=NULL)
}
\keyword{survival}
\keyword{hplot}
\keyword{nonparametric}
\keyword{models}
% Converted by Sd2Rd version 1.21.



\eof
\name{val.prob}
\alias{val.prob}
\alias{val.surv}
\alias{print.val.prob}
\alias{plot.val.prob}
\alias{plot.val.surv}
\title{
Validate Predicted Probabilities
}
\description{
The \code{val.prob} and \code{val.surv} functions are useful for validating
predicted probabilities against binary events and predicted survival
probabilities against right-censored failure times, respectively.
First \code{val.prob} is described.

Given a set of predicted probabilities \code{p} or predicted log odds
\code{logit}, and a vector of binary outcomes \code{y} that were not
used in developing the predictions \code{p} or \code{logit},
\code{val.prob} computes the following indexes and statistics: Somers'
\eqn{D_{xy}} rank correlation between \code{p} and \code{y}
[\eqn{2(C-.5)}, \eqn{C}=ROC area], Nagelkerke-Cox-Snell-Maddala-Magee
R-squared index, Discrimination index \code{D} [ (Logistic model
L.R. \eqn{\chi^2}{chi-square} - 1)/n], L.R. \eqn{\chi^2}{chi-square},
its \eqn{P}-value, Unreliability index \eqn{U}, \eqn{\chi^2}{chi-square}
with 2 d.f.  for testing unreliability (H0: intercept=0, slope=1), its
\eqn{P}-value, the quality index \eqn{Q}, \code{Brier} score (average
squared difference in \code{p} and \code{y}), \code{Intercept}, and
\code{Slope}, and \eqn{E_{max}}=maximum absolute difference in predicted
and calibrated probabilities.  If \code{pl=TRUE}, plots fitted logistic
calibration curve and optionally a smooth nonparametric fit using
\code{lowess(p,y,iter=0)} and grouped proportions vs.  mean predicted
probability in group.  If the predicted probabilities or logits are
constant, the statistics are returned and no plot is made.

When \code{group} is present, different statistics are computed,
different graphs are made, and the object returned by \code{val.prob} is
different.  \code{group} specifies a stratification variable.
Validations are done separately by levels of group and overall.  A
\code{print} method prints summary statistics and several quantiles of
predicted probabilities, and a \code{plot} method plots calibration
curves with summary statistics superimposed, along with selected
quantiles of the predicted probabilities (shown as tick marks on
calibration curves).  Only the \code{lowess} calibration curve is
estimated.  The statistics computed are the average predicted
probability, the observed proportion of events, a 1 d.f. chi-square
statistic for testing for overall mis-calibration (i.e., a test of the
observed vs. the overall average predicted probability of the event)
(\code{ChiSq}), and a 2 d.f. chi-square statistic for testing
simultaneously that the intercept of a linear logistic calibration curve
is zero and the slope is one (\code{ChiSq2}), average absolute
calibration error (average absolute difference between the
\code{lowess}-estimated calibration curve and the line of identity,
labeled \code{Eavg}), \code{Eavg} divided by the difference between the
0.95 and 0.05 quantiles of predictive probabilities (\code{Eavg/P90}), a
"median odds ratio", i.e., the anti-log of the median absolute
difference between predicted and calibrated predicted log odds of the
event (\code{Med OR}), the C-index (ROC area), the Brier quadratic error
score (\code{B}), a chi-square test of goodness of fit based on the
Brier score (\code{B ChiSq}), and the Brier score computed on calibrated rather than raw
predicted probabilities (\code{B cal}).  The first chi-square test is a
test of overall calibration accuracy ("calibration in the large"), and
the second will also detect errors such as slope shrinkage caused by
overfitting or regression to the mean.  See Cox (1970) for both of these
score tests.  The goodness of fit test based on the (uncalibrated) Brier
score is due to Hilden, Habbema, and Bjerregaard (1978) and is discussed
in Spiegelhalter (1986).  When \code{group} is present you can also
specify sampling \code{weights} (usually frequencies), to obtain
weighted calibration curves.

To get the behavior that results from a grouping variable being present
without having a grouping variable, use \code{group=TRUE}.  In the
\code{plot} method, calibration curves are drawn and labeled by default
where they are maximally separated using the \code{labcurve} function.
The following parameters do not apply when \code{group} is present:
\code{pl}, \code{smooth}, \code{logistic.cal}, \code{m}, \code{g},
\code{cuts}, \code{emax.lim}, \code{legendloc}, \code{riskdist},
\code{mkh}, \code{connect.group}, \code{connect.smooth}.  The following
parameters apply to the \code{plot} method but not to \code{val.prob}:
\code{xlab}, \code{ylab}, \code{lim}, \code{statloc}, \code{cex}.

\code{val.surv} uses Cox-Snell (1968) residuals on the cumulative
probability scale to check on the calibration of a survival model
against right-censored failure time data.  If the predicted survival
probability at time \eqn{t} for a subject having predictors \eqn{X} is
\eqn{S(t|X)}, this method is based on the fact that the predicted
probability of failure before time \eqn{t}, \eqn{1 - S(t|X)}, when
evaluated at the subject's actual survival time \eqn{T}, has a uniform
(0,1) distribution.  The quantity \eqn{1 - S(T|X)} is right-censored
when \eqn{T} is.  By getting one minus the Kaplan-Meier estimate of the
distribution of \eqn{1 - S(T|X)} and plotting against the 45 degree line
we can check for calibration accuracy.  A more stringent assessment can
be obtained by stratifying this analysis by an important predictor
variable.  The theoretical uniform distribution is only an approximation
when the survival probabilities are estimates and not population values.

When \code{censor} is specified to \code{val.surv}, a different
validation is done that is more stringent but that only uses the
uncensored failure times.  This method is used for type I censoring when
the theoretical censoring times are known for subjects having uncensored
failure times.  Let \eqn{T}, \eqn{C}, and \eqn{F} denote respectively
the failure time, censoring time, and cumulative failure time
distribution (\eqn{1 - S}).  The expected value of \eqn{F(T | X)} is 0.5
when \eqn{T} represents the subject's actual failure time.  The expected
value for an uncensored time is the expected value of \eqn{F(T | T \leq
C, X) = 0.5 F(C | X)}.  A smooth plot of \eqn{F(T|X) - 0.5 F(C|X)} for
uncensored \eqn{T} should be a flat line through \eqn{y=0} if the model
is well calibrated.  A smooth plot of \eqn{2F(T|X)/F(C|X)} for
uncensored \eqn{T} should be a flat line through \eqn{y=1.0}. The smooth
plot is obtained by smoothing the (linear predictor, difference or
ratio) pairs.
}
\usage{
val.prob(p, y, logit, group, weights=rep(1,length(y)), normwt=FALSE, 
         pl=TRUE, smooth=TRUE, logistic.cal=TRUE,
         xlab="Predicted Probability", ylab="Actual Probability",
         lim=c(0, 1), m, g, cuts, emax.lim=c(0,1),
         legendloc=lim[1] + c(0.55 * diff(lim), 0.27 * diff(lim)), 
         statloc=c(0,0.9), riskdist="calibrated", cex=.75, mkh=.02,
         connect.group=FALSE, connect.smooth=TRUE, g.group=4, 
         evaluate=100, nmin=0)

\method{print}{val.prob}(x, \dots)

\method{plot}{val.prob}(x, xlab="Predicted Probability", 
     ylab="Actual Probability",
     lim=c(0,1), statloc=lim, stats=1:12, cex=.5, 
     lwd.overall=4, quantiles=c(.05,.95), flag, \dots)

val.surv(fit, newdata, S, est.surv, censor)

\method{plot}{val.surv}(x, group, g.group=4, what=c('difference','ratio'),
     type=c('l','b','p'),
     xlab, ylab, xlim, ylim, datadensity=TRUE, \dots)
}
\arguments{
\item{p}{
predicted probability
}
\item{y}{
vector of binary outcomes
}
\item{logit}{
predicted log odds of outcome.  Specify either \code{p} or \code{logit}.
}
\item{fit}{
a fit object created by \code{cph} or \code{psm}
}
\item{group}{
a grouping variable.  If numeric this variable is grouped into
\code{g.group} quantile groups (default is quartiles).  Set \code{group=TRUE} to
use the \code{group} algorithm but with a single stratum for \code{val.prob}.
}
\item{weights}{
an optional numeric vector of per-observation weights (usually frequencies),
used only if \code{group} is given.
}
\item{normwt}{
set to \code{TRUE} to make \code{weights} sum to the number of non-missing observations.
}
\item{pl}{
TRUE to plot calibration curves and optionally statistics
}
\item{smooth}{
plot smooth fit to \code{(p,y)} using \code{lowess(p,y,iter=0)}
}
\item{logistic.cal}{
plot linear logistic calibration fit to \code{(p,y)}
}
\item{xlab}{
x-axis label, default is \code{"Predicted Probability"} for \code{val.prob}.
Other defaults are used by \code{val.surv}.
}
\item{ylab}{
y-axis label, default is \code{"Actual Probability"} for \code{val.prob}.  Other
defaults are used by \code{val.surv}.
}
\item{lim}{
limits for both x and y axes  
}
\item{m}{
If grouped proportions are desired, average no. observations per group
}
\item{g}{
If grouped proportions are desired, number of quantile groups
}
\item{cuts}{
If grouped proportions are desired, actual cut points for constructing
intervals, e.g. \code{c(0,.1,.8,.9,1)} or \code{seq(0,1,by=.2)}
}
\item{emax.lim}{
Vector containing lowest and highest predicted probability over which to
compute \code{Emax}.
}
\item{legendloc}{
If \code{pl=TRUE}, list with components \code{x,y} or vector \code{c(x,y)} for upper left corner
of legend for curves and points.  Default is \code{c(.55, .27)} scaled to
\code{lim}.  Use \code{locator(1)} to use the mouse, \code{FALSE} to suppress legend.
}
\item{statloc}{
\eqn{D_{xy}}, \eqn{C}, \eqn{R^2}, \eqn{D}, \eqn{U}, \eqn{Q}, \code{Brier} score, \code{Intercept}, \code{Slope}, and \eqn{E_{max}}
will be added to plot, using
\code{statloc} as the upper left corner of a box (default is \code{c(0,.9)}).
You can specify a list or a vector.  Use \code{locator(1)}
for the mouse, \code{FALSE} to suppress statistics.  
This is plotted after the curve legends.
}
\item{riskdist}{
Defaults to \code{"calibrated"} to plot the relative frequency distribution of
calibrated probabilities after dividing into 101 bins from \code{lim[1]} to
\code{lim[2]}.
Set to \code{"predicted"} to use raw assigned risk, \code{FALSE} to omit risk distribution.
Values are scaled so that highest bar is \code{0.15*(lim[2]-lim[1])}.
}
\item{cex}{
Character size for legend or for table of statistics when \code{group} is given
}
\item{mkh}{
Size of symbols for legend.   Default is 0.02 (see \code{par()}).
}
\item{connect.group}{
Defaults to \code{FALSE} to only represent group fractions as triangles.
Set to \code{TRUE} to also connect with a solid line.
}
\item{connect.smooth}{
Defaults to \code{TRUE} to draw smoothed estimates using a dashed line.
Set to \code{FALSE} to instead use dots at individual estimates.
}
\item{g.group}{
number of quantile groups to use when \code{group} is given and variable is
numeric.
}
\item{evaluate}{
number of points at which to store the \code{lowess}-calibration curve.
Default is 100.  If there are more than \code{evaluate} unique predicted
probabilities, \code{evaluate} equally-spaced quantiles of the unique
predicted probabilities, with linearly interpolated calibrated values,
are retained for plotting (and stored in the object returned by
\code{val.prob}).
}
\item{nmin}{
applies when \code{group} is given.  When \code{nmin} \eqn{> 0}, \code{val.prob} will not
store coordinates of smoothed calibration curves in the outer tails,
where there are fewer than \code{nmin} raw observations represented in
those tails.  If for example \code{nmin}=50, the \code{plot} function will only
plot the estimated calibration curve from \eqn{a} to \eqn{b}, where there are
50 subjects with predicted probabilities \eqn{< a} and \eqn{> b}.
\code{nmin} is ignored when computing accuracy statistics.
}
\item{x}{result of \code{val.prob} (with \code{group} in effect) or
  \code{val.surv}}
\item{\dots}{
optional arguments for \code{labcurve} (through \code{plot}).  Commonly used
options are \code{col} (vector of colors for the strata plus overall) and
\code{lty}.  Ignored for \code{print}.
}
\item{stats}{
vector of column numbers of statistical indexes to write on plot
}
\item{lwd.overall}{
line width for plotting the overall calibration curve
}
\item{quantiles}{
a vector listing which quantiles should be indicated on each
calibration curve using tick marks.  The values in \code{quantiles} can be
any number of values from the following: .01, .025, .05, .1, .25, .5, .75, .9, .95, .975, .99.
By default the 0.05 and 0.95 quantiles are indicated.
}
\item{flag}{
a function of the matrix of statistics (rows representing groups)
returning a vector of character strings (one value for each group, including
"Overall").  \code{plot.val.prob}
will print this vector of character values to the left of the
statistics.  The \code{flag} function 
can refer to columns of the matrix used as input to the function by
their names given in the description above.  The default function
returns \code{"*"} if either \code{ChiSq2} or \code{B ChiSq} is significant at the
0.01 level and \code{" "} otherwise.
}
\item{newdata}{
a data frame for which \code{val.surv} should obtain predicted survival
probabilities.  If omitted, survival estimates are made for all of the
subjects used in \code{fit}.
}
\item{S}{a \code{\link[survival]{Surv}} object}
\item{est.surv}{
a vector of estimated survival probabilities corresponding to times in
the first column of \code{S}.
}
\item{censor}{
a vector of censoring times.  Only the censoring times for uncensored
observations are used.
}
\item{what}{
the quantity to plot when \code{censor} was in effect.  The default is to
show the difference between cumulative probabilities and their
expectation given the censoring time.  Set \code{what="ratio"} to show the
ratio instead.
}
\item{type}{
Set to the default (\code{"l"}) to plot the trend line only, \code{"b"} to plot
both individual subjects ratios and trend lines, or \code{"p"} to plot only points.
}
\item{xlim}{
}
\item{ylim}{
axis limits for \code{plot.val.surv} when the \code{censor} variable was used.
}
\item{datadensity}{
By default, \code{plot.val.surv} will show the data density on each curve
that is created as a result of \code{censor} being present.  Set
\code{datadensity=FALSE} to suppress these tick marks drawn by \code{scat1d}.
}}
\value{
\code{val.prob} without \code{group} returns a vector with the following named
elements: \code{Dxy}, \code{R2}, \code{D}, \code{D:Chi-sq}, \code{D:p},
\code{U}, \code{U:Chi-sq}, \code{U:p}, \code{Q}, \code{Brier}, \code{Intercept}, \code{Slope}, \code{Emax}.
When \code{group} is present \code{val.prob} returns an object of class
\code{val.prob} containing a list with summary statistics and calibration
curves for all the strata plus \code{"Overall"}.  
}
\details{
The 2 d.f. \eqn{\chi^2}{chi-square} test and \code{Med OR} exclude predicted or
calibrated predicted probabilities \eqn{\leq 0} or \eqn{\geq 1},
adjusting the sample size as needed.
}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\section{Side Effects}{
plots calibration curve
}
\references{
Harrell FE, Lee KL, Mark DB (1996): Multivariable prognostic models:
Issues in developing models, evaluating assumptions and adequacy, and
measuring and reducing errors.  Stat in Med 15:361--387.


Harrell FE, Lee KL (1987):  Using logistic calibration to assess the
accuracy of probability predictions (Technical Report).


Miller ME, Hui SL, Tierney WM (1991): Validation techniques for
logistic regression models.  Stat in Med 10:1213--1226.


Harrell FE, Lee KL (1985):  A comparison of the
\emph{discrimination}
of discriminant analysis and logistic regression under multivariate
normality.  In Biostatistics: Statistics in Biomedical, Public Health,
and Environmental Sciences.  The Bernard G. Greenberg Volume, ed. PK
Sen. New York: North-Holland, p. 333--343.


Cox DR (1970): The Analysis of Binary Data, 1st edition, section 4.4.
London: Methuen.


Spiegelhalter DJ (1986): Probabilistic prediction in patient management.
Stat in Med 5:421--433.


Cox DR, Snell EJ (1968): A general definition of residuals (with
discussion).  JRSSB 30:248--275.
}
\seealso{
\code{\link{validate.lrm}}, \code{\link{lrm.fit}}, \code{\link{lrm}}, \code{\link[Hmisc]{labcurve}}, \code{\link[Hmisc]{wtd.rank}},
\code{\link[Hmisc]{wtd.loess.noiter}}, \code{\link[Hmisc]{scat1d}}, \code{\link{cph}}, \code{\link{psm}}
}
\examples{
# Fit logistic model on 100 observations simulated from the actual 
# model given by Prob(Y=1 given X1, X2, X3) = 1/(1+exp[-(-1 + 2X1)]),
# where X1 is a random uniform [0,1] variable.  Hence X2 and X3 are 
# irrelevant.  After fitting a linear additive model in X1, X2,
# and X3, the coefficients are used to predict Prob(Y=1) on a
# separate sample of 100 observations.


set.seed(1)
n <- 200
x1 <- runif(n)
x2 <- runif(n)
x3 <- runif(n)
logit <- 2*(x1-.5)
P <- 1/(1+exp(-logit))
y <- ifelse(runif(n)<=P, 1, 0)
d <- data.frame(x1,x2,x3,y)
f <- lrm(y ~ x1 + x2 + x3, subset=1:100)
pred.logit <- predict(f, d[101:200,])
phat <- 1/(1+exp(-pred.logit))
val.prob(phat, y[101:200], m=20, cex=.5)  # subgroups of 20 obs.


# Validate predictions more stringently by stratifying on whether
# x1 is above or below the median


v <- val.prob(phat, y[101:200], group=x1[101:200], g.group=2)
v
plot(v)
plot(v, flag=function(stats) ifelse(
  stats[,'ChiSq2'] > qchisq(.95,2) |
  stats[,'B ChiSq'] > qchisq(.95,1), '*', ' ') )
# Stars rows of statistics in plot corresponding to significant
# mis-calibration at the 0.05 level instead of the default, 0.01


plot(val.prob(phat, y[101:200], group=x1[101:200], g.group=2), 
              col=1:3) # 3 colors (1 for overall)


# Weighted calibration curves
# plot(val.prob(pred, y, group=age, weights=freqs))


# Survival analysis examples
# Generate failure times from an exponential distribution
set.seed(123)              # so can reproduce results
n <- 2000
age <- 50 + 12*rnorm(n)
sex <- factor(sample(c('Male','Female'), n, rep=TRUE, prob=c(.6, .4)))
cens <- 15*runif(n)
h <- .02*exp(.04*(age-50)+.8*(sex=='Female'))
t <- -log(runif(n))/h
label(t) <- 'Time to Event'
ev <- ifelse(t <= cens, 1, 0)
t <- pmin(t, cens)
S <- Surv(t, ev)


# First validate true model used to generate data
w <- val.surv(est.surv=exp(-h*t), S=S)
plot(w)
plot(w, group=sex)  # stratify by sex


# Now fit an exponential model and validate
# Note this is not really a validation as we're using the
# training data here
f <- psm(S ~ age + sex, dist='exponential', y=TRUE)
w <- val.surv(f)
plot(w, group=sex)


# We know the censoring time on every subject, so we can
# compare the predicted Pr[T <= observed T | T <= C, X] to
# its expectation 0.5 Pr[T <= C | X] where C = censoring time
# By default we plot the difference, which should be zero
# (use what="ratio" to plot a ratio that should equal one)
w <- val.surv(f, censor=cens)
plot(w)


plot(w, group=age, g=3)   # stratify by tertile of age
}
\keyword{models}
\keyword{regression}
\keyword{htest}
\keyword{smooth}
\keyword{survival}
\concept{model validation}
\concept{predictive accuracy}
\concept{logistic regression model}
\concept{sampling}

\eof
\name{validate}
\alias{validate}
\title{
Resampling Validation of a Fitted Model's Indexes of Fit
}
\description{
The \code{validate} function when used on an object created by one of the
\code{Design} series does resampling validation of a 
regression model, with or without backward step-down variable deletion.
It provides bias-corrected indexes that are specific to each type
of model. For \code{validate.cph} and \code{validate.psm}, see \code{validate.lrm},
which is similar. For \code{validate.cph} and \code{validate.psm}, there is
an extra argument \code{dxy}, which if \code{TRUE} causes the \code{rcorr.cens}
function to be invoked to compute the Somers' \eqn{D_{xy}} rank correlation
at each resample (this takes a bit longer than
the likelihood based statistics).  For \code{validate.cph} with \code{dxy=TRUE},
you must specify an argument \code{u} if the model is stratified, since
survival curves can then cross and \eqn{X\beta}{X beta} is not 1-1 with
predicted survival.   There is also a \code{validate} method for
\code{tree}, which only does cross-validation and which has a different
list of arguments. 
}
\usage{
# fit <- fitting.function(formula=response ~ terms, x=TRUE, y=TRUE)
validate(fit, method="boot", B=40,
         bw=FALSE, rule="aic", type="residual", sls=0.05, aics=0, 
         pr=FALSE, \dots)
}
\arguments{
\item{fit}{
a fit derived by e.g. \code{lrm}, \code{cph}, \code{psm}, \code{ols}. The options \code{x=TRUE} and \code{y=TRUE}
must have been specified.
}
\item{method}{
may be \code{"crossvalidation"}, \code{"boot"} (the default), \code{".632"}, or
\code{"randomization"}.
See \code{predab.resample} for details.  Can abbreviate, e.g.
\code{"cross", "b", ".6"}.
}
\item{B}{
number of repetitions.  For \code{method="crossvalidation"}, is the
number of groups of omitted observations.
}
\item{bw}{
\code{TRUE} to do fast step-down using the \code{fastbw} function, for both the overall model and for each repetition. \code{fastbw} keeps parameters
together that represent the same factor.
}
\item{rule}{
Applies if \code{bw=TRUE}.  \code{"aic"} to use Akaike's information criterion as a
stopping rule (i.e., a factor is deleted if the \eqn{\chi^2}{chi-square} falls below
twice its degrees of freedom), or \code{"p"} to use \eqn{P}-values.
}
\item{type}{
\code{"residual"} or \code{"individual"} - stopping rule is for individual factors or
for the residual \eqn{\chi^2}{chi-square} for all variables deleted
}
\item{sls}{
significance level for a factor to be kept in a model, or for judging the
residual \eqn{\chi^2}{chi-square}.
}
\item{aics}{
cutoff on AIC when \code{rule="aic"}.
}
\item{pr}{
\code{TRUE} to print results of each repetition
}
\item{\dots}{
parameters for each specific validate function, and parameters to
pass to \code{predab.resample} (note especially the \code{group}, \code{cluster}, and \code{subset} parameters).
For \code{psm}, you can pass the \code{maxiter} parameter here (passed to 
\code{survreg.control}, default is 15 iterations) as well as a \code{tol} parameter 
for judging matrix singularity in
\code{solvet} (default is 1e-12) and a \code{rel.tolerance} parameter that is passed
to \code{survreg.control} (default is 1e-5).
}}
\value{
a matrix with rows corresponding to the statistical indexes and
columns for the original index, resample estimates, 
indexes applied to
the whole or omitted sample using the model derived from the resample,
average optimism, corrected index, and number of successful re-samples.
}
\section{Side Effects}{
prints a summary, and optionally statistics for each re-fit
}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{validate.ols}}, \code{\link{validate.cph}}, \code{\link{validate.lrm}}, \code{\link{validate.tree}},
\code{\link{predab.resample}}, \code{\link{fastbw}}, \code{\link{Design}}, \code{\link{Design.trans}}, \code{\link{calibrate}}
}
\examples{
# See examples for validate.cph, validate.lrm, validate.ols
# Example of validating a parametric survival model:


n <- 1000
set.seed(731)
age <- 50 + 12*rnorm(n)
label(age) <- "Age"
sex <- factor(sample(c('Male','Female'), n, TRUE))
cens <- 15*runif(n)
h <- .02*exp(.04*(age-50)+.8*(sex=='Female'))
dt <- -log(runif(n))/h
e <- ifelse(dt <= cens,1,0)
dt <- pmin(dt, cens)
units(dt) <- "Year"
S <- Surv(dt,e)


f <- psm(S ~ age*sex, x=TRUE, y=TRUE)  # Weibull model
# Validate full model fit
validate(f, B=10)                # usually B=150


# Validate stepwise model with typical (not so good) stopping rule
# bw=TRUE does not preserve hierarchy of terms at present
validate(f, B=10, bw=TRUE, rule="p", sls=.1, type="individual")
}
\keyword{models}
\keyword{regression}
\keyword{methods}
\keyword{survival}
\concept{model validation}
\concept{predictive accuracy}
\concept{bootstrap}

\eof
\name{validate.cph}
\alias{validate.cph}
\alias{validate.psm}
\title{
Validation of a Fitted Cox or Parametric Survival Model's Indexes of Fit
}
\description{
This is the version of the \code{validate} function specific to models
fitted with \code{cph} or \code{psm}. Statistics validated include the
Nagelkerke \eqn{R^2}, 
\eqn{D_{xy}}, slope shrinkage,  the
discrimination index \eqn{D} [(model L.R. \eqn{\chi^2}{chi-square} - 1)/L], the unreliability index
\eqn{U} = (difference in -2 log likelihood between uncalibrated
\eqn{X\beta}{X beta} and  
\eqn{X\beta}{X beta} with overall slope calibrated to test sample) / L,
and the overall quality index \eqn{Q = D - U}. 
L is -2 log likelihood with beta=0.  The "corrected" slope
can be thought of as a shrinkage factor that takes into account overfitting.
See \code{predab.resample} for the list of resampling methods.
}
\usage{
# fit <- cph(formula=Surv(ftime,event) ~ terms, x=TRUE, y=TRUE, \dots)
\method{validate}{cph}(fit,method="boot",
                         B=40,bw=FALSE,rule="aic",type="residual",
                         sls=.05,aics=0,pr=FALSE,dxy=FALSE,u,tol=1e-9, \dots)

\method{validate}{psm}(fit, method="boot",B=40,
                   bw=FALSE,rule="aic",type="residual",sls=.05,aics=0,pr=FALSE,
                   dxy=FALSE,tol=1e-12, rel.tolerance=1e-5, maxiter=15, \dots)
}
\arguments{
\item{fit}{
a fit derived by \code{cph}. The options \code{x=TRUE} and \code{y=TRUE}
must have been specified. If the model contains any stratification factors
and \code{dxy=TRUE},
the options \code{surv=TRUE} and \code{time.inc=u} must also have been given,
where \code{u} is the same value of \code{u} given to \code{validate}.
}
\item{method}{see \code{\link{validate}}}
\item{B}{}
\item{rel.tolerance}{}
\item{maxiter}{}
\item{bw}{}
\item{rule}{}
\item{type}{}
\item{sls}{}
\item{aics}{}
\item{pr}{}
\item{tol}{}
\item{...}{see \code{\link{validate}} or \code{\link{predab.resample}}}
\item{dxy}{
set to \code{TRUE} to validate Somers' \eqn{D_{xy}}  using
\code{rcorr.cens}, which takes longer.
}
\item{u}{
must be specified if the model has any stratification factors and \code{dxy=TRUE}.
In that case, strata are not included in \eqn{X\beta}{X beta} and the
survival curves may cross.  Predictions at time \code{t=u} are
correlated with observed survival times.  Does not apply to
\code{validate.psm}.
}
}
\value{
matrix with rows corresponding to \eqn{D_{xy}}, Slope, \eqn{D},
\eqn{U}, and \eqn{Q}, and columns for the original index, resample estimates, 
indexes applied to whole or omitted sample using model derived from resample, average optimism, corrected index, and number
of successful resamples.
}
\section{Side Effects}{
prints a summary, and optionally statistics for each re-fit (if \code{pr=TRUE})
}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{validate}}, \code{\link{predab.resample}}, \code{\link{fastbw}}, \code{\link{Design}}, \code{\link{Design.trans}}, \code{\link{calibrate}},
\code{\link[Hmisc]{rcorr.cens}}, \code{\link{cph}}, \code{\link[survival]{coxph.fit}}
}
\examples{
n <- 1000
set.seed(731)
age <- 50 + 12*rnorm(n)
label(age) <- "Age"
sex <- factor(sample(c('Male','Female'), n, TRUE))
cens <- 15*runif(n)
h <- .02*exp(.04*(age-50)+.8*(sex=='Female'))
dt <- -log(runif(n))/h
e <- ifelse(dt <= cens,1,0)
dt <- pmin(dt, cens)
units(dt) <- "Year"
S <- Surv(dt,e)


f <- cph(S ~ age*sex, x=TRUE, y=TRUE)
# Validate full model fit
validate(f, B=10)               # normally B=150


# Validate a model with stratification.  Dxy is the only
# discrimination measure for such models, but Dxy requires
# one to choose a single time at which to predict S(t|X)
f <- cph(S ~ rcs(age)*strat(sex), 
         x=TRUE, y=TRUE, surv=TRUE, time.inc=2)
validate(f, dxy=TRUE, u=2, B=10)   # normally B=150
# Note u=time.inc
}
\keyword{models}
\keyword{regression}
\keyword{survival}
\concept{model validation}
\concept{predictive accuracy}
\concept{bootstrap}

\eof
\name{validate.lrm}
\alias{validate.lrm}
\title{
Resampling Validation of a Logistic Model
}
\description{
The \code{validate} function when used on an object created by \code{lrm}
does resampling validation of a logistic
regression model, with or without backward step-down variable deletion.
It provides bias-corrected Somers' \eqn{D_{xy}} rank correlation, 
R-squared index, the intercept and slope of an overall logistic
calibration equation, the maximum absolute difference in predicted and
calibrated probabilities \eqn{E_{max}}, the discrimination index \eqn{D}
[(model L.R. \eqn{\chi^2}{chi-square} - 1)/n], the unreliability
index \eqn{U} = 
(difference in -2 log likelihood between un-calibrated \eqn{X\beta}{X
  beta} and \eqn{X\beta}{X beta} with overall intercept and slope
calibrated to test sample) / n,  
the overall quality index (logarithmic probability score) \eqn{Q = D - U},
and the Brier or quadratic probability score, \eqn{B} (the last 3 are not
computed for ordinal models).  The
corrected slope can be thought of as a shrinkage factor that takes
into account overfitting. 
}
\usage{
# fit <- lrm(formula=response ~ terms, x=TRUE, y=TRUE)
\method{validate}{lrm}(fit, method="boot", B=40,
         bw=FALSE, rule="aic", type="residual", sls=0.05, aics=0, 
         pr=FALSE,  kint, Dxy.method=if(k==1) 'somers2' else 'lrm',
         emax.lim=c(0,1), \dots)
}
\arguments{
\item{fit}{
a fit derived by \code{lrm}. The options \code{x=TRUE} and \code{y=TRUE}
must have been specified.
}
\item{method}{}
\item{B}{}
\item{bw}{}
\item{rule}{}
\item{type}{}
\item{sls}{}
\item{aics}{}
\item{pr}{see \code{\link{validate}} and \code{\link{predab.resample}}}
\item{kint}{
In the case of an ordinal model, specify which intercept to validate.
Default is the middle intercept.
}
\item{Dxy.method}{
\code{"lrm"} to use \code{lrm}'s computation of \eqn{D_{xy}} correlation,
which rounds 
predicted probabilities to nearest .002.  Use \code{Dxy.method="somers2"} (the
default) to instead use the more accurate but slower \code{somers2} function.  This
will matter most when the model is extremely predictive.
The default is \code{"lrm"} for ordinal models, since \code{somers2} only handles
binary response variables.
}
\item{emax.lim}{
range of predicted probabilities over which to compute the maximum error.  Default is entire range.
}
\item{\dots}{
other arguments to pass to \code{lrm.fit} (now only \code{maxit} and \code{tol} are
allowed) and to \code{predab.resample} (note especially the \code{group},
\code{cluster}, and \code{subset} parameters)
}}
\value{
a matrix with rows corresponding to \eqn{D_{xy}},
\eqn{R^2}, \code{Intercept}, \code{Slope}, \eqn{E_{max}}, \eqn{D},
\eqn{U}, \eqn{Q}, and \eqn{B}, and
columns for the original index, resample estimates, indexes applied to
the whole or omitted sample using the model derived from the resample,
average optimism, corrected index, and number of successful re-samples.
For ordinal models, \eqn{U, Q, B} do not appear.
}
\section{Side Effects}{
prints a summary, and optionally statistics for each re-fit
}
\details{
If the original fit was created using penalized maximum likelihood estimation,
the same \code{penalty.matrix} used with the original
fit is used during validation.
}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\references{
Miller ME, Hui SL, Tierney WM (1991): Validation techniques for
logistic regression models.  Stat in Med 10:1213--1226.


Harrell FE, Lee KL (1985):  A comparison of the
\emph{discrimination}
of discriminant analysis and logistic regression under multivariate
normality.  In Biostatistics: Statistics in Biomedical, Public Health,
and Environmental Sciences.  The Bernard G. Greenberg Volume, ed. PK
Sen. New York: North-Holland, p. 333--343.
}
\seealso{
\code{\link{predab.resample}}, \code{\link{fastbw}}, \code{\link{lrm}}, \code{\link{Design}}, \code{\link{Design.trans}}, \code{\link{calibrate}},
\code{\link[Hmisc]{somers2}}, \code{\link{cr.setup}}
}
\examples{
n <- 1000    # define sample size
age            <- rnorm(n, 50, 10)
blood.pressure <- rnorm(n, 120, 15)
cholesterol    <- rnorm(n, 200, 25)
sex            <- factor(sample(c('female','male'), n,TRUE))


# Specify population model for log odds that Y=1
L <- .4*(sex=='male') + .045*(age-50) +
  (log(cholesterol - 10)-5.2)*(-2*(sex=='female') + 2*(sex=='male'))
# Simulate binary y to have Prob(y=1) = 1/[1+exp(-L)]
y <- ifelse(runif(n) < plogis(L), 1, 0)


f <- lrm(y ~ sex*rcs(cholesterol)+pol(age,2)+blood.pressure, x=TRUE, y=TRUE)
#Validate full model fit
validate(f, B=10)              # normally B=150
validate(f, B=10, group=y)  
# two-sample validation: make resamples have same numbers of
# successes and failures as original sample


#Validate stepwise model with typical (not so good) stopping rule
validate(f, B=10, bw=TRUE, rule="p", sls=.1, type="individual")


\dontrun{
#Fit a continuation ratio model and validate it for the predicted
#probability that y=0
u <- cr.setup(y)
Y <- u$y
cohort <- u$cohort
attach(mydataframe[u$subs,])
f <- lrm(Y ~ cohort+rcs(age,4)*sex, penalty=list(interaction=2))
validate(f, cluster=u$subs, subset=cohort=='all') 
#see predab.resample for cluster and subset
}
}
\keyword{models}
\keyword{regression}
\concept{logistic regression model}
\concept{model validation}
\concept{predictive accuracy}
\concept{bootstrap}

\eof
\name{validate.ols}
\alias{validate.ols}
\title{
Validation of an Ordinary Linear Model
}
\description{
The \code{validate} function when used on an object created by \code{ols}
does resampling validation of a multiple linear
regression model, with or without backward step-down variable deletion.
Uses resampling to estimate the optimism in various measures of
predictive accuracy which include \eqn{R^2}, \eqn{MSE} (mean squared error with
a denominator of \eqn{n}),
and the intercept
and slope of an overall calibration \eqn{a + b\hat{y}}{a + b * (predicted y)}.  The "corrected" slope
can be thought of as a shrinkage factor that takes into account overfitting.
\code{validate.ols} can also be used when a model for a continuous response
is going to be applied to a binary response. A Somers' \eqn{D_{xy}} for this case
is computed for each resample by dichotomizing \code{y}. This can be used to
obtain an ordinary receiver operating characteristic curve area using
the formula \eqn{0.5(D_{xy} + 1)}. The Nagelkerke-Maddala \eqn{R^2} index for
the dichotomized \code{y} is also given.
See \code{predab.resample} for the list of resampling methods.
}
\usage{
# fit <- fitting.function(formula=response ~ terms, x=TRUE, y=TRUE)
\method{validate}{ols}(fit, method="boot", B=40,
         bw=FALSE, rule="aic", type="residual", sls=0.05, aics=0, 
         pr=FALSE, u=NULL, rel=">", tolerance=1e-7, \dots)
}
\arguments{
\item{fit}{
a fit derived by \code{ols}. The options \code{x=TRUE} and \code{y=TRUE}
must have been specified.  See \code{validate} for a description of
arguments \code{method} - \code{pr}.
}
\item{method}{}
\item{B}{}
\item{bw}{}
\item{rule}{}
\item{type}{}
\item{sls}{}
\item{aics}{}
\item{pr}{see \code{\link{validate}} and \code{\link{predab.resample}}}
\item{u}{
If specified, \code{y} is also dichotomized at the cutoff \code{u} for
the purpose of getting a bias-corrected estimate of \eqn{D_{xy}}.
}
\item{rel}{
relationship for dichotomizing predicted \code{y}. Defaults to
\code{">"} to use \code{y>u}. \code{rel} can also be \code{"<"},
\code{">="}, and \code{"<="}. 
}
\item{tolerance}{
tolerance for singularity; passed to \code{lm.fit.qr}.
}
\item{\dots}{
other arguments to pass to \code{predab.resample}, such as \code{group}, \code{cluster}, and \code{subset}
}}
\value{
matrix with rows corresponding to R-square, MSE, intercept, slope, and 
optionally \eqn{D_{xy}} and \eqn{R^2}, and
columns for the original index, resample estimates, 
indexes applied to whole or omitted sample using model derived from resample, average optimism, corrected index, and number
of successful resamples.
}
\section{Side Effects}{
prints a summary, and optionally statistics for each re-fit
}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link{ols}}, \code{\link{predab.resample}}, \code{\link{fastbw}}, \code{\link{Design}}, \code{\link{Design.trans}}, \code{\link{calibrate}}
}
\examples{
set.seed(1)
x1 <- runif(200)
x2 <- sample(0:3, 200, TRUE)
x3 <- rnorm(200)
distance <- (x1 + x2/3 + rnorm(200))^2


f <- ols(sqrt(distance) ~ rcs(x1,4) + scored(x2) + x3, x=TRUE, y=TRUE)


#Validate full model fit (from all observations) but for x1 < .75
validate(f, B=20, subset=x1 < .75)   # normally B=150


#Validate stepwise model with typical (not so good) stopping rule
validate(f, B=20, bw=TRUE, rule="p", sls=.1, type="individual")
}
\keyword{models}
\keyword{regression}
\concept{model validation}
\concept{bootstrap}
\concept{predictive accuracy}

\eof
\name{validate.tree}
\alias{validate.tree}
\alias{validate.rpart}
\alias{print.validate.tree}
\alias{plot.validate.tree}
\title{
Dxy and Mean Squared Error by Cross-validating a Tree Sequence
}
\description{
Uses \code{xval}-fold cross-validation of a sequence of trees to derive
estimates of the mean squared error and Somers' \code{Dxy} rank correlation
between predicted and observed responses.  In the case of a binary response
variable, the mean squared error is the Brier accuracy score.
This function is a modification of \code{cv.tree} which should be
consulted for details.  There are \code{print} and \code{plot} methods for
objects created by \code{validate.tree}.
}
\usage{
# f <- tree(formula=y ~ x1 + x2 + \dots) # or rpart
\method{validate}{tree}(fit, method, B, bw, rule, type, sls, aics, pr=TRUE,
    k, rand, xval=10, FUN, \dots)
\method{validate}{rpart}(fit, \dots)
\method{print}{validate.tree}(x, \dots)
\method{plot}{validate.tree}(x, what=c("mse","dxy"), legendloc=locator, \dots)
}
\arguments{
\item{fit}{
an object created by \code{tree} or \code{rpart} or having the same
attributes as one created by \code{tree}.  If it was created by
\code{rpart} you must have specified the \code{model=TRUE} argument to
\code{rpart}. 
}
\item{method,B,bw,rule,type,sls,aics}{are there only for consistency
  with the generic \code{validate} function; these are ignored}
\item{x}{the result of \code{validate.tree}}
\item{k}{
a sequence of cost/complexity values.  By default these are obtained
from calling \code{FUN} with no optional arguments (if \code{tree}) or
from the \code{rpart} \code{cptable} object in the original fit object.
You may also specify a scalar or vector.
}
\item{rand}{
see \code{cv.tree}
}
\item{xval}{
number of splits
}
\item{FUN}{
the name of a function which produces a sequence of trees, such
as \code{prune.tree} or \code{shrink.tree} or \code{prune.rpart}.  Default is
\code{prune.tree} for fits from \code{tree} and \code{prune.rpart} for fits from \code{rpart}.
}
\item{\dots}{
additional arguments to \code{FUN} (ignored by \code{print,plot}).  For
\code{validate.rpart}, \dots can be the same arguments used in
\code{validate.tree}.
}
\item{pr}{
set to \code{FALSE} to prevent intermediate results for each \code{k} from being printed
}
\item{what}{
a vector of things to plot.  By default, 2 plots will be done, one for
\code{mse} and one for \code{Dxy}.
}
\item{legendloc}{
a function that is evaluated with a single argument equal to \code{1} to
generate a list with components \code{x, y} specifying coordinates of the
upper left corner of a legend, or a 2-vector.  For the latter,
\code{legendloc} specifies the relative fraction of the plot at which to
center the legend.
}}
\value{
a list of class \code{"validate.tree"} with components named \code{k, size, dxy.app},
\code{dxy.val, mse.app, mse.val, binary, xval}.  \code{size} is the number of nodes,
\code{dxy} refers to Somers' \code{D}, \code{mse} refers to mean squared error of prediction,
\code{app} means apparent accuracy on training samples, \code{val} means validated
accuracy on test samples, \code{binary} is a logical variable indicating whether
or not the response variable was binary (a logical or 0/1 variable is
binary).  \code{size} will not be present if the user specifies \code{k}.
}
\section{Side Effects}{
prints if \code{pr=TRUE}
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\seealso{
\code{\link[rpart]{rpart}}, \code{\link[Hmisc]{somers2}},
\code{\link[Hmisc]{rcorr.cens}}, \code{\link[tree]{cv.tree}},
\code{\link{locator}}, \code{\link{legend}}
}
\examples{
\dontrun{
n <- 100
set.seed(1)
x1 <- runif(n)
x2 <- runif(n)
x3 <- runif(n)
y  <- 1*(x1+x2+rnorm(n) > 1)
table(y)
library(rpart)
f <- rpart(y ~ x1 + x2 + x3, model=TRUE)
v <- validate(f)
v    # note the poor validation
par(mfrow=c(1,2))
plot(v, legendloc=c(.2,.5))
par(mfrow=c(1,1))
}
}
\keyword{models}
\keyword{tree}
\keyword{category}
\concept{model validation}
\concept{predictive accuracy}

\eof
\name{vif}
\alias{vif}
\title{
Variance Inflation Factors
}
\description{
Computes variance inflation factors from the covariance matrix of
parameter estimates, using the method of Davis et al. (1986), which
is based on the correlation matrix from the information matrix.
}
\usage{
vif(fit)
}
\arguments{
\item{fit}{
an object created by \code{lrm}, \code{ols}, \code{psm}, \code{cph}, or \code{glm}
}}
\value{
vector of vifs
}
\author{
Frank Harrell
\cr
Department of Biostatistics
\cr
Vanderbilt University
\cr
f.harrell@vanderbilt.edu
}
\references{
Davis CE, Hyde JE, Bangdiwala SI, Nelson JJ: An example of dependencies 
among variables in a conditional logistic regression. In Modern
Statistical Methods in Chronic Disease Epidemiology, Eds SH Moolgavkar and
RL Prentice, pp. 140--147.  New York: Wiley; 1986.
}
\seealso{
\code{\link{Design.Misc}} (for \code{\link{num.intercepts}} and \code{\link[Hmisc]{Varcov}})
}
\examples{
set.seed(1)
x1 <- rnorm(100)
x2 <- x1+.1*rnorm(100)
y  <- sample(0:1, 100, TRUE)
f  <- lrm(y ~ x1 + x2)
vif(f)
}
\keyword{models}
\keyword{regression}


\eof
\name{which.influence}
\alias{which.influence}
\alias{show.influence}
\title{
Which Observations are Influential
}
\description{
Creates a list with a component for
each factor in the model.  The names of the components are the factor
names.  Each component contains the observation identifiers of all
observations that are "overly influential" with respect to that factor,
meaning that \eqn{|dfbetas| > u} for at least one \eqn{\beta_i}{beta i}
associated with that factor, where \eqn{u} is the value of \code{cutoff}.  The default \code{cutoff}
is \code{.2}.  The fit must come from a function that has
\code{resid(fit, type="dfbetas")} defined. 


\code{show.influence}, written by Jens Oehlschlaegel-Akiyoshi, applies the
result of \code{which.influence} to a data frame, usually the one used to
fit the model, to report the results.
}
\usage{
which.influence(fit, cutoff=.2)

show.influence(object, dframe, report=NULL, sig=NULL, id=NULL)
}
\arguments{
\item{fit}{
fit object
}
\item{object}{
the result of \code{which.influence}
}
\item{dframe}{
data frame containing observations pertinent to the model fit
}
\item{cutoff}{
cutoff value
}
\item{report}{
other columns of the data frame to report besides those corresponding
to predictors that are influential for some observations
}
\item{sig}{
runs results through \code{signif} with \code{sig} digits if \code{sig} is given
}
\item{id}{
a character vector that labels rows of \code{dframe} if \code{row.names} were
not used
}}
\value{
\code{show.influence} returns a marked dataframe with the first column being
a count of influence values
}
\author{
Frank Harrell\cr
Department of Biostatistics, Vanderbilt University\cr
f.harrell@vanderbilt.edu
\cr

Jens Oehlschlaegel-Akiyoshi\cr
Center for Psychotherapy Research\cr
Christian-Belser-Strasse 79a\cr
D-70597 Stuttgart Germany\cr
oehl@psyres-stuttgart.de
}
\seealso{
\code{\link{residuals.lrm}}, \code{\link{residuals.cph}}, \code{\link{residuals.ols}}, \code{\link{Design}}, \code{\link{lrm}}, \code{\link{ols}}, \code{\link{cph}}
}
\examples{
#print observations in data frame that are influential,
#separately for each factor in the model
x1 <- 1:20
x2 <- abs(x1-10)
x3 <- factor(rep(0:2,length.out=20))
y  <- c(rep(0:1,8),1,1,1,1)
f  <- lrm(y ~ rcs(x1,3) + x2 + x3, x=TRUE,y=TRUE)
w <- which.influence(f, .55)
nam <- names(w)
d   <- data.frame(x1,x2,x3,y)
for(i in 1:length(nam)) {
 print(paste("Influential observations for effect of ",nam[i]),quote=FALSE)
 print(d[w[[i]],])
}


show.influence(w, d)  # better way to show results
}
\keyword{models}
\keyword{regression}
\keyword{survival}
\concept{logistic regression model}

\eof
