\name{DLBCL}
\alias{DLBCL}
\non_function{}
\title{ Diffuse Large B-Cell Lymphoma }
\usage{data(DLBCL)}
\description{
A data frame with gene expression data from diffuse large B-cell
lymphoma (DLBCL) patients.
}
\format{
  This data frame contains the following columns:
  \describe{
    \item{DLCL.Sample}{DLBCL identifier.}
    \item{Gene.Expression}{Gene expression group.}
    \item{time}{survival time in months.}
    \item{cens}{censoring: 0 censored, 1 dead.}
    \item{IPI}{International prognostic index.}
    \item{MGEc.1}{mean gene expression in cluster 1.}
    \item{MGEc.2}{mean gene expression in cluster 2.}
    \item{MGEc.3}{mean gene expression in cluster 3.}
    \item{MGEc.4}{mean gene expression in cluster 4.}
    \item{MGEc.5}{mean gene expression in cluster 5.}
    \item{MGEc.6}{mean gene expression in cluster 6.}
    \item{MGEc.7}{mean gene expression in cluster 7.}
    \item{MGEc.8}{mean gene expression in cluster 8.}
    \item{MGEc.9}{mean gene expression in cluster 9.}
    \item{MGEc.10}{mean gene expression in cluster 10.}
  }
}
\source{
Except for \code{MGEc.*}, the data is published at
\url{http://llmpp.nih.gov/lymphoma/data.shtml}. \code{MGEc.*} is the mean of
the gene expression in each of ten clusters derived by agglomerative average
linkage hierarchical cluster analysis (Hothorn et al., 2002).

}
\references{
Ash A. Alizadeh et al. (2000), Distinct types of diffuse large
B-cell lymphoma identified by gene
expression profiling. \emph{Nature}, \bold{403}, 504--509.

Torsten Hothorn, Berthold Lausen, Axel Benner and Martin
Radespiel-Troeger (2002), Bagging Survival Trees. 
\emph{Statistics in Medicine} (accepted).
Preprint available from
\url{http://www.mathpreprints.com/math/Preprint/blausen/20020518/2}.


}
\examples{
data(DLBCL)
survfit(Surv(time, cens), data=DLBCL)

}
\keyword{datasets}
\eof
\name{GBSG2}
\alias{GBSG2}
\non_function{}
\title{ German Breast Cancer Study Group 2 }
\usage{data(GBSG2)}
\description{
  A data frame containing the observations from the  GBSG2 study.
}
\format{
  This data frame contains the observations of 686 women:
  \describe{
    \item{horTh}{hormonal therapy, a factor at two levels \code{no} and
\code{yes}.}
    \item{age}{of the patients in years.}
    \item{menostat}{menopausal status, a factor at two levels \code{pre} 
(premenopausal) and \code{post} (postmenopausal).}
    \item{tsize}{tumor size (in mm).}
    \item{tgrade}{tumor grade, an ordered factor at levels \code{I < II <
III}.}
    \item{pnodes}{number of positive nodes.}
    \item{progrec}{progesterone receptor (in fmol).}
    \item{estrec}{estrogen receptor (in fmol).}
    \item{time}{recurrence free survival time (in days).}
    \item{cens}{censoring indicator (0- censored, 1- event).}
  }
}
\source{
  \url{http://www.blackwellpublishers.com/rss/Volumes/A162p1.htm}
}
\references{
  M. Schumacher, G. Bastert, H. Bojar,  K. Huebner, M. Olschewski, 
  W. Sauerbrei, C. Schmoor, C. Beyerle, R.L.A. Neumann and H.F. Rauschecker
  for the German Breast Cancer Study Group (1994),
  Randomized \eqn{2\times2} trial evaluating hormonal treatment
  and the duration of chemotherapy in node-positive breast cancer patients.
  \emph{Journal of Clinical Oncology}, \bold{12}, 2086--2093.

  W. Sauerbrei and P. Royston (1999). Building multivariable prognostic
  and diagnostic models: transformation of the predictors by using 
  fractional polynomials. \emph{Journal of the Royal Statistical Society
  Series A}, Volume \bold{162}(1), 71--94.
}
\examples{
data(GBSG2)

thsum <- function(x) {
  ret <- c(median(x), quantile(x, 0.25), quantile(x,0.75))
  names(ret)[1] <- "Median"
  ret
}

t(apply(GBSG2[,c("age", "tsize", "pnodes", 
                 "progrec", "estrec")], 2, thsum))

table(GBSG2$menostat)
table(GBSG2$tgrade)
table(GBSG2$horTh)

# pooled Kaplan-Meier

mod <- survfit(Surv(time, cens), data=GBSG2)
# integrated Brier score
sbrier(Surv(GBSG2$time, GBSG2$cens), mod)
# Brier score at 5 years
sbrier(Surv(GBSG2$time, GBSG2$cens), mod, btime=1825)

# Nottingham prognostic index

GBSG2 <- GBSG2[order(GBSG2$time),]

NPI <- 0.2*GBSG2$tsize/10 + 1 + as.integer(GBSG2$tgrade)
NPI[NPI < 3.4] <- 1
NPI[NPI >= 3.4 & NPI <=5.4] <- 2
NPI[NPI > 5.4] <- 3

mod <- survfit(Surv(time, cens) ~ NPI, data=GBSG2)
plot(mod)

pred <- c()
survs <- c()
for (i in sort(unique(NPI)))
    survs <- c(survs, getsurv(mod[i], 1825))

for (i in 1:nrow(GBSG2))
   pred <- c(pred, survs[NPI[i]])

# Brier score of NPI at t=5 years
sbrier(Surv(GBSG2$time, GBSG2$cens), pred, btime=1825)


}
\keyword{datasets}
\eof
\name{GlaucomaM}
\alias{GlaucomaM}
\non_function{}
\title{ Glaucoma Database }
\usage{data(GlaucomaM)}
\description{
The \code{GlaucomaM} data has 196 observations in two classes. 
62 variables are derived from a confocal laser scanning image of the 
optic nerve head, describing its morphology. Observations are from 
normal and glaucomatous eyes, respectively. 
}
\format{
  This data frame contains the following predictors describing the
  morphology of the optic nerve head and a membership variable:
  \describe{
    \item{ag}{area global.}
    \item{at}{area temporal.}
    \item{as}{area superior.}
    \item{an}{area nasal.}
    \item{ai}{area inferior.}
    \item{eag}{effective area global.}
    \item{eat}{effective area temporal.}
    \item{eas}{effective area superior.}
    \item{ean}{effective area nasal.}
    \item{eai}{effective area inferior.}
    \item{abrg}{area below reference global.}
    \item{abrt}{area below reference temporal.}
    \item{abrs}{area below reference superior.}
    \item{abrn}{area below reference nasal.}
    \item{abri}{area below reference inferior.}
    \item{hic}{height in contour.}
    \item{mhcg}{mean height contour global.}
    \item{mhct}{mean height contour temporal.}
    \item{mhcs}{mean height contour superior.}
    \item{mhcn}{mean height contour nasal.}
    \item{mhci}{mean height contour inferior.}
    \item{phcg}{peak height contour global.}
    \item{phct}{peak height contour temporal.}
    \item{phcs}{peak height contour superior.}
    \item{phcn}{peak height contour nasal.}
    \item{phci}{peak height contour inferior.}
    \item{hvc}{height variation contour.}
    \item{vbsg}{volume below surface global.}
    \item{vbst}{volume below surface temporal.}
    \item{vbss}{volume below surface superior.}
    \item{vbsn}{volume below surface nasal.}
    \item{vbsi}{volume below surface inferior.}
    \item{vasg}{volume above surface global.}
    \item{vast}{volume above surface temporal.}
    \item{vass}{volume above surface superior.}
    \item{vasn}{volume above surface nasal.}
    \item{vasi}{volume above surface inferior.}
    \item{vbrg}{volume below reference global.}
    \item{vbrt}{volume below reference temporal.}
    \item{vbrs}{volume below reference superior.}
    \item{vbrn}{volume below reference nasal.}
    \item{vbri}{volume below reference inferior.}
    \item{varg}{volume above reference global.}
    \item{vart}{volume above reference temporal.}
    \item{vars}{volume above reference superior.}
    \item{varn}{volume above reference nasal.}
    \item{vari}{volume above reference inferior.}
    \item{mdg}{mean depth global.}
    \item{mdt}{mean depth temporal.}
    \item{mds}{mean depth superior.}
    \item{mdn}{mean depth nasal.}
    \item{mdi}{mean depth inferior.}
    \item{tmg}{third moment global.}
    \item{tmt}{third moment temporal.}
    \item{tms}{third moment superior.}
    \item{tmn}{third moment nasal.}
    \item{tmi}{third moment inferior.}
    \item{mr}{mean radius.}
    \item{rnf}{retinal nerve fiber thickness.}
    \item{mdic}{mean depth in contour.}
    \item{emd}{effective mean depth.}
    \item{mv}{mean variability.}
    \item{Class}{a factor with levels \code{glaucoma} and \code{normal}.}
  }
}
\details{
  All variables are derived from a laser scanning image of the eye background
taken by the Heidelberg Retina Tomograph. Most of the variables describe
either the area or volume in certain parts of the papilla and are measured
in four sectors (temporal, superior, nasal and inferior) as well as for the
whole papilla (global). The global measurement is, roughly, the sum of the
measurements taken in the four sectors.

  The observations in both groups are matched by age and sex to prevent any
bias.
}
\source{
 Torsten Hothorn and Berthold Lausen (2003), Double-Bagging: Combining
classifiers by bootstrap aggregation. \emph{Pattern Recognition},
\bold{36}(6), 1303--1309. 

}

\note{
 \code{GlaucomaM} overlaps in some parts with \code{\link{GlaucomaMVF}}.
}

\examples{
data(GlaucomaM)
errorest(Class ~ ., data=GlaucomaM, model=rpart, 
         predict=function(obj, newdata) 
                   predict(obj, newdata, type="class"), 
         control=rpart.control(xval=0))
glbagg <- bagging(Class ~ ., data=GlaucomaM, coob=TRUE)
glbagg

}
\keyword{datasets}
\eof
\name{GlaucomaMVF}
\alias{GlaucomaMVF}
\non_function{}
\title{ Glaucoma Database }
\usage{data(GlaucomaMVF)}
\description{
The \code{GlaucomaMVF} data has 170 observations in two classes. 
66 predictors are derived from a confocal laser scanning image of the 
optic nerve head, from a visual field test, a fundus photography and a 
measurement of the intraocular pressure. 
}
\format{
  This data frame contains the following predictors describing the
  morphology of the optic nerve head, the visual field, the intraocular
  pressure and a membership variable:
  \describe{
    \item{ag}{area global.}
    \item{at}{area temporal.}
    \item{as}{area superior.}
    \item{an}{area nasal.}
    \item{ai}{area inferior.}
    \item{eag}{effective area global.}
    \item{eat}{effective area temporal.}
    \item{eas}{effective area superior.}
    \item{ean}{effective area nasal.}
    \item{eai}{effective area inferior.}
    \item{abrg}{area below reference global.}
    \item{abrt}{area below reference temporal.}
    \item{abrs}{area below reference superior.}
    \item{abrn}{area below reference nasal.}
    \item{abri}{area below reference inferior.}
    \item{hic}{height in contour.}
    \item{mhcg}{mean height contour global.}
    \item{mhct}{mean height contour temporal.}
    \item{mhcs}{mean height contour superior.}
    \item{mhcn}{mean height contour nasal.}
    \item{mhci}{mean height contour inferior.}
    \item{phcg}{peak height contour global.}
    \item{phct}{peak height contour temporal.}
    \item{phcs}{peak height contour superior.}
    \item{phcn}{peak height contour nasal.}
    \item{phci}{peak height contour inferior.}
    \item{hvc}{height variation contour.}
    \item{vbsg}{volume below surface global.}
    \item{vbst}{volume below surface temporal.}
    \item{vbss}{volume below surface superior.}
    \item{vbsn}{volume below surface nasal.}
    \item{vbsi}{volume below surface inferior.}
    \item{vasg}{volume above surface global.}
    \item{vast}{volume above surface temporal.}
    \item{vass}{volume above surface superior.}
    \item{vasn}{volume above surface nasal.}
    \item{vasi}{volume above surface inferior.}
    \item{vbrg}{volume below reference global.}
    \item{vbrt}{volume below reference temporal.}
    \item{vbrs}{volume below reference superior.}
    \item{vbrn}{volume below reference nasal.}
    \item{vbri}{volume below reference inferior.}
    \item{varg}{volume above reference global.}
    \item{vart}{volume above reference temporal.}
    \item{vars}{volume above reference superior.}
    \item{varn}{volume above reference nasal.}
    \item{vari}{volume above reference inferior.}
    \item{mdg}{mean depth global.}
    \item{mdt}{mean depth temporal.}
    \item{mds}{mean depth superior.}
    \item{mdn}{mean depth nasal.}
    \item{mdi}{mean depth inferior.}
    \item{tmg}{third moment global.}
    \item{tmt}{third moment temporal.}
    \item{tms}{third moment superior.}
    \item{tmn}{third moment nasal.}
    \item{tmi}{third moment inferior.}
    \item{mr}{mean radius.}
    \item{rnf}{retinal nerve fiber thickness.}
    \item{mdic}{mean depth in contour.}
    \item{emd}{effective mean depth.}
    \item{mv}{mean variability.}
    \item{tension}{intraocular pressure.}
    \item{clv}{corrected loss variance, variability of the visual field.}
    \item{cs}{contrast sensitivity of the visual field.}
    \item{lora}{loss of rim area, measured by fundus photography.}
    \item{Class}{a factor with levels \code{glaucoma} and \code{normal}.}
  }
}
\details{
Confocal laser images of the eye background are taken with the 
Heidelberg Retina Tomograph and variables 1-62 are derived. 
Most of these variables describe either the area or volume in 
certain parts of the papilla and are measured in 
four sectors (temporal, superior, nasal and inferior) as well 
as for the whole papilla (global). The global measurement is, 
roughly, the sum of the measurements taken in the four sectors.

The perimeter `Octopus' measures the visual field variables \code{clv} 
and \code{cs}. Stereo optic disk photographs were taken with a 
telecentric fundus camera, from which \code{lora} is derived.

Observations of both groups are matched by age and sex 
to prevent possible confounding. 

}
\source{
  Andrea Peters, Berthold Lausen, Georg Michelson and Olaf Gefeller (2002), 
  Diagnosis of glaucoma by indirect classifiers, 
  \emph{Methods of Information in Medicine}, to appear.
}

\note{
 \code{GlaucomaMVF} overlaps in some parts with \code{\link{GlaucomaM}}.
}

\examples{
data(GlaucomaMVF)

response <- function (data) {
  attach(data)
  res <- ifelse((!is.na(clv) & !is.na(lora) & clv >= 5.1 & lora >= 
        49.23372) | (!is.na(clv) & !is.na(lora) & !is.na(cs) & 
        clv < 5.1 & lora >= 58.55409 & cs < 1.405) | (is.na(clv) & 
        !is.na(lora) & !is.na(cs) & lora >= 58.55409 & cs < 1.405) | 
        (!is.na(clv) & is.na(lora) & cs < 1.405), 0, 1)
  detach(data)
  factor (res, labels = c("glaucoma", "normal"))
}

errorest(Class~clv+lora+cs~., data = GlaucomaMVF, model=inclass, 
       estimator="cv", pFUN = list(list(model = rpart)), cFUN = response)

}
\keyword{datasets}
\eof
\name{Smoking}
\alias{Smoking}
\non_function{}
\title{Smoking Styles}
\usage{data(Smoking)}
\description{
The \code{Smoking} data frame has 55 rows and 9 columns.
}
\format{
  This data frame contains the following columns:
  \describe{
    \item{NR}{numeric, patient number.}
    \item{Sex}{factor, sex of patient.}
    \item{Age}{factor, age group of patient, grouping consisting of those in their twenties, those in their thirties and so on.}
    \item{TarY}{numeric, tar yields of the cigarettes.}
    \item{NicY}{numeric, nicotine yields of the cigarettes.}
    \item{COY}{numeric, carbon monoxide (CO) yield of the cigarettes.}
    \item{TVPS}{numeric, total volume puffed smoke.}
    \item{BPNL}{numeric, blood plasma nicotine level.}
    \item{COHB}{numeric, carboxyhaemoglobin level, i.e. amount of CO absorbed by the blood stream.}
  }
}

\details{
 The data describes different smoking habits of probands.
}

\source{
 Hand and Taylor (1987), Study F \emph{Smoking Styles}.
}

\references{
D.J. Hand and C.C. Taylor (1987), 
\emph{Multivariate analysis of variance and repeated measures.} London: Chapman \&
Hall, pp. 167--181.
}

\keyword{datasets}
\eof
\name{bagging}
\alias{bagging}
\alias{ipredbagg}
\alias{ipredbagg.factor}
\alias{ipredbagg.integer}
\alias{ipredbagg.numeric}
\alias{ipredbagg.Surv}
\alias{ipredbagg.default}
\alias{bagging.data.frame}
\alias{bagging.default}
\title{Bagging Classification, Regression and Survival Trees }
\description{
  Bagging for classification, regression and survival trees.
}
\usage{
ipredbagg.factor(y, X=NULL, nbagg=25, control=
                 rpart.control(minsplit=2, cp=0, xval=0), 
                 comb=NULL, coob=FALSE, ns=length(y), keepX = TRUE, \dots)
ipredbagg.numeric(y, X=NULL, nbagg=25, control=rpart.control(xval=0), 
                  comb=NULL, coob=FALSE, ns=length(y), keepX = TRUE, \dots)
ipredbagg.Surv(y, X=NULL, nbagg=25, control=rpart.control(xval=0), 
               comb=NULL, coob=FALSE, ns=dim(y)[1], keepX = TRUE, \dots)
\method{bagging}{data.frame}(formula, data, subset, na.action=na.rpart, \dots)
}
\arguments{
  \item{y}{the response variable: either a factor vector of class labels
           (bagging classification trees), a vector of numerical values 
           (bagging regression trees) or an object of class 
           \code{\link[survival]{Surv}} (bagging survival trees).}
  \item{X}{a data frame of predictor variables.}
  \item{nbagg}{an integer giving the number of bootstrap replications. }
  \item{coob}{a logical indicating whether an out-of-bag estimate of the
              error rate (misclassification error, root mean squared error
              or Brier score) should be computed. 
              See \code{\link{predict.classbagg}} for
              details.}
  \item{control}{options that control details of the \code{rpart}
                algorithm, see \code{\link[rpart]{rpart.control}}. It is
                wise to set \code{xval = 0} in order to save computing 
                time. Note that the 
                default values depend on the class of \code{y}.}
  \item{comb}{a list of additional models for model combination, see below
for some examples. Note that argument \code{method} for double-bagging is no longer there, 
\code{comb} is much more flexible.}
  \item{ns}{number of samples to draw from the learning sample. By default,
            the usual bootstrap n out of n with replacement is performed. 
            If \code{ns} is smaller than \code{length(y)}, subagging
            (Buehlmann and Yu, 2002), i.e. sampling \code{ns} out of
            \code{length(y)} without replacement, is performed.}
  \item{keepX}{a logical indicating whether the data frame of predictors
    should be returned. Note that the computation of the 
    out-of-bag estimator requires  \code{keepX=TRUE}.}
  \item{formula}{a formula of the form \code{lhs ~ rhs} where \code{lhs} 
                 is the response variable and \code{rhs} a set of
                 predictors.}
  \item{data}{optional data frame containing the variables in the
              model formula.} 
  \item{subset}{optional vector specifying a subset of observations
                to be used.}
  \item{na.action}{function which indicates what should happen when
                   the data contain \code{NA}s.  Defaults to
                   \code{\link[rpart]{na.rpart}}.}
  \item{\dots}{additional parameters passed to \code{ipredbagg} or 
\code{\link[rpart]{rpart}}, respectively.}
}

\details{
  Bagging for classification and regression trees was suggested by
Breiman (1996a, 1998) in order to stabilise trees. 

The trees in this function are computed using the implementation in the 
\code{\link[rpart]{rpart}} package. The generic function \code{ipredbagg}
implements methods for different responses. If \code{y} is a factor,
classification trees are constructed. For numerical vectors
\code{y}, regression trees are aggregated and if \code{y} is a survival 
object, bagging survival trees (Hothorn et al., 2003) is performed. 
The function \code{bagging} offers a formula based interface to
\code{ipredbagg}.

\code{nbagg} bootstrap samples are drawn and a tree is constructed 
for each of them. There is no general rule when to stop the tree 
growing. The size of the
trees can be controlled by \code{control} argument 
or \code{\link{prune.classbagg}}. By
default, classification trees are as large as possible whereas regression
trees and survival trees are built with the standard options of
\code{\link[rpart]{rpart.control}}. If \code{nbagg=1}, one single tree is
computed for the whole learning sample without bootstrapping.

If \code{coob} is TRUE, the out-of-bag sample (Breiman,
1996b) is used to estimate the prediction error 
corresponding to \code{class(y)}. Alternatively, the out-of-bag sample can
be used for model combination, an out-of-bag error rate estimator is not 
available in this case. Double-bagging (Hothorn and Lausen,
2003) computes a LDA on the out-of-bag sample and uses the discriminant
variables as additional predictors for the classification trees. \code{comb}
is an optional list of lists with two elements \code{model} and \code{predict}. 
\code{model} is a function with arguments \code{formula} and \code{data}. 
\code{predict} is a function with arguments \code{object, newdata} only. If
the estimation of the covariance matrix in \code{\link{lda}} fails due to a
limited out-of-bag sample size, one can use \code{\link{slda}} instead.
See the example section for an example of double-bagging. The methodology is
not limited to a combination with LDA: bundling (Hothorn and Lausen, 2002b) 
can be used with arbitrary classifiers.

}

\value{
  The class of the object returned depends on \code{class(y)}:
\code{classbagg, regbagg} and \code{survbagg}. Each is a list with elements
\item{y}{the vector of responses.}
\item{X}{the data frame of predictors.}
\item{mtrees}{multiple trees: a list of length \code{nbagg} containing the
trees (and possibly additional objects) for each bootstrap sample.}
\item{OOB}{logical whether the out-of-bag estimate should be computed.}
\item{err}{if \code{OOB=TRUE}, the out-of-bag estimate of
misclassification or root mean squared error or the Brier score for censored
data.}
\item{comb}{logical whether a combination of models was requested.}

 For each class methods for the generics \code{\link[rpart]{prune}}, 
\code{\link{print}}, \code{\link{summary}} and \code{\link{predict}} are
available for inspection of the results and prediction, for example:
\code{\link{print.classbagg}}, \code{\link{summary.classbagg}}, 
\code{\link{predict.classbagg}}  and \code{\link{prune.classbagg}} for
classification problems.

}

\references{ 

Leo Breiman (1996a), Bagging Predictors. \emph{Machine Learning}
\bold{24}(2), 123--140.

Leo Breiman (1996b), Out-Of-Bag Estimation. \emph{Technical Report}
\url{ftp://ftp.stat.berkeley.edu/pub/users/breiman/OOBestimation.ps.Z}.

Leo Breiman (1998), Arcing Classifiers. \emph{The Annals of Statistics}
\bold{26}(3), 801--824.

Peter Buehlmann and Bin Yu (2002), Analyzing Bagging. \emph{The Annals of
Statistics} \bold{30}(4), 927--961.

Torsten Hothorn and Berthold Lausen (2003), Double-Bagging: Combining
classifiers by bootstrap aggregation. \emph{Pattern Recognition},
\bold{36}(6), 1303--1309. 

Torsten Hothorn and Berthold Lausen (2002b), Bundling Classifiers by Bagging
Trees. \emph{submitted}.
Preprint available from 
\url{http://www.mathpreprints.com/math/Preprint/blausen/20021016/1}.

Torsten Hothorn, Berthold Lausen, Axel Benner and Martin
Radespiel-Troeger (2003), Bagging Survival Trees. \emph{Statistics in
Medicine} (accepted).
Preprint available from
\url{http://www.mathpreprints.com/math/Preprint/blausen/20020518/2}.

}

\author{ Torsten.Hothorn <Torsten.Hothorn@rzmail.uni-erlangen.de> }
\examples{

# Classification: Breast Cancer data

data(BreastCancer)

# Test set error bagging (nbagg = 50): 3.7\% (Breiman, 1998, Table 5)

mod <- bagging(Class ~ Cl.thickness + Cell.size
                + Cell.shape + Marg.adhesion   
                + Epith.c.size + Bare.nuclei   
                + Bl.cromatin + Normal.nucleoli
                + Mitoses, data=BreastCancer, coob=TRUE)
print(mod)

# Test set error bagging (nbagg=50): 7.9\% (Breiman, 1996a, Table 2)

data(Ionosphere)
Ionosphere$V2 <- NULL # constant within groups

bagging(Class ~ ., data=Ionosphere, coob=TRUE)

# Double-Bagging: combine LDA and classification trees

# predict returns the linear discriminant values, i.e. linear combinations
# of the original predictors

comb.lda <- list(list(model=lda, predict=function(obj, newdata)
                                 predict(obj, newdata)$x))

# Note: out-of-bag estimator is not available in this situation, use
# errorest

mod <- bagging(Class ~ ., data=Ionosphere, comb=comb.lda) 

predict(mod, Ionosphere[1:10,])

# Regression:


data(BostonHousing)

# Test set error (nbagg=25, trees pruned): 3.41 (Breiman, 1996a, Table 8)

mod <- bagging(medv ~ ., data=BostonHousing, coob=TRUE)
print(mod)

learn <- as.data.frame(mlbench.friedman1(200))

# Test set error (nbagg=25, trees pruned): 2.47 (Breiman, 1996a, Table 8)

mod <- bagging(y ~ ., data=learn, coob=TRUE)
print(mod)

# Survival data

# Brier score for censored data estimated by 
# 10 times 10-fold cross-validation: 0.2 (Hothorn et al,
# 2002)

data(DLBCL)
mod <- bagging(Surv(time,cens) ~ MGEc.1 + MGEc.2 + MGEc.3 + MGEc.4 + MGEc.5 +
                                 MGEc.6 + MGEc.7 + MGEc.8 + MGEc.9 +
                                 MGEc.10 + IPI, data=DLBCL, coob=TRUE)

print(mod)


}
\keyword{tree}
\eof
\name{bootest}
\alias{bootest}
\alias{bootest.default}
\alias{bootest.factor}
\alias{bootest.numeric}
\alias{bootest.integer}
\alias{bootest.Surv}
\title{Bootstrap Error Rate Estimators}
\description{
  These functions are low-level functions used by \code{\link{errorest}} and
are normally not called by users.
}
\usage{
\method{bootest}{factor}(y, formula, data, model, predict, nboot=25, bc632plus=FALSE, \dots)
}
\arguments{
  \item{y}{the response variable, either of class \code{factor}
(classification), \code{numeric} (regression) or \code{Surv} (survival).}
  \item{formula}{a formula object.}
  \item{data}{data frame of predictors and response described in
\code{formula}.}   
  \item{model}{a function implementing the predictive model to be
evaluated. The function \code{model} can either return an
               object representing a fitted model or a function with
               argument \code{newdata} which returns predicted values. In
               this case, the \code{predict} argument to \code{errorest} is
               ignored.}
  \item{predict}{a function with arguments \code{object} and \code{newdata}
only which predicts the status of the observations in \code{newdata} based
on the fitted model in \code{object}.}
  \item{nboot}{number of bootstrap replications to be used.}
  \item{bc632plus}{logical. Should the bias corrected version of misclassification
error be computed?}
  \item{\dots}{additional arguments to \code{model}.}
}

\details{
 See \code{\link{errorest}}.
}
\author{ Torsten.Hothorn <Torsten.Hothorn@rzmail.uni-erlangen.de> }
\keyword{misc}
\eof
\name{control.errorest}
\alias{control.errorest}
\title{ Control Error Rate Estimators }
\description{
  Some parameters that control the behaviour of \code{\link{errorest}}.
}
\usage{
control.errorest(k = 10, nboot = 25, strat = FALSE, random = TRUE, 
                 predictions = FALSE, getmodels=FALSE)
}
\arguments{
  \item{k}{integer, specify \eqn{k} for \eqn{k}-fold cross-validation.}
  \item{nboot}{integer, number of bootstrap replications.}
  \item{strat}{logical, if \code{TRUE}, cross-validation is performed 
               using stratified sampling (for classification problems).}
  \item{random}{logical, if \code{TRUE}, cross-validation is performed using
                a random ordering of the data.}
  \item{predictions}{logical, indicates whether the prediction
                     for each observation should be returned or not
                     (classification and regression only). }
  \item{getmodels}{logical, indicates whether a list of all models should be
                   returned. For cross-validation only.}
}
\value{
  A list with the same components as arguments. 
}
\keyword{misc}
\eof
\name{cv}
\alias{cv}
\alias{cv.default}
\alias{cv.factor}
\alias{cv.numeric}
\alias{cv.integer}
\alias{cv.Surv}
\title{Cross-validated Error Rate Estimators.}
\description{
  These functions are low-level functions used by \code{\link{errorest}} and
are normally not called by users.
}
\usage{
\method{cv}{factor}(y, formula, data, model, predict, k=10, random=TRUE, 
            strat=FALSE,
            predictions=NULL, getmodels=NULL, \dots) 
}
\arguments{
  \item{y}{response variable, either of class \code{factor}
(classification), \code{numeric} (regression) or \code{Surv} (survival).}
  \item{formula}{a formula object.}
  \item{data}{data frame of predictors and response described in \code{formula}.}
  \item{model}{a function implementing the predictive model to be
evaluated. The function \code{model} can either return an
               object representing a fitted model or a function with
               argument \code{newdata} which returns predicted values. In
               this case, the \code{predict} argument to \code{errorest} is
               ignored.}
  \item{predict}{a function with arguments \code{object} and \code{newdata}
only which predicts the status of the observations in \code{newdata} based
on the fitted model in \code{object}.}
  \item{k}{k-fold cross-validation.}
  \item{random}{logical, indicates whether a random order or the given
order of the data should be used for sample splitting or not, defaults to
\code{TRUE}.}
  \item{strat}{logical, stratified sampling or not, defaults to \code{FALSE}.}
  \item{predictions}{logical, return the prediction of each observation.}
  \item{getmodels}{logical, return a list of models for each fold.}
  \item{\dots}{additional arguments to \code{model}.}
}
\details{
 See \code{\link{errorest}}.
}
\author{ Torsten.Hothorn <Torsten.Hothorn@rzmail.uni-erlangen.de> }
\keyword{misc}
\eof
\name{dystrophy}
\alias{dystrophy}
\non_function{}
\title{Detection of muscular dystrophy carriers.}
\usage{data(dystrophy)}
\description{
The \code{dystrophy} data frame has 209 rows and 10 columns.
}
\format{
  This data frame contains the following columns:
  \describe{
    \item{OBS}{numeric. Observation number.}
    \item{HospID}{numeric. Hospital ID number.}
    \item{AGE}{numeric, age in years.}
    \item{M}{numeric. Month of examination.}
    \item{Y}{numeric. Year of examination.}
    \item{CK}{numeric. Serum marker creatine kinase.}
    \item{H}{numeric. Serum marker hemopexin.}
    \item{PK}{numeric. Serum marker pyruvate kinase.}
    \item{LD}{numeric. Serum marker lactate dehydrogenase.}
    \item{Class}{factor with levels, \code{carrier} and \code{normal}.}
  }
}

\details{
Duchenne Muscular Dystrophy (DMD) is a genetically transmitted disease,
passed from a mother to her children. Affected female offspring usually suffer
no apparent symptoms, male offspring with the disease die at young age.
Although female carriers have no physical symptoms they tend to exhibit
elevated levels of certain serum enzymes or proteins.
\cr
The dystrophy dataset contains 209 observations of 75 female DMD carriers and
134 female DMD non-carriers. It includes 6 variables describing age of the
female and the serum parameters serum marker creatine kinase (CK), serum marker
 hemopexin (H), serum marker pyruvate kinase (PK) and serum marker lactate
dehydrogenase (LD). The serum markers CK and H may be measured rather
inexpensively from frozen serum, whereas PK and LD require fresh serum.
}

\source{
D. Andrews and A. Herzberg (1985), Data. Berlin: Springer-Verlag.
}

\references{
Robert Tibshirani and Geoffrey Hinton (1998), Coaching variables for regression and classification. \emph{Statistics and Computing}, \bold{8}, 25--33.
}

\examples{
data(dystrophy)
errorest(Class~CK+H~AGE+PK+LD, data = dystrophy, model = inbagg, 
pFUN = list(list(model = lm, predict = mypredict.lm), list(model = rpart)), 
ns = 0.75, estimator = "cv")
}

\keyword{datasets}
\eof
\name{errorest}
\alias{errorest}
\alias{errorest.data.frame}
\alias{errorest.default}
\title{ Estimators of Prediction Error }
\description{
Resampling based estimates of prediction error: misclassification error, 
root mean squared error or Brier score for survival data.
}
\usage{
\method{errorest}{data.frame}(formula, data, subset, na.action=na.omit, 
         model=NULL, predict=NULL,
         estimator=c("cv", "boot", "632plus"), 
         est.para=control.errorest(), ...)
}

\arguments{
  \item{formula}{a formula of the form \code{lhs ~ rhs}. 
                 Either describing the model of explanatory and 
                 response variables in the usual way (see \code{\link{lm}}) 
                 or the model between explanatory and intermediate variables
                 in the framework of indirect classification, 
                 see \code{\link{inclass}}.}
  \item{data}{a data frame containing the variables in the model formula 
              and additionally the class membership variable 
              if \code{model = inclass}. \code{data} is required for
              indirect classification, otherwise \code{formula} is evaluated
              in the calling environment.}
  \item{subset}{optional vector, specifying a subset of observations to 
                be used.}
  \item{na.action}{function which indicates what should happen when the data
                   contains \code{NA}'s, defaults to \code{\link{na.omit}}.} 
  \item{model}{function. Modelling technique whose error rate is to be 
               estimated. The function \code{model} can either return an 
               object representing a fitted model or a function with
               argument \code{newdata} which returns predicted values. In
               this case, the \code{predict} argument to \code{errorest} is
               ignored.}
  \item{predict}{function. Prediction method to be used. The vector of
                 predicted values must have the same length as the
                 number of to-be-predicted observations. Predictions
                 corresponding to missing data must be replaced by \code{NA}.
                 Additionally, \code{predict} has to return predicted values
                 comparable to the responses (that is: factors for
                 classification problems). See the examples on how to
                 ensure this for any predictor.}
  \item{estimator}{estimator of the misclassification error: 
                   \code{cv} cross-validation, \code{boot} bootstrap or 
                   \code{632plus} bias corrected bootstrap (classification
                   only). }
  \item{est.para}{a list of additional parameters that control the
                  calculation of the estimator, see 
                  \code{\link{control.errorest}} for details.}
  \item{\dots}{additional parameters to \code{model}.}
}
\details{
  The prediction error for classification and regression models as well as
predictive models for censored data using cross-validation or the 
bootstrap can be computed by \code{errorest}. For classification problems,
the estimated misclassification error is returned. The root mean squared
error is computed for regression problems and the Brier score for censored
data (Graf et al., 1999) is reported if the response is censored. 

Any model can be specified as long as it is a function with arguments
\code{model(formula, data, subset, na.action, ...)}. If 
a method \code{predict.model(object, newdata, ...)} is available,
\code{predict} does not need to be specified. However, \code{predict} 
has to return predicted values in the same order and of the same length
corresponding to the response. See the examples below. 

\eqn{k}-fold cross-validation and the usual bootstrap estimator with
\code{est.para$nboot} bootstrap replications can be computed for
all kinds of problems. The bias corrected .632+ bootstrap
by Efron and Tibshirani (1997) is available for classification problems
only. Use \code{\link{control.errorest}} to specify additional arguments.

\code{errorest} is a formula based interface to the generic functions 
\code{\link{cv}} or \code{\link{bootest}} which implement methods for
classification, regression and survival problems.
}

\value{
 The class of the object returned depends on the class of the response
variable and the estimator used. In each case, it is a list with an element
\code{error} and additional information. \code{print} methods are available
for the inspection of the results.
}

\references{

Brian D. Ripley (1996), \emph{Pattern Recognition and Neural Networks}.
Cambridge: Cambridge University Press.

Bradley Efron and Robert Tibshirani (1997),
Improvements on Cross-Validation: The .632+ Bootstrap Estimator.
\emph{Journal of the American Statistical Association} \bold{92}(438),
548--560.

Erika Graf, Claudia Schmoor, Willi Sauerbrei and Martin Schumacher (1999), 
Assessment and comparison of prognostic classification schemes for 
survival data. \emph{Statistics in Medicine} \bold{18}(17-18), 2529--2545.

Rosa A. Schiavo and David J. Hand (2000), Ten More Years of Error Rate
Research. \emph{International Statistical Review} \bold{68}(3), 296--310.

David J. Hand, Hua Gui Li, Niall M. Adams (2001),
Supervised Classification with Structured Class Definitions.
\emph{Computational Statistics \& Data Analysis} \bold{36},
209--225.


}
\author{Andrea Peters <Peters.Andrea@imbe.imed.uni-erlangen.de> and \cr
  Torsten Hothorn <Torsten.Hothorn@rzmail.uni-erlangen.de>}
\examples{

# Classification

data(iris)

# force predict to return class labels only
mypredict.lda <- function(object, newdata)
  predict(object, newdata = newdata)$class

# 10-fold cv of LDA for Iris data
errorest(Species ~ ., data=iris, model=lda, 
         estimator = "cv", predict= mypredict.lda)

data(PimaIndiansDiabetes)

# 632+ bootstrap of LDA for Diabetes data
errorest(diabetes ~ ., data=PimaIndiansDiabetes, model=lda,
         estimator = "632plus", predict= mypredict.lda)


data(Glass)

# LDA has cross-validated misclassification error of
# 38\% (Ripley, 1996, page 98)

# Pruned trees about 32\% (Ripley, 1996, page 230)

# use stratified sampling here, i.e. preserve the class proportions
errorest(Type ~ ., data=Glass, model=lda, 
         predict=mypredict.lda, est.para=control.errorest(strat=TRUE))


# force predict to return class labels
mypredict.rpart <- function(object, newdata)
  predict(object, newdata = newdata,type="class")

pruneit <- function(formula, ...)
  prune(rpart(formula, ...), cp =0.01)

errorest(Type ~ ., data=Glass, model=pruneit,
         predict=mypredict.rpart, est.para=control.errorest(strat=TRUE))

# compute sensitivity and specificity for stabilised LDA

data(GlaucomaM)

error <- errorest(Class ~ ., data=GlaucomaM, model=slda,
  predict=mypredict.lda, est.para=control.errorest(predictions=TRUE))

# sensitivity 

mean(error$predictions[GlaucomaM$Class == "glaucoma"] == "glaucoma")

# specificity

mean(error$predictions[GlaucomaM$Class == "normal"] == "normal")

# Indirect Classification: Smoking data

data(Smoking)
# Set three groups of variables:
# 1) explanatory variables are: TarY, NicY, COY, Sex, Age
# 2) intermediate variables are: TVPS, BPNL, COHB
# 3) response (resp) is defined by:

resp <- function(data){
  data <- data[, c("TVPS", "BPNL", "COHB")]
  res <- t(t(data) > c(4438, 232.5, 58))
  res <- as.factor(ifelse(apply(res, 1, sum) > 2, 1, 0))
  res
}

response <- resp(Smoking[ ,c("TVPS", "BPNL", "COHB")])
smoking <- cbind(Smoking, response)

formula <- response~TVPS+BPNL+COHB~TarY+NicY+COY+Sex+Age

# Estimation per leave-one-out estimate for the misclassification is 
# 36.36\% (Hand et al., 2001), using indirect classification with 
# linear models

errorest(formula, data = smoking, model = inclass,estimator = "cv", 
         pFUN = list(list(model=lm, predict = mypredict.lm)), cFUN = resp,  
         est.para=control.errorest(k=nrow(smoking)))


# Regression

data(BostonHousing)

# 10-fold cv of lm for Boston Housing data
errorest(medv ~ ., data=BostonHousing, model=lm,
         est.para=control.errorest(random=FALSE))

# the same, with "model" returning a function for prediction
# instead of an object of class "lm"

mylm <- function(formula, data) {
  mod <- lm(formula, data)
  function(newdata) predict(mod, newdata)
}

errorest(medv ~ ., data=BostonHousing, model=mylm,
est.para=control.errorest(random=FALSE))


# Survival data

data(GBSG2)

# prediction is fitted Kaplan-Meier
predict.survfit <- function(object, newdata) object

# 5-fold cv of Kaplan-Meier for GBSG2 study
errorest(Surv(time, cens) ~ 1, data=GBSG2, model=survfit,
         predict=predict.survfit, est.para=control.errorest(k=5))


}

\keyword{misc}
\eof
\name{inbagg}
\alias{inbagg}
\alias{inbagg.default}
\alias{inbagg.data.frame}

\title{Indirect Bagging}

\description{
 Function to perform the indirect bagging and subagging.
}

\usage{
inbagg.data.frame(formula, data, pFUN=NULL, 
  cFUN=list(model = NULL, predict = NULL, training.set = NULL), 
  nbagg = 25, ns = 0.5, replace = FALSE, ...)
}

\arguments{
  \item{formula}{formula. A \code{formula} specified as \code{y~w1+w2+w3~x1+x2+x3} describes how to model the intermediate variables \code{w1, w2, w3} and the response variable \code{y}, if no other formula is specified by the elements of \code{pFUN} or in \code{cFUN}.}
  \item{data}{data frame of explanatory, intermediate and response variables.}
  \item{pFUN}{list of lists, which describe models for the intermediate variables, details are given below.}
  \item{cFUN}{either a fixed function with argument \code{newdata} and returning the class membership by default, or a list specifying a classifying model, similar to one element of \code{pFUN}. Details are given below.}
  \item{nbagg}{number of bootstrap samples.}
  \item{ns}{proportion of sample to be drawn from the learning
    sample. By default, subagging with 50\% is performed, i.e. draw
    0.5*n out of n without replacement.}
  \item{replace}{logical. Draw with or without replacement.}
  \item{\dots}{additional arguments (e.g. \code{subset}).}
}
\details{
 A given data set is subdivided into three types of variables: explanatory, intermediate and response variables.\cr

 Here, each specified intermediate variable is modelled separately
 following \code{pFUN}, a list of lists with elements specifying an
arbitrary number of models for the intermediate variables and an
optional element \code{training.set = c("oob", "bag", "all")}. The
element \code{training.set} determines whether predictive models for
the intermediate variables are calculated based on the out-of-bag sample
(\code{"oob"}), the default, on the bag sample (\code{"bag"}) or on all
available observations (\code{"all"}). The elements of \code{pFUN},
specifying the models for the intermediate variables are lists as
described in \code{\link{inclass}}.
Note that, if no formula is given in these elements, the functional
relationship of \code{formula} is used.\cr

The response variable is modelled following \code{cFUN}.
This can either be a fixed classifying function as described in Peters
et al. (2003) or a list,
which specifies the  modelling technique to be applied. The list
contains the arguments \code{model} (which model to be fitted),
\code{predict} (optional, how to predict), \code{formula} (optional, of
type \code{y~w1+w2+w3+x1+x2} determines the variables the classifying
function is based on) and the optional argument \code{training.set =
  c("fitted.bag", "original", "fitted.subset")}
specifying whether the classifying function is trained on the predicted
observations of the bag sample (\code{"fitted.bag"}),
on the original observations (\code{"original"}) or on the
predicted observations not included in a defined subset
(\code{"fitted.subset"}). By default, the formula specified in
\code{formula} determines the variables the classifying function is
based on.\cr

Note that the default of \code{cFUN = list(model = NULL, training.set = "fitted.bag")}
uses the function \code{\link[rpart]{rpart}} and
the predict function \code{predict(object, newdata, type = "class")}.
}
\value{
  An object of class \code{"inbagg"}, that is a list with elements
  \item{mtrees}{a list of length \code{nbagg}, describing the prediction
    models corresponding
    to each bootstrap sample. Each element of \code{mtrees}
    is a list with elements \code{bindx} (observations of bag sample),
    \code{btree} (classifying function of bag sample) and \code{bfct} (predictive models for intermediates of bag sample).}
  \item{y}{vector of response values.}
  \item{W}{data frame of intermediate variables.}
  \item{X}{data frame of explanatory variables.}
}

\references{
David J. Hand, Hua Gui Li, Niall M. Adams (2001),
Supervised classification with structured class definitions.
\emph{Computational Statistics \& Data Analysis} \bold{36},
209--225.

Andrea Peters, Berthold Lausen, Georg Michelson and Olaf Gefeller (2003),
Diagnosis of glaucoma by indirect classifiers.
\emph{Methods of Information in Medicine} \bold{1}, 99--103.
}

\author{Andrea Peters <Peters.Andrea@imbe.imed.uni-erlangen.de>}

\seealso{\code{\link[rpart]{rpart}}, \code{\link{bagging}},
\code{\link{lm}}}


\examples{

library(mvtnorm)
y <- as.factor(sample(1:2, 100, replace = TRUE))
W <- rmvnorm(200, mean = rep(0, 3))

X <- rmvnorm(200, mean = rep(2, 3))
colnames(W) <- c("w1", "w2", "w3") 
colnames(X) <- c("x1", "x2", "x3") 
DATA <- data.frame(y, W, X)


pFUN <- list(list(formula = w1~x1+x2, model = lm, predict = mypredict.lm),
list(model = rpart))

inbagg(y~w1+w2+w3~x1+x2+x3, data = DATA, pFUN = pFUN)
}
\keyword{misc}
\eof
\name{inclass}
\alias{inclass}
\alias{inclass.default}
\alias{inclass.data.frame}

\title{Indirect Classification}

\description{
A framework for the indirect classification approach.
}

\usage{
\method{inclass}{data.frame}(formula, data, pFUN = NULL, cFUN = NULL, ...)
}
\arguments{
  \item{formula}{formula. A \code{formula} specified as
    \code{y~w1+w2+w3~x1+x2+x3} models each intermediate variable
    \code{w1, w2, w3} by \code{wi~x1+x2+x3} and the response by
    \code{y~w1+w2+w3} if no other formulas are given in \code{pFUN} or \code{cFUN}.}
  \item{data}{data frame of explanatory, intermediate and response variables.}
  \item{pFUN}{list of lists, which describe models for the intermediate variables, see below for details.}
  \item{cFUN}{either a function or a list which describes the model for the
response variable. The function has the argument \code{newdata} only.}
  \item{\dots}{additional arguments, passed to model fitting of the
    response variable.} 
}

\details{
A given data set is subdivided into three types of variables: those to be
used for predicting the class (explanatory variables), those to be used for
defining the class (intermediate variables) and the class membership variable itself
(response variable). Intermediate variables are modelled based on the
explanatory variables, the class membership variable is defined on the
intermediate variables.\cr

Each specified intermediate variable is modelled separately 
following \code{pFUN} and a formula specified by \code{formula}.
\code{pFUN} is a list of lists, the maximum length of
\code{pFUN} is the number of intermediate variables. Each element of
\code{pFUN} is a list with elements:\cr
  \code{model} -  a function with arguments \code{formula} and
  \code{data}; \cr
  \code{predict} - an optional function with arguments \code{object, newdata} only, 
  if \code{predict} is not specified, the predict method of \code{model}
  is used; \cr
  \code{formula} - specifies the formula for the corresponding
  \code{model} (optional),
    the formula described in \code{y~w1+w2+w3~x1+x2+x3} is used if no other is
  specified.
\cr
  
The response is classified following \code{cFUN}, which is either a fixed
function or a list as described below. The determined function \code{cFUN} assigns the intermediate (and
explanatory) variables to a certain class membership, the list
\code{cFUN} has the elements \code{formula, model, predict} and
\code{training.set}. The elements \code{formula, model, predict} are
structured as described by \code{pFUN}, the described model is
trained on the original (intermediate variables) if \code{training.set="original"}
or if \code{training.set = NULL}, on the fitted values if
\code{training.set = "fitted"} or on observations not included in a
specified subset if \code{training.set = "subset"}.
\cr 

A list of prediction models corresponding to each 
intermediate variable, a predictive function for the response, a list of
specifications for the intermediate and for the response are returned. \cr
For a detailed description on indirect
classification see Hand et al. (2001).
}
\value{
 An object of class \code{inclass}, consisting of a list of 
  \item{model.intermediate}{list of fitted models for each intermediate
variable.}
  \item{model.response}{predictive model for the response variable.}
  \item{para.intermediate}{list, where each element is again a list and specifies
the model for each intermediate variable.}
  \item{para.response}{a list which specifies the model for response variable.}
}
\references{
David J. Hand, Hua Gui Li, Niall M. Adams (2001),
Supervised classification with structured class definitions.
\emph{Computational Statistics \& Data Analysis} \bold{36},
209--225.

Andrea Peters, Berthold Lausen, Georg Michelson and Olaf Gefeller (2003),
Diagnosis of glaucoma by indirect classifiers.
\emph{Methods of Information in Medicine} \bold{1}, 99--103.
}

\author{Andrea Peters <Peters.Andrea@imbe.imed.uni-erlangen.de>}

\seealso{\code{\link{bagging}}, \code{\link{predict.inclass}}}

\examples{
data(Smoking)
# Set three groups of variables:
# 1) explanatory variables are: TarY, NicY, COY, Sex, Age
# 2) intermediate variables are: TVPS, BPNL, COHB
# 3) response (resp) is defined by:

classify <- function(data){
  data <- data[,c("TVPS", "BPNL", "COHB")]
  res <- t(t(data) > c(4438, 232.5, 58))
  res <- as.factor(ifelse(apply(res, 1, sum) > 2, 1, 0))
  res
}

response <- classify(Smoking[ ,c("TVPS", "BPNL", "COHB")])
smoking <- data.frame(Smoking, response)

formula <- response~TVPS+BPNL+COHB~TarY+NicY+COY+Sex+Age

inclass(formula, data = smoking, pFUN = list(list(model = lm, predict =
mypredict.lm)), cFUN = classify)

}

\keyword{misc}
\eof
\name{ipred-internal}
\alias{getsurv}
\title{Internal ipred functions}
\description{
 Internal ipred functions.
}
\usage{
getsurv(obj, times)
}
\details{
  These functions are not to be called by the user.
}
\keyword{internal}
\eof
\name{ipredknn}
\alias{ipredknn}
\title{ k-Nearest Neighbour Classification }
\description{
  \eqn{k}-nearest neighbour classification with an interface compatible to
\code{\link{bagging}} and \code{\link{errorest}}.
}
\usage{
ipredknn(formula, data, subset, na.action, k=5, \dots)
}
\arguments{
  \item{formula}{a formula of the form \code{lhs ~ rhs} where \code{lhs} 
                 is the response variable and \code{rhs} a set of
                 predictors.}
  \item{data}{optional data frame containing the variables in the
              model formula.} 
  \item{subset}{optional vector specifying a subset of observations
                to be used.}
  \item{na.action}{function which indicates what should happen when
                   the data contain \code{NA}s.}
  \item{k}{number of neighbours considered, defaults to 5.}
  \item{...}{additional parameters.}
}

\details{
  This is a wrapper to \code{\link[class]{knn}} in order to be able to 
  use k-NN in \code{\link{bagging}} and \code{\link{errorest}}. 
}

\value{
  An object of class \code{ipredknn}. See \code{\link{predict.ipredknn}}.
}

\author{ Torsten Hothorn <Torsten.Hothorn@rzmail.uni-erlangen.de> }

\examples{

learn <- as.data.frame(mlbench.twonorm(300))

mypredict.knn <- function(object, newdata) 
                   predict.ipredknn(object, newdata, type="class")

errorest(classes ~., data=learn, model=ipredknn, 
         predict=mypredict.knn)


}
\keyword{multivariate}
\eof
\name{kfoldcv}
\alias{kfoldcv}
\title{ Subsamples for k-fold Cross-Validation }
\description{
  Computes feasible sample sizes for the k groups in k-fold cv if N/k is not
an integer.
}
\usage{
kfoldcv(k, N, nlevel=NULL)
}
\arguments{
  \item{k}{ number of groups. }
  \item{N}{ total sample size. }
  \item{nlevel}{ a vector of sample sizes for stratified sampling.}
}
\details{
  If N/k is not an integer, k-fold cv is not unique. Determine meaningful
  sample sizes.
}
\value{
  A vector of length \code{k}.
}
\author{ Torsten Hothorn <Torsten.Hothorn@rzmail.uni-erlangen.de> }
\examples{

# 10-fold CV with N = 91

kfoldcv(10, 91)	

\testonly{
k <- sample(5:15, 1)
k
N <- sample(50:150, 1)
N
stopifnot(sum(kfoldcv(k, N)) == N)
}

}
\keyword{misc}
\eof
\name{mypredict.lm}
\alias{mypredict.lm}
\title{Predictions Based on Linear Models}
\description{
Function to predict a vector of full length (number of observations), where predictions corresponding to missing
explanatory values are replaced by \code{NA}.
}

\usage{
mypredict.lm(object, newdata)
}

\arguments{
  \item{object}{an object of class \code{lm}.}
  \item{newdata}{matrix or data frame to be predicted according to \code{object}.}
}

\value{
Vector of predicted values.
}

\author{Andrea Peters <Peters.Andrea@imbe.imed.uni-erlangen.de>}

\note{\code{predict.lm} delivers a vector of reduced length, i.e. rows where
explanatory variables are missing are omitted. The full length of the
predicted observation vector is necessary in the indirect classification
approach (\code{\link{predict.inclass}}).}

\keyword{misc}
\eof
\name{predict.classbagg}
\alias{predict.classbagg}
\alias{predict.regbagg}
\alias{predict.survbagg}
\title{ Predictions from Bagging Trees }
\description{
  Predict the outcome of a new observation based on multiple trees.  
}
\usage{
predict.classbagg(object, newdata=NULL, type=c("class", "prob"),
                            aggregation=c("majority", "average", "weighted"), \dots)
predict.regbagg(object, newdata=NULL, aggregation=c("average",
                "weighted"), \dots)
predict.survbagg(object, newdata=NULL,\dots)
}
\arguments{
  \item{object}{object of classes \code{classbagg}, \code{regbagg} or
                \code{survbagg}.}
  \item{newdata}{a data frame of new observations. }
  \item{type}{character string denoting the type of predicted value
             returned for classification trees. Either \code{class} 
             (predicted classes are returned) or \code{prob} 
             (estimated class probabilities are returned).}
  \item{aggregation}{character string specifying how to aggregate, see below.}
  \item{...}{additional arguments, currently not passed to any function.}
}
\details{
  There are (at least) three different ways to aggregate the predictions of
bagging classification trees. Most famous is class majority voting
(\code{aggregation="majority"}) where the most frequent class is returned. The
second way is choosing the class with maximal averaged class probability
(\code{aggregation="average"}). The third method is based on the "aggregated learning
sample", introduced by Hothorn et al. (2003) for survival trees.
The prediction of a new observation is the majority class, mean or
Kaplan-Meier curve of all observations from the learning sample 
identified by the \code{nbagg} leaves containing the new observation.  
For regression trees, only averaged or weighted predictions are possible. 

By default, the out-of-bag estimate is computed if \code{newdata} is NOT
specified. Therefore, the predictions of \code{predict(object)} are "honest"
in some way (this is not possible for combined models via \code{comb} in
\code{\link{bagging}}). 
If you like to compute the predictions for the learning sample
itself, use \code{newdata} to specify your data. 

}
\value{
  The predicted class or estimated class probabilities are returned for
classification trees. The predicted endpoint is returned in regression
problems and the predicted Kaplan-Meier curve is returned for survival
trees. 
}

\author{ Torsten Hothorn <Torsten.Hothorn@rzmail.uni-erlangen.de> }

\references{ 

Leo Breiman (1996), Bagging Predictors. \emph{Machine Learning}
\bold{24}(2), 123--140.

Torsten Hothorn, Berthold Lausen, Axel Benner and Martin
Radespiel-Troeger (2003), Bagging Survival Trees. \emph{Statistics in
Medicine} (accepted).
Preprint available from
\url{http://www.mathpreprints.com/math/Preprint/blausen/20020518/2}.
 


}

\examples{

data(Ionosphere)
Ionosphere$V2 <- NULL # constant within groups

# nbagg = 10 for performance reasons here
mod <- bagging(Class ~ ., data=Ionosphere)

# out-of-bag estimate

mean(predict(mod) != Ionosphere$Class)

# predictions for the first 10 observations

predict(mod, newdata=Ionosphere[1:10,])

predict(mod, newdata=Ionosphere[1:10,], type="prob")

}
\keyword{tree}
\eof
\name{predict.inbagg}
\alias{predict.inbagg}

\title{Predictions from an Inbagg Object}
\description{
Predicts the class membership of new observations through indirect
bagging.
}
\usage{
predict.inbagg(object, newdata, ...)
}
\arguments{
  \item{object}{object of class \code{inbagg}, see \code{\link{inbagg}}.}
  \item{newdata}{data frame to be classified.}
  \item{\dots}{additional arguments corresponding to the predictive models.}
}
\details{
Predictions of class memberships are calculated, i.e. values of the
intermediate variables are predicted following \code{pFUN} and classified following \code{cFUN},
see \code{\link{inbagg}}.
}
\value{
The vector of predicted classes is returned.
}

\references{
David J. Hand, Hua Gui Li, Niall M. Adams (2001),
Supervised classification with structured class definitions.
\emph{Computational Statistics \& Data Analysis} \bold{36}, 
209--225.

Andrea Peters, Berthold Lausen, Georg Michelson and Olaf Gefeller (2003),
Diagnosis of glaucoma by indirect classifiers.
\emph{Methods of Information in Medicine} \bold{1}, 99--103.
}        

\author{Andrea Peters <Peters.Andrea@imbe.imed.uni-erlangen.de>}

\seealso{\code{\link{inbagg}}}

\examples{
library(mvtnorm)
y <- as.factor(sample(1:2, 100, replace = TRUE))
W <- rmvnorm(200, mean = rep(0, 3))

X <- rmvnorm(200, mean = rep(2, 3))
colnames(W) <- c("w1", "w2", "w3")
colnames(X) <- c("x1", "x2", "x3")
DATA <- data.frame(y, W, X)


pFUN <- list(list(formula = w1~x1+x2, model = lm),
list(model = rpart))

RES <- inbagg(y~w1+w2+w3~x1+x2+x3, data = DATA, pFUN = pFUN)
predict(RES, newdata = X)
}
\keyword{misc}
\eof
\name{predict.inclass}
\alias{predict.inclass}

\title{Predictions from an Inclass Object}

\description{
Predicts the class membership of new observations through indirect
classification.
}

\usage{
predict.inclass(object, newdata, ...)
}

\arguments{
  \item{object}{ object of class \code{inclass}, see \code{\link{inclass}}.}
  \item{newdata}{data frame to be classified.}
  \item{\dots}{additional arguments corresponding to the predictive models 
specified in \code{\link{inclass}}.}
}
\details{
Predictions of class memberships are calculated, i.e. values of the
intermediate variables are predicted and classified following \code{cFUN},
see \code{\link{inclass}}.
}
\value{
The vector of predicted classes is returned.
}
\references{
David J. Hand, Hua Gui Li, Niall M. Adams (2001),
Supervised classification with structured class definitions.
\emph{Computational Statistics \& Data Analysis} \bold{36},
209--225.

Andrea Peters, Berthold Lausen, Georg Michelson and Olaf Gefeller (2003),
Diagnosis of glaucoma by indirect classifiers.
\emph{Methods of Information in Medicine} \bold{1}, 99--103.
}

\author{Andrea Peters <Peters.Andrea@imbe.imed.uni-erlangen.de>}

\seealso{\code{\link{inclass}}}

\examples{
# Simulation model, classification rule following Hand et al. (2001)

theta90 <- varset(N = 1000, sigma = 0.1, theta = 90, threshold = 0)

dataset <- as.data.frame(cbind(theta90$explanatory, theta90$intermediate))
names(dataset) <- c(colnames(theta90$explanatory),
colnames(theta90$intermediate))

classify <- function(Y, threshold = 0) {
  Y <- Y[,c("y1", "y2")]
  z <- (Y > threshold)
  resp <- as.factor(ifelse((z[,1] + z[,2]) > 1, 1, 0))
  return(resp)
}

formula <- response~y1+y2~x1+x2

fit <- inclass(formula, data = dataset, pFUN = list(list(model = lm)), 
 cFUN = classify)

predict(object = fit, newdata = dataset)


data(Smoking)

# explanatory variables are: TarY, NicY, COY, Sex, Age
# intermediate variables are: TVPS, BPNL, COHB
# response is defined by:

classify <- function(data){
  data <- data[,c("TVPS", "BPNL", "COHB")]
  res <- t(t(data) > c(4438, 232.5, 58))
  res <- as.factor(ifelse(apply(res, 1, sum) > 2, 1, 0))
  res
}

response <- classify(Smoking[ ,c("TVPS", "BPNL", "COHB")])
smoking <- cbind(Smoking, response)

formula <- response~TVPS+BPNL+COHB~TarY+NicY+COY+Sex+Age

fit <- inclass(formula, data = smoking, 
  pFUN = list(list(model = lm)), cFUN = classify)


predict(object = fit, newdata = smoking)


data(GlaucomaMVF)
glaucoma <- GlaucomaMVF[,(names(GlaucomaMVF) != "tension")]
# explanatory variables are derived by laser scanning image and intraocular pressure
# intermediate variables are: clv, cs, lora
# response is defined by

classify <- function (data) {
  attach(data) 
  res <- ifelse((!is.na(clv) & !is.na(lora) & clv >= 5.1 & lora >= 
        49.23372) | (!is.na(clv) & !is.na(lora) & !is.na(cs) & 
        clv < 5.1 & lora >= 58.55409 & cs < 1.405) | (is.na(clv) & 
        !is.na(lora) & !is.na(cs) & lora >= 58.55409 & cs < 1.405) | 
        (!is.na(clv) & is.na(lora) & cs < 1.405), 0, 1)
  detach(data)
  factor (res, labels = c("glaucoma", "normal"))
}

fit <- inclass(Class~clv+lora+cs~., data = glaucoma, 
             pFUN = list(list(model = rpart)), cFUN = classify)

data(GlaucomaM)
predict(object = fit, newdata = GlaucomaM)

}

\keyword{misc}
\eof
\name{predict.ipredknn}
\alias{predict.ipredknn}
\title{ Predictions from k-Nearest Neighbors }
\description{
  Predict the class of a new observation based on k-NN. 
}
\usage{
\method{predict}{ipredknn}(object, newdata, type=c("prob", "class"), ...)
}
\arguments{
  \item{object}{object of class \code{ipredknn}.}
  \item{newdata}{a data frame of new observations. }
  \item{type}{return either class probability estimates or the predicted
class itself.}
  \item{...}{additional arguments.}
}
\details{
  This function is a method for the generic function \code{\link{predict}}
for class \code{ipredknn}. For the details see \code{\link[class]{knn}}.

}
\value{
  Either a matrix of predicted class probabilities or the predicted classes
themselves.
}

\author{ Torsten.Hothorn <Torsten.Hothorn@rzmail.uni-erlangen.de> }

\keyword{multivariate}
\eof
\name{predict.slda}
\alias{predict.slda}
\title{ Predictions from Stabilised Linear Discriminant Analysis }
\description{
  Predict the class of a new observation based on stabilised LDA.  
}
\usage{
\method{predict}{slda}(object, newdata, ...)
}
\arguments{
  \item{object}{object of class \code{slda}.}
  \item{newdata}{a data frame of new observations. }
  \item{...}{additional arguments passed to
\code{\link[MASS]{predict.lda}}.}
}
\details{
  This function is a method for the generic function \code{\link{predict}}
for class \code{slda}. For the details see \code{\link[MASS]{predict.lda}}.

}
\value{
A list with components
\item{class}{the predicted class (a factor).}
\item{posterior}{posterior probabilities for the classes.}
\item{x}{the scores of test cases.}
}

\author{ Torsten.Hothorn <Torsten.Hothorn@rzmail.uni-erlangen.de> }

\keyword{multivariate}
\eof
\name{print.classbagg}
\alias{print}
\alias{print.classbagg}
\alias{print.regbagg}
\alias{print.survbagg}
\title{Print Method for Bagging Trees}
\description{
  Print objects returned by \code{\link{bagging}} in nice layout.
}
\usage{
\method{print}{classbagg}(x, digits, \dots)
}
\arguments{
  \item{x}{object returned by \code{\link{bagging}}.}
  \item{digits}{how many digits should be printed.}
  \item{\dots}{further arguments to be passed to or from methods.}  
}
\value{
  none
}
\author{ Torsten.Hothorn <Torsten.Hothorn@rzmail.uni-erlangen.de> }
\keyword{tree}
\eof
\name{print.cvclass}
\alias{print.cvclass}
\alias{print.cvreg}
\alias{print.cvsurv}
\alias{print.bootestclass}
\alias{print.bootestreg}
\alias{print.bootestsurv}
\title{Print Method for Error Rate Estimators}
\description{
 Print objects returned by \code{\link{errorest}} in nice layout.
}
\usage{
\method{print}{cvclass}(x, digits=4, ...)
}

\arguments{
  \item{x}{an object returned by \code{\link{errorest}}.}
  \item{digits}{how many digits should be printed.}
  \item{\dots}{further arguments to be passed to or from methods.}
}
\value{
  none
}
\author{ Torsten.Hothorn <Torsten.Hothorn@rzmail.uni-erlangen.de> }
\keyword{misc}
\eof
\name{print.inbagg}
\alias{print.inbagg}
\title{Print Method for Inbagg Object}
\description{
Print object of class \code{inbagg} in nice layout.
}
\usage{
\method{print}{inbagg}(x, ...)
}
\arguments{
  \item{x}{object of class \code{inbagg}.}
  \item{\dots}{additional arguments.}
}
\details{
An object of class \code{inbagg} is printed. Information about number and names of the intermediate variables,
and the number of drawn bootstrap samples is given.
}
\author{Andrea Peters <Peters.Andrea@imbe.imed.uni-erlangen.de>}
\keyword{misc}
\eof
\name{print.inclass}
\alias{print.inclass}
\title{Print Method for Inclass Object}
\description{
Print object of class \code{inclass} in nice layout.
}
\usage{
\method{print}{inclass}(x, ...)
}
\arguments{
  \item{x}{object of class \code{inclass}.}
  \item{\dots}{additional arguments.}
}
\details{
An object of class \code{inclass} is printed. Information about number and names of the intermediate variables, the used modelling technique and the number of
drawn bootstrap samples is given.
}
\author{Andrea Peters <Peters.Andrea@imbe.imed.uni-erlangen.de>}
\keyword{misc}
\eof
\name{prune.classbagg}
\alias{prune.classbagg}
\alias{prune.regbagg}
\alias{prune.survbagg}
\title{ Pruning for Bagging  }
\description{
  Prune each of the trees returned by \code{\link{bagging}}. 
}
\usage{
\method{prune}{classbagg}(tree, cp=0.01,...)
}
\arguments{
  \item{tree}{ an object returned by \code{\link{bagging}} 
              (calling this \code{tree} is needed by the generic function 
              \code{prune} in package \code{rpart}).}
  \item{cp}{complexity parameter, see \code{\link[rpart]{prune.rpart}}.}
  \item{...}{additional arguments to \code{\link[rpart]{prune.rpart}}.}
}
\details{
  By default, \code{\link{bagging}} grows classification 
  trees of maximal size. One may want to prune each tree, however, 
  it is not clear whether or not this may decrease prediction error. 
}
\value{
  An object of the same class as \code{tree} with the trees pruned. 
}
\author{ Torsten.Hothorn <Torsten.Hothorn@rzmail.uni-erlangen.de> }
\examples{

data(Glass)

mod <- bagging(Type ~ ., data=Glass, nbagg=10, coob=TRUE)
pmod <- prune(mod)
print(pmod)


}
\keyword{tree}
\eof
\name{rsurv}
\alias{rsurv}
\title{ Simulate Survival Data }
\description{
  Simulation Setup for Survival Data.
}
\usage{
rsurv(N, model=c("A", "B", "C", "D", "tree"), gamma=NULL, fact=1, pnon=10,
      gethaz=FALSE)
}
\arguments{
  \item{N}{ number of observations. }
  \item{model}{ type of model. }
  \item{gamma}{simulate censoring time as \code{runif(N, 0, gamma)}.
               Defaults to \code{NULL} (no censoring).}
  \item{fact}{scale parameter for \code{model="tree"}.}
  \item{pnon}{number of additional non-informative variables for the tree
              model.}
  \item{gethaz}{logical, indicating whether the hazard rate for each 
                observation should be returned.}
}
\details{
  Simulation setup similar to configurations used in LeBlanc and Crowley
(1992) or Keles and Segal (2002) as well as a tree model used in Hothorn et
al. (2002). See Hothorn et al. (2003) for the details. 
}
\value{
  A data  frame with elements \code{time}, \code{cens}, \code{X1} ...
\code{X5}. If \code{pnon} > 0, additional noninformative covariables are
added. If \code{gethaz=TRUE}, the \code{hazard} attribute returns the hazard
rates.
}
\references{ 

 M. LeBlanc and J. Crowley (1992), Relative Risk Trees for 
 Censored Survival Data. \emph{Biometrics} \bold{48}, 411--425.

 S. Keles and M. R. Segal (2002), Residual-based tree-structured 
 survival analysis. \emph{Statistics in Medicine}, \bold{21}, 313--326.

 Torsten Hothorn, Berthold Lausen, Axel Benner and Martin
 Radespiel-Troeger (2003), Bagging Survival Trees. \emph{Statistics in
 Medicine} (accepted). Preprint available from
 \url{http://www.mathpreprints.com/math/Preprint/blausen/20020518/2}.
 

}
\examples{

# 3*X1 + X2
simdat <- rsurv(500, model="C")
coxph(Surv(time, cens) ~ ., data=simdat)

}
\keyword{survival}
\eof
\name{sbrier}
\alias{sbrier}
\title{ Model Fit for Survival Data }
\description{
  Model fit for survival data: the integrated Brier score for censored
observations.
}
\usage{
sbrier(obj, pred, btime=c(0, max(obj[,1])))
}
\arguments{
  \item{obj}{an object of class \code{Surv}.}
  \item{pred}{predicted values. Either a probability or a list of
               \code{survfit} objects. }
  \item{btime}{numeric vector of times, the integrated Brier score is
               computed if this is of \code{length > 1}. 
               The Brier score at \code{btime}
               is returned otherwise.}
}
\details{
  There is no obvious criterion of model fit for censored data. The Brier
score for censoring as well as its integrated version were suggested by
Graf et al. (1999).
}
\value{
  The (integrated) Brier score with attribute \code{time} is returned. 
}
\references{ 

Erika Graf, Claudia Schmoor, Willi Sauerbrei and Martin Schumacher (1999),
Assessment and comparison of prognostic classification schemes for
survival data. \emph{Statistics in Medicine} \bold{18}(17-18), 2529--2545.

}

\examples{

data(DLBCL)
smod <- Surv(DLBCL$time, DLBCL$cens)

KM <- survfit(smod)
# integrated Brier score up to max(DLBCL$time)
sbrier(smod, KM)

# integrated Brier score up to time=50
sbrier(smod, KM, btime=c(0, 50))

# Brier score for time=50
sbrier(smod, KM, btime=50)

# a "real" model: one single survival tree with Intern. Prognostic Index
# and mean gene expression in the first cluster as predictors
mod <- bagging(Surv(time, cens) ~ MGEc.1 + IPI, data=DLBCL, nbagg=1)

# this is a list of survfit objects (==KM-curves), one for each observation
# in DLBCL
pred <- predict(mod, newdata=DLBCL)

# integrated Brier score up to max(time)
sbrier(smod, pred)

# Brier score at time=50
sbrier(smod, pred, btime=50)
# artificial examples and illustrations

cleans <- function(x) { attr(x, "time") <- NULL; names(x) <- NULL; x }

n <- 100
time <- rpois(n, 20)
cens <- rep(1, n)

# checks, Graf et al. page 2536, no censoring at all!
# no information: \pi(t) = 0.5 

a <- sbrier(Surv(time, cens), rep(0.5, n), time[50])
stopifnot(all.equal(cleans(a),0.25))

# some information: \pi(t) = S(t)

n <- 100
time <- 1:100
mod <- survfit(Surv(time, cens))
a <- sbrier(Surv(time, cens), rep(list(mod), n))
mymin <- mod$surv * (1 - mod$surv)
stopifnot(all.equal(cleans(a),sum(mymin)/max(time)))

# independent of ordering
rand <- sample(1:100)
b <- sbrier(Surv(time, cens)[rand], rep(list(mod), n)[rand])
stopifnot(all.equal(cleans(a), cleans(b)))

\testonly{
  # total information: \pi(t | X) known for every obs

  time <- 1:10
  cens <- rep(1,10)
  pred <- diag(10)
  pred[upper.tri(pred)] <- 1
  diag(pred) <- 0
  # <FIXME>
  # a <- sbrier(Surv(time, cens), pred)
  # stopifnot(all.equal(a, 0))
  # </FIXME>
}

# 2 groups at different risk

time <- c(1:10, 21:30)
strata <- c(rep(1, 10), rep(2, 10))
cens <- rep(1, length(time))

# no information about the groups

a <- sbrier(Surv(time, cens), survfit(Surv(time, cens)))
b <- sbrier(Surv(time, cens), rep(list(survfit(Surv(time, cens))), 20))
stopifnot(all.equal(a, b))

# risk groups known

mod <- survfit(Surv(time, cens) ~ strata)
b <- sbrier(Surv(time, cens), c(rep(list(mod[1]), 10), rep(list(mod[2]), 10)))
stopifnot(a > b)

}
\keyword{survival}
\eof
\name{slda}
\alias{slda}
\alias{slda.default}
\alias{slda.formula}
\alias{slda.factor}
\title{ Stabilised Linear Discriminant Analysis }
\description{
  Linear discriminant analysis based on left-spherically 
  distributed linear scores.
}
\usage{
\method{slda}{formula}(formula, data, subset, na.action=na.rpart, \dots)
\method{slda}{factor}(y, X, q=NULL, \dots)
}
\arguments{
  \item{y}{the response variable: a factor vector of class labels.}
  \item{X}{a data frame of predictor variables.}
  \item{q}{the number of positive eigenvalues the scores are derived from,
           see below.}
  \item{formula}{a formula of the form \code{lhs ~ rhs} where \code{lhs} 
                 is the response variable and \code{rhs} a set of
                 predictors.}
  \item{data}{optional data frame containing the variables in the
              model formula.} 
  \item{subset}{optional vector specifying a subset of observations
                to be used.}
  \item{na.action}{function which indicates what should happen when
                   the data contain \code{NA}s.  Defaults to
                   \code{\link[rpart]{na.rpart}}.}
  \item{...}{additional parameters passed to \code{\link[MASS]{lda}}.}
}

\details{
  This function implements the LDA for \eqn{q}-dimensional linear scores of
the original \eqn{p} predictors derived from the \eqn{PC_q} rule by Laeuter
et al. (1998). Based on the product sum matrix 
\deqn{W = (X - \bar{X})^\top(X - \bar{X})}
the eigenvalue problem \eqn{WD = diag(W)DL} is solved. The first \eqn{q}
columns \eqn{D_q} of \eqn{D} are used as a weight matrix for the 
original \eqn{p} predictors: \eqn{XD_q}. By default, \eqn{q} is the number
of eigenvalues greater than one. The \eqn{q}-dimensional linear scores are
left-spherically distributed and are used as predictors for a classical 
LDA. 

This form of reduction of the dimensionality was 
developed for discriminant analysis problems by Laeuter (1992) and was used
for multivariate tests by Laeuter et al. (1998), Kropf (2000) gives an
overview. For details on left-spherical distributions see Fang and 
Zhang (1990).  

}

\value{
  An object of class \code{slda}, a list with components
  \item{scores}{the weight matrix.}
  \item{mylda}{an object of class \code{lda}.}
}

\seealso{
  \code{\link{predict.slda}}
}

\references{ 

Fang Kai-Tai and Zhang Yao-Ting (1990), \emph{Generalized Multivariate
Analysis}, Springer, Berlin.

Siegfried Kropf (2000), \emph{Hochdimensionale multivariate Verfahren in der
medizinischen Statistik}, Shaker Verlag, Aachen (in German).

Juergen Laeuter (1992), \emph{Stabile multivariate Verfahren},
Akademie Verlag, Berlin (in German).

Juergen Laeuter, Ekkehard Glimm and Siegfried Kropf (1998), Multivariate
Tests Based on Left-Spherically Distributed Linear Scores. \emph{The Annals
of Statistics}, \bold{26}(5), 1972--1988. 



}

\author{ Torsten.Hothorn <Torsten.Hothorn@rzmail.uni-erlangen.de> }
\examples{

learn <- as.data.frame(mlbench.twonorm(100))
test <- as.data.frame(mlbench.twonorm(1000))

mlda <- lda(classes ~ ., data=learn)
mslda <- slda(classes ~ ., data=learn)

print(mean(predict(mlda, newdata=test)$class != test$classes))
print(mean(predict(mslda, newdata=test)$class != test$classes))

}
\keyword{multivariate}
\eof
\name{summary.classbagg}
\alias{summary.classbagg}
\alias{summary.regbagg}
\alias{summary.survbagg}
\alias{print.summary.bagging}
\title{Summarising Bagging}
\description{
  \code{summary} method for objects returned by \code{\link{bagging}}.
}
\usage{
\method{summary}{classbagg}(object, \dots)
}
\arguments{
  \item{object}{object returned by \code{\link{bagging}}.}
  \item{\dots}{further arguments to be passed to or from methods.}  
}
\details{
  A representation of all trees in the object is printed.
}
\value{
  none
}
\author{ Torsten.Hothorn <Torsten.Hothorn@rzmail.uni-erlangen.de> }
\keyword{tree}
\eof
\name{summary.inbagg}
\alias{summary.inbagg}
\alias{print.summary.inbagg}
\title{Summarising Inbagg}
\description{
Summary of inbagg is returned.
}
\usage{
\method{summary}{inbagg}(object, ...)
}
\arguments{
  \item{object}{an object of class \code{inbagg}.}
  \item{\dots}{additional arguments.}
}
\details{
A representation of an indirect bagging model 
(the intermediate variables, the number of bootstrap samples, the trees) is printed.
}
\value{
none
}


\author{Andrea Peters <Peters.Andrea@imbe.imed.uni-erlangen.de>}


\seealso{\code{\link{print.summary.inbagg}}}


\keyword{misc}

\eof
\name{summary.inclass}
\alias{summary.inclass}
\alias{print.summary.inclass}
\title{Summarising Inclass}
\description{
Summary of inclass is returned.
}
\usage{
\method{summary}{inclass}(object, ...)
}
\arguments{
  \item{object}{an object of class \code{inclass}.}
  \item{\dots}{additional arguments.}
}
\details{
A representation of an indirect classification model 
(the intermediate variables, which modelling technique is used and the
prediction model) is printed.
}
\value{
none
}


\author{Andrea Peters <Peters.Andrea@imbe.imed.uni-erlangen.de>}


\seealso{\code{\link{print.summary.inclass}}}


\keyword{misc}

\eof
\name{varset}
\alias{varset}

\title{Simulation Model}
\description{
Three sets of variables are calculated: explanatory, intermediate and response variables. 
}
\usage{
varset(N, sigma=0.1, theta=90, threshold=0, u=1:3)
}

\arguments{
  \item{N}{number of simulated observations.}
  \item{sigma}{standard deviation of the error term.}
  \item{theta}{angle between two u vectors.}
  \item{threshold}{cutpoint for classifying to 0 or 1.}
  \item{u}{starting values.}
}
\details{

For each observation values of two explanatory variables \eqn{x = (x_1, x_2)^{\top}} and of two responses \eqn{y = (y_1, y_2)^{\top}} are simulated, following the formula:
\deqn{
y = U*x+e = ({u_1^{\top} \atop u_2^{\top}})*x+e
}
where x is a realisation of a standard normal random variable and e is generated by a normal variable with standard deviation \code{sigma}. U is a 2*2 matrix, where 
\deqn{
u_1 = ({u_{1, 1} \atop u_{1, 2}}),
u_2 = ({u_{2, 1} \atop u_{2, 2}}),
||u_1|| = ||u_2|| = 1,
}
i.e. a matrix of two normalised vectors.
}
\value{
  A list containing the following components
  \item{explanatory}{N*2 matrix of 2 explanatory variables.}
  \item{intermediate}{N*2 matrix of 2 intermediate variables.}
  \item{response}{response vector with values 0 or 1.}
}

\references{
David J. Hand, Hua Gui Li, Niall M. Adams (2001),
Supervised classification with structured class definitions.
\emph{Computational Statistics \& Data Analysis} \bold{36},
209--225.
}

\author{Andrea Peters <Peters.Andrea@imbe.imed.uni-erlangen.de>}

\examples{

theta90 <- varset(N = 1000, sigma = 0.1, theta = 90, threshold = 0)
theta0 <- varset(N = 1000, sigma = 0.1, theta = 0, threshold = 0)
par(mfrow = c(1, 2))
plot(theta0$intermediate)
plot(theta90$intermediate)

}

\keyword{misc}
\eof
