### Name: aregImpute
### Title: Multiple Imputation using Additive Regression, Bootstrapping,
###   and Predictive Mean Matching
### Aliases: aregImpute print.aregImpute plot.aregImpute
### Keywords: smooth regression multivariate methods models

### ** Examples

# Multiple imputation and estimation of variances and covariances of
# regression coefficient estimates accounting for imputation
# Example 1: large sample size, much missing data, no overlap in
# NAs across variables
set.seed(3)
x1 <- factor(sample(c('a','b','c'),1000,TRUE))
x2 <- (x1=='b') + 3*(x1=='c') + rnorm(1000,0,2)
x3 <- rnorm(1000)
y  <- x2 + 1*(x1=='c') + .2*x3 + rnorm(1000,0,2)
orig.x1 <- x1[1:250]
orig.x2 <- x2[251:350]
x1[1:250] <- NA
x2[251:350] <- NA
d <- data.frame(x1,x2,x3,y)

# Use 100 imputations to better check against individual true values
f <- aregImpute(~y + x1 + x2 + x3, n.impute=100, data=d)
f
par(mfrow=c(2,1))
plot(f)
modecat <- function(u) {
 tab <- table(u)
 as.numeric(names(tab)[tab==max(tab)][1])
}
table(orig.x1,apply(f$imputed$x1, 1, modecat))
par(mfrow=c(1,1))
plot(orig.x2, apply(f$imputed$x2, 1, mean))
fmi <- fit.mult.impute(y ~ x1 + x2 + x3, lm, f, 
                       data=d)
sqrt(diag(Varcov(fmi)))
fcc <- lm(y ~ x1 + x2 + x3)
summary(fcc)   # SEs are larger than from mult. imputation

# Example 2: Very discriminating imputation models,
# x1 and x2 have some NAs on the same rows, smaller n
set.seed(5)
x1 <- factor(sample(c('a','b','c'),100,TRUE))
x2 <- (x1=='b') + 3*(x1=='c') + rnorm(100,0,.4)
x3 <- rnorm(100)
y  <- x2 + 1*(x1=='c') + .2*x3 + rnorm(100,0,.4)
orig.x1 <- x1[1:20]
orig.x2 <- x2[18:23]
x1[1:20] <- NA
x2[18:23] <- NA
#x2[21:25] <- NA
d <- data.frame(x1,x2,x3,y)
n <- naclus(d)
plot(n); naplot(n)  # Show patterns of NAs
# 100 imputations to study them; normally use 5 or 10
f  <- aregImpute(~y + x1 + x2 + x3, n.impute=100, defaultLinear=TRUE, data=d)
par(mfrow=c(2,3))
plot(f, diagnostics=TRUE, maxn=2)
# Note: diagnostics=TRUE makes graphs similar to those made by:
# r <- range(f$imputed$x2, orig.x2)
# for(i in 1:6) {  # use 1:2 to mimic maxn=2
#   plot(1:100, f$imputed$x2[i,], ylim=r,
#        ylab=paste("Imputations for Obs.",i))
#   abline(h=orig.x2[i],lty=2)
# }

table(orig.x1,apply(f$imputed$x1, 1, modecat))
par(mfrow=c(1,1))
plot(orig.x2, apply(f$imputed$x2, 1, mean))

fmi <- fit.mult.impute(y ~ x1 + x2, lm, f, 
                       data=d)
sqrt(diag(Varcov(fmi)))
fcc <- lm(y ~ x1 + x2)
summary(fcc)   # SEs are larger than from mult. imputation

# Study relationship between smoothing parameter for weighting function
# (multiplier of mean absolute distance of transformed predicted
# values, used in tricube weighting function) and standard deviation
# of multiple imputations.  SDs are computed from average variances
# across subjects.  match="closest" same as match="weighted" with
# small value of fweighted.
# This example also shows problems with predicted mean
# matching almost always giving the same imputed values when there is
# only one predictor (regression coefficients change over multiple
# imputations but predicted values are virtually 1-1 functions of each
# other)

set.seed(23)
x <- runif(200)
y <- x + runif(200, -.05, .05)
r <- resid(lsfit(x,y))
rmse <- sqrt(sum(r^2)/(200-2))   # sqrt of residual MSE

y[1:20] <- NA
d <- data.frame(x,y)
f <- aregImpute(~ x + y, n.impute=10, match='closest', data=d)
sd <- sqrt(mean(apply(f$imputed$y, 1, var)))

ss <- c(0, .01, .02, seq(.05, 1, length=20))
sds <- ss; sds[1] <- sd

for(i in 2:length(ss)) {
  f <- aregImpute(~ x + y, n.impute=10, fweighted=ss[i])
  sds[i] <- sqrt(mean(apply(f$imputed$y, 1, var)))
}

plot(ss, sds, xlab='Smoothing Parameter', ylab='SD of Imputed Values',
     type='b')
abline(v=.2,  lty=2)  # default value of fweighted
abline(h=rmse, lty=2)  # root MSE of residuals from linear regression

## Not run: 
##D # Do a similar experiment for the Titanic dataset
##D getHdata(titanic3)
##D h <- lm(age ~ sex + pclass + survived, data=titanic3)
##D rmse <- summary(h)$sigma
##D set.seed(21)
##D f <- aregImpute(~ age + sex + pclass + survived, n.impute=10,
##D                 data=titanic3, match='closest')
##D sd <- sqrt(mean(apply(f$imputed$age, 1, var)))
##D 
##D ss <- c(0, .01, .02, seq(.05, 1, length=20))
##D sds <- ss; sds[1] <- sd
##D 
##D for(i in 2:length(ss)) {
##D   f <- aregImpute(~ age + sex + pclass + survived, data=titanic3,
##D                   n.impute=10, fweighted=ss[i])
##D   sds[i] <- sqrt(mean(apply(f$imputed$age, 1, var)))
##D }
##D 
##D plot(ss, sds, xlab='Smoothing Parameter', ylab='SD of Imputed Values',
##D      type='b')
##D abline(v=.2,   lty=2)  # default value of fweighted
##D abline(h=rmse, lty=2)  # root MSE of residuals from linear regression
## End(Not run)



