noverlap.for            package:noverlap            R Documentation

_M_i_n_i_m_a_l _n_u_m_b_e_r _o_f _o_v_e_r_l_a_p _b_a_s_e_d _o_n _a_n _a_f_f_i_n_e _h_y_p_e_r_p_l_a_n_e _i_n
_b_i_n_a_r_y _r_e_g_r_e_s_s_i_o_n (_c_o_m_p_l_e_t_e _o_r _q_u_a_s_i_c_o_m_p_l_e_t_e _s_e_p_a_r_a_t_i_o_n)

_D_e_s_c_r_i_p_t_i_o_n:

     Applies the regression depth method (RDM) to binary regression.
     This method approximately computes the minimal number of data
     points that have to be removed from a data set such that the
     remaining data set has NO overlap, i.e. such that the remaining
     data set has complete separation or quasi-complete separation. If
     NOVERLAP=0 then the maximum likelihood estimates for the
     parameter vector in many binary regression models such as
     logistic regression or probit regression do not exist.

_U_s_a_g_e:

     noverlap.for(Z,NDIR=10000,PLOT=FALSE)

_A_r_g_u_m_e_n_t_s:

       Z: The data set Z has to be a matrix with ncol(Z)-1 <= nrow(Z)
          <= 10000. The first ncol(Z)-1 columns of Z are the design
          matrix X. The last column of Z is the binary response vector
          y (0/1).

    NDIR: Maximum number of directions (integer); the default is 10000

    PLOT: logical; if TRUE, a plot is drawn (the default is FALSE)

_V_a_l_u_e:

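     A list. The entries below appear to restate the arguments and
     their default values:

       Z: data matrix

    NDIR: 10000 directions

    PLOT: FALSE, i.e. make no plot

     Note: the examples below additionally read the components
     'NOVERLAP', 'COEFFICIENTS', 'NSIN' and 'DETAILS' from the returned
     list; these components are not described in this section.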

_A_u_t_h_o_r(_s):

     Andreas Christmann, Peter J. Rousseeuw
     Christmann@statistik.uni-dortmund.de

_R_e_f_e_r_e_n_c_e_s:

     Christmann, A., Rousseeuw, P.J. (2001). Measuring overlap in
     logistic regression. _Computational Statistics and Data Analysis_,
     *37*, 65-75.

     Christmann, A. (2002). _Classification based on the support vector
     machine and on regression depth._ In: Y. Dodge (Ed.): Statistical
     Data Analysis Based on the L1-Norm and Related Methods. Series:
     Statistics for industry and technology. Birkhaeuser, Basel, pp.
     341-352.

     Christmann, A., Fischer, P., Joachims, T. (2002). Comparison
     between various regression depth methods and the support vector
     machine to approximate the minimum number of misclassifications.
     _Computational Statistics_, *17*, 273-287.

_S_e_e _A_l_s_o:

     'noverlap1.for'

_E_x_a_m_p_l_e_s:

     data(Z2)
     noverlap.for(Z2)
     noverlap.for(Z2,NDIR=100000)
     # x11()
     postscript(file="tmp1.ps")
     par(mfrow=c(2,1))
     noverlap.for(Z2,NDIR=10000,PLOT=TRUE)
     tmp <- noverlap.for(Z2)
     tmp$NOVERLAP
     tmp$COEFFICIENTS
     tmp$NSIN
     tmp$DETAILS
     Z3 <- as.data.frame(Z2)
     names(Z3) <- c("x1","x2","y")
     plot(x2 ~ x1, data=Z3,pch=as.character(y),main="Scatterplot")
     abline(c(0,1.5),col="blue")
     points(Z3[2,1],Z3[2,2],pch=as.character(Z3[2,3]),col="red")
     dev.off()

     # NO OVERLAP: maximum likelihood estimates do NOT exist
     data(Z1)
     Z1
     # X11()
     postscript(file="tmp2.ps")
     noverlap.for(Z1)
     tmp <- noverlap.for(Z1)
     tmp$NOVERLAP
     tmp$COEFFICIENTS
     tmp$NSIN
     tmp$DETAILS
     Z3 <- as.data.frame(Z1)
     names(Z3) <- c("x1","y")
     plot(y ~ x1, data=Z3,pch=as.character(y),main="Scatterplot")
     summary(glm(y ~ x1, data=Z3, family=binomial(link=logit), trace=TRUE, maxit=30))
     dev.off()

     # NO OVERLAP: maximum likelihood estimates in the logistic regression model
     # do NOT exist for the banknotes data set
     data(Banknotes)
     Banknotes
     # X11()
     postscript(file="tmp3.ps")
     tmp <- noverlap.for(Banknotes,PLOT=TRUE)
     dev.off()
     tmp$NOVERLAP
     tmp$COEFFICIENTS
     Z3 <- as.data.frame(Banknotes)
     names(Z3) <- c("x1","x2", "x3", "x4", "x5", "x6","y")
     summary(glm(y ~ x1+x2+x3+x4+x5+x6, data=Z3, family=binomial(link=logit), trace=TRUE, maxit=30))

