.packageName <- "vbmp"  ## package name marker used by R's installation machinery
`computeKernel` <-
function(X1, X2, kName, kParams){
    ##
    ##	X1	     N1 x d data matrix
    ##	X2	     N2 x d data matrix
    ##  kName    string name of kernel function
    ##           Prefix with '+' to add bias (e.g. '+gauss')
    ##           Prefix with '*' to add bias and augmenting with X2 covariate (e.g. '*gauss')
    ##  kParams  vector of kernel parameters; a single value is recycled to d.
    ##           For '*' kernels it may hold 2*d values: d rescaling factors
    ##           for the augmented covariates followed by the d kernel params.
    ##
    ##	returns N1 x N2 (or N2+1 if bias, or N2+ncol(X2)+1 for X2 augmentation) design matrix.
    ##			The first column comprises 1's if a bias is used.
    ##      The subsequent ncol(X2) columns comprise the X2 covariate if X2
    ##      augmentation is used.
    N1 <- nrow(X1); d <- ncol(X1);
    N2 <- nrow(X2);
    if (length(kParams) == 1) kParams <- rep(kParams, d);
    varParams <- NULL;
    if (substr(kName, 1, 1) == '+') {
        ## '+' prefix: prepend a constant bias column
        b <- matrix(1., nrow=N1, ncol=1);
        kName <- substr(kName, 2, nchar(kName));
    } else {
        if (substr(kName, 1, 1) == '*') {
            ## '*' prefix: bias column plus the raw X1 covariates
            b <- as.matrix(cbind(rep(1., N1), X1));
            kName <- substr(kName, 2, nchar(kName));
            if (length(kParams) == d) {
                varParams <- c(1., rep(1, d));
            } else {
                ## rescaling the augmented component differently from the kernel
                varParams <- c(1., kParams[1:d]);
                kParams <- kParams[(d+1):(2*d)];
            }
        } else b <- NULL;
    }
    ## 'polyP' --> Polynomial of degree P, e.g. "poly5"
    if (nchar(kName) >= 4 && substr(kName, 1, 4) == 'poly') {
        p <- as.numeric(substr(kName, 5, nchar(kName)));
        kName <- 'poly';
    }
    ## 'hpolyP' --> Homogeneous Polynomial of degree P
    if (nchar(kName) >= 5 && substr(kName, 1, 5) == 'hpoly') {
        p <- as.numeric(substr(kName, 6, nchar(kName)));
        kName <- 'hpoly';
    }
    ## Scale column j of both inputs by 1/sqrt(kParams[j]).
    ## FIX: the previous t(as.matrix(apply(X, 1, ...))) construction returned
    ## a transposed (1 x N) matrix when d == 1 (vector inputs are supported by
    ## vbmp(), which wraps them into 1-column matrices); sweep() preserves the
    ## N x d orientation for every input shape.
    X1 <- sweep(X1, 2, sqrt(kParams), "/");
    X2 <- sweep(X2, 2, sqrt(kParams), "/");
    ## FIX: 'eta' (extra length-scale used by the cubic/r/neighbour kernels)
    ## was referenced but never defined anywhere in this file, so those
    ## kernels always failed with "object 'eta' not found".  The inputs are
    ## already rescaled by kParams above, so a unit scale preserves the
    ## intended behaviour of the other kernels and makes these three usable.
    eta <- 1.;
    if (kName == 'gauss') {                ##	Gaussian
        PHI <- exp(-distSqrd(X1, X2));
    } else if (kName == 'iprod') {         ##	Inner product
        PHI <- tcrossprod(X1, X2);
    } else if (kName == 'tps') {           ##	'Thin-plate' spline
        r2  <- distSqrd(X1, X2);
        PHI <- 0.5 * r2 * log(sqrt(r2));
    } else if (kName == 'cauchy') {        ##	Cauchy (heavy tailed) in distance
        r2  <- distSqrd(X1, X2);
        PHI <- 1./(1. + r2);
    } else if (kName == 'cubic') {         ##	Cube of distance
        r2  <- eta * distSqrd(X1, X2);
        PHI <- r2 * sqrt(r2);
    } else if (kName == 'r') {             ##	Distance
        PHI <- sqrt(eta) * sqrt(distSqrd(X1, X2));
    } else if (kName == 'neighbour') {     ## Neighbourhood indicator
        r2  <- eta * distSqrd(X1, X2);
        PHI <- ifelse(r2 < 1, 1, 0);
    } else if (kName == 'laplace') {       ##	Laplacian
        r2  <- distSqrd(X1, X2);
        PHI <- exp(-sqrt(r2));
    } else if (kName == 'poly') {          ##  polynomial (<x1,x2> + 1)^p
        PHI <- (tcrossprod(X1, X2) + 1)^p;
    } else if (kName == 'hpoly') {         ##  homogeneous polynomial <x1,x2>^p
        PHI <- tcrossprod(X1, X2)^p;
    } else if (kName == 'lsp') {           ##  'linear' spline kernel
        PHI <- 1.;
        for (i in 1:d) {
            XX  <- X1[, i] %o% X2[, i];
            Xx1 <- X1[, i] %o% rep(1, N2);
            Xx2 <- rep(1, N1) %o% X2[, i];
            ## NOTE(review): the original variable was named 'minXX' but it
            ## holds the elementwise MAX of the two coordinates; the classic
            ## linear-spline kernel uses the min -- behaviour kept as released.
            maxXX <- pmax(Xx1, Xx2);
            PHI <- PHI * (1. + XX + XX*maxXX - ((Xx1 + Xx2)/2.)*(maxXX^2) +
                (maxXX^3)/3.);
        }
    }
    if (!is.null(b)) {   ## clearer than any(b); behaviour identical for the 0/1 matrix
        ## rescale bias/augmentation columns when '*' supplied rescaling factors
        if (length(varParams) > 0) b <- sweep(b, 2, varParams, "/");
        PHI <- as.matrix(cbind(b, PHI));
    }
    PHI;
}

`covParams` <-
function(obj) {
    ## Final (most recent) posterior mean of the covariance function
    ## parameters from a fitted "VBMP.obj": the last row of the theta history.
    ## FIX: the object created by vbmp() stores the parameter history under
    ## the name 'theta' (see the structure() call in vbmp), not 'THETA';
    ## '$' matching is case-sensitive, so obj$THETA returned NULL and made
    ## nrow() fail.  plotDiagnostics() already uses obj$theta.
    obj$theta[nrow(obj$theta), ];
}

`distSqrd` <-
function (X, Y) {
    ## Squared Euclidean distance between every row of X and every row of Y,
    ## returned as an nrow(X) x nrow(Y) matrix.  Uses the expansion
    ## ||x - y||^2 = ||x||^2 + ||y||^2 - 2<x, y>; abs() guards against tiny
    ## negative values produced by floating-point cancellation.
    sqNormX <- rowSums(X^2);
    sqNormY <- rowSums(Y^2);
    abs(outer(sqNormX, rep(1, nrow(Y))) + outer(rep(1, nrow(X)), sqNormY) -
        2. * tcrossprod(X, Y));
}

`lowerBound` <-
function(obj) {
    ## Last recorded value of the variational lower bound for a fitted object.
    nIts <- length(obj$lowerBound);
    obj$lowerBound[nIts];
}

`plotDiagnostics` <-
function(obj) {
    ## Draw convergence diagnostics for a fitted "VBMP.obj": traces of the
    ## covariance parameters (when a multi-row history was recorded), the
    ## lower bound, the predictive likelihood and the out-of-sample accuracy.
    if (nrow(obj$theta) > 1) {
        par(mfrow=c(2, 2));
        ## covariance parameter evolution, one coloured line per parameter
        logTheta <- matrix(as.numeric(safeLog(obj$theta)),
            ncol=ncol(obj$theta), nrow=nrow(obj$theta));
        plot(NULL, type="n", xlim=c(1, nrow(logTheta)), xlab="Iteration",
            main="Covariance Parameters",
            ylim=c(min(logTheta)-1e-6, max(logTheta)+1e-6), ylab="log(theta)");
        for (j in seq_len(ncol(logTheta))) {
            lines(logTheta[, j], lty="dotdash", col=j);
        }
    } else {
        par(mfrow=c(1, 3));
    }
    ## lower bound evolution
    plot(obj$lowerBound, type="l", main="Lower Bound", lty="dotdash",
        xlab="Iteration", ylab="Lower bound");
    ## predictive likelihood evolution
    plot(obj$PL, type="l", main="Predictive Likelihood", lty="dotdash",
        xlab="Iteration", ylab="PL");
    ## accuracy (100 minus percent test error)
    plot((100-obj$testErr), type="l", lty="dotdash", xlab="Iteration",
        ylab="Accuracy %", main="Out-of-Sample Prediction Correct");
    par(mfrow=c(1,1))
}

`predClass` <-
function(obj) {
    ## Predicted class label for each test point: the column index of the
    ## largest posterior probability in each row of Ptest (ties -> first max).
    P <- obj$Ptest;
    as.numeric(vapply(seq_len(nrow(P)),
        function(i) which.max(P[i, ]), integer(1)));
}

`predError` <-
function(obj) {
    ## Most recent out-of-sample 0-1 error, rescaled from percent to [0, 1].
    nIts <- length(obj$testErr);
    obj$testErr[nIts] / 100.;
}

`predLik` <-
function(obj) {
    ## Most recent mean predictive likelihood recorded on the test set.
    nIts <- length(obj$PL);
    obj$PL[nIts];
}

`printTrace` <-
function(mystr, sFile, InfoLevel, bAppend=TRUE, bStop=FALSE){
    ## Emit a tracing message when InfoLevel > 0, appended to the file sFile
    ## when one is given, otherwise written to the console; optionally abort
    ## with the same message when bStop is TRUE.  No-op when InfoLevel <= 0.
    if (InfoLevel <= 0) return(invisible(NULL));
    if (length(sFile) > 0) {
        cat(mystr, file=sFile, sep="\n", fill=TRUE, labels=NULL,
            append=bAppend);
    } else {
        cat(mystr, sep="\n");
    }
    flush.console();
    if (bStop) stop(mystr);
}

`rexponential` <-
function(lambda) {
    ## One exponential draw per element of lambda, each with the matching
    ## rate.  (Equivalent to -log(runif(n))/lambda via inverse transform.)
    n <- length(lambda);
    rexp(n, rate=lambda);
}

`safeLog` <-
function(x) {
    ## log() with the input clamped to [1e-200, 1e+200] so that extreme
    ## values never produce -Inf/Inf and break downstream arithmetic.
    thresh <- 1e-200;  ## numerical guard threshold
    x[x < thresh]    <- thresh;
    x[x > 1./thresh] <- 1./thresh;
    log(x);
}

`safeLogDet` <-
function(x) {
    ## Numerically guarded log-determinant of a square matrix
    ## (determinant clamped through safeLog before taking the log).
    detX <- det(x);
    safeLog(detX);
}

`safeNormCDF` <-
function(x) {
    ## Standard normal CDF with the lower tail clamped at -10 so the result
    ## never becomes numerically indistinguishable from zero.
    thresh <- 10;  ## numerical guard threshold
    x[x < -thresh] <- -thresh;
    pnorm(x);
}

`safeNormPDF` <-
function(x) {
    ## Standard normal density with the input clamped to [-35, 35] to avoid
    ## numerical underflow deep in the tails.
    thresh <- 35;  ## numerical guard threshold
    x[x < -thresh] <- -thresh;
    x[x >  thresh] <-  thresh;
    dnorm(x);
}

`tmean` <-
function(m, indexMax, Nsamps) {
    ## This function computes the mean of the truncated Gaussian as detailed in
    ## the paper equations (5) & (6).
    ##
    ##  m        vector of the Kc posterior mean values for one data point
    ##  indexMax index of the observed (target) class, whose latent variable
    ##           must dominate the others under the truncation
    ##  Nsamps   number of Monte Carlo samples used to estimate the expectations
    ##
    ##  returns a "tmean.obj": tm = truncated-Gaussian mean per class,
    ##  z = normalising constant (probability mass of the truncation region).
    ## Monte Carlo draws, shared by all the expectations below
    u <- rnorm(Nsamps);
    Kc <- length(m);
    ## differences m[indexMax] - m[k] for every class k
    t.class   <- rep(m[indexMax], Kc) - m;
    tr.class  <- t.class;
    t.class   <- t.class[-indexMax];   # drop the zero entry of the target class
    ## s ( Nsamps x Kc-1 )
    s  <- matrix(rep(u, Kc-1),byrow=F, nrow=Nsamps, ncol=Kc-1) +
        t(matrix(rep(t.class, Nsamps), byrow=F, ncol=Nsamps));
    ## z: MC estimate of E_u[ prod_k Phi(u + m[indexMax] - m[k]) ];
    ## the degenerate-shape branch avoids apply() collapsing a single row/col
    if (is.null(dim(s)) || ncol(s) == 1 || nrow(s) == 1) {
        z <- mean(safeNormCDF(s));
    } else {
        z <- mean( as.numeric(apply(safeNormCDF(s), 1, prod)));
    }
    if (Kc > 2) {
        tm <- rep(NA, Kc);
        for (r in 1:Kc) {
            ## sr ( Nsamps x Kc )
            sr <- matrix(rep(u, Kc), byrow=F, nrow=Nsamps) +
                t(matrix(rep(tr.class,Nsamps), byrow=F, ncol=Nsamps));
            sr <- sr[, -c(r, indexMax)];   # exclude class r and the target class
            if (is.null(dim(sr)) || ncol(sr) == 1 || nrow(sr) == 1) {
                snr <- as.numeric(safeNormCDF(sr));
            } else {
                snr <- as.numeric(apply(safeNormCDF(sr), 1, prod));
            }
            ## MC correction term for class r
            nr <- mean(safeNormPDF(u + m[indexMax] - m[r]) * snr );
            if (r == indexMax) tm[r] <- 0.
            else               tm[r] <- m[r] - nr/z;
        }
        ## target-class mean is set so that the total mean is conserved
        ## (tm[indexMax] was 0 inside the loop, so sum(tm) is well defined)
        tm[indexMax] <- sum(m) - sum(tm);
    } else {
        stop('Multinomial only code !!!');
    }
    structure( list( tm=tm, z=z),	class="tmean.obj");
}

`varphiUpdate` <-
function(X, M, psi, nSamps, sKernelType) {
## This computes the posterior mean of the covariance hyperparameters
## using a simple importance sampler.
##
##  X            N x d feature matrix
##  M            N x Kc matrix of posterior mean values
##  psi          exponential proposal rates, one per covariance parameter
##  nSamps       number of importance samples
##  sKernelType  kernel name, forwarded to computeKernel
##
##  returns the importance-weighted mean of the sampled parameter vectors.
    nParams <- length(psi);
    ## FIX(idiom/perf): preallocate the sample matrix and weight vector
    ## instead of growing V with rbind() and W with c() on every iteration
    ## (O(nSamps^2) copying).  The computed values are unchanged.
    V <- matrix(NA_real_, nrow=nSamps, ncol=nParams);
    W <- numeric(nSamps);
    for (i in seq_len(nSamps)) {
        ## draw a candidate parameter vector from the exponential proposal
        varphi <- rexponential(psi);
        ## kernel matrix with unit jitter added on the diagonal
        PHI <- computeKernel(X, X, sKernelType, varphi) + diag(1., nrow(X));
        invPHI <- chol2inv(chol(PHI)); # solve(PHI);
        V[i, ] <- varphi;
        ## unnormalised importance weight: prod_k exp(-0.5 * m_k' PHI^-1 m_k)
        W[i] <- as.numeric(prod(diag(exp(-0.5*crossprod(M, invPHI)%*% M))));
    }
    ## normalise the weights and return the weighted average of the samples
    ## (W of length nSamps recycles column-wise, scaling row i of V by W[i])
    W <- W/sum(W);
    colSums(V * W);
}

`vbmp` <-
function(X, t.class, X.TEST, t.class.TEST, theta,  control = list()) {
## Variational Bayes multinomial probit regression with Gaussian process
## priors: fits on (X, t.class), monitors out-of-sample performance on
## (X.TEST, t.class.TEST), and returns a "VBMP.obj".
## X - Feature matrix for parameter 'estimation' - of dimension N x Kd
## t.class - The corresponding target values - class labels
## X.TEST - Feature matrix to compute out-of-sample (test) prediction errors and likelihoods
## t.TEST - Corresponding target values for test data
## theta - The covariance function parameters - e.g. scaling coefficients for each dimension
## control - optional list; entries override the defaults in 'con' below:
## bThetaEstimate = if covariance parameter estimation switched on - (FALSE if switched off)
## maxIts - the maximum number of variational EM steps to take
## sKernelType - Select from Gaussian, Polynomial or Linear Inner product
## Thresh - Convergence threshold on marginal likelihood lower-bound
## InfoLevel - 0 to suppress tracing ( > 0  to print different levels
##             of monitoring information)
    # ------------ check data dimension
    if (is.null(dim(X)))  X <- matrix(X,nrow=length(X),ncol=1);
    if (is.null(dim(X.TEST))){
        X.TEST <- matrix(X.TEST, ncol=ncol(X));
    } else if (ncol(X.TEST) != ncol(X)) stop("Number of cols differ between X and X.TEST");

    # ------------ check parameters    
    # defaults; any entry supplied through 'control' overrides its default
    con <- list(InfoLevel=0, sFILE.TRACE=NULL, bThetaEstimate=FALSE, 
        sKernelType="gauss", maxIts=50, Thresh=1e-4, tmpSave=NULL,
        nSampsTG=1000, nSampsIS=1000, nSmallNo=1e-10, parGammaSigma=1e-6, 
        parGammaTau=1e-6, bMonitor=FALSE, bPlotFitting=FALSE);
    con[names(control)] <- control;
    if (con$bPlotFitting) con$bMonitor <- TRUE;
    ## map factor/character labels of train and test jointly onto 1..Kc codes
    if (is.factor(t.class) || is.character(t.class)){
        temp <- as.numeric(factor(c(as.character(t.class), as.character(t.class.TEST))));
        t.class <- temp[1:length(t.class)];
        t.class.TEST <- temp[(length(t.class)+1):length(temp)];
        rm(temp);
    }
   if (any(theta<=0)) stop("theta params (scale) must be > 0"); 
## ------------------------------------------------------------------------------
    G.LOWER.BOUND.DEFAULT <-  -1e-3; ## default lower bound value
    G.DIFF.DEFAULT        <-  1e100; ## Monitor difference in marginal likelihood
   
    printTrace(paste("Starting at: ", date(), "...... \n"),
        con$sFILE.TRACE, con$InfoLevel, bAppend=FALSE);
    ## when the test set IS the training set the kernel matrices are reused
    if (nrow(X) == nrow(X.TEST)) {
        b.traintest <- all(X == X.TEST) && all(t.class == t.class.TEST);
    } else  b.traintest <- FALSE;
    Kc <- max(t.class);            ## Identify the number of classes
    N <- nrow(X); Kd <- ncol(X);   ## Get number of samples and dimension of data
    ## randomly initialise M,Y matrix (see paper)
    Y <- matrix(rnorm(N*Kc), nrow=N, ncol=Kc);
    M <- matrix(runif(N*Kc), nrow=N, ncol=Kc);
    ## diagonal matrix of the covariance params for passing to kernel function
    #Theta <- diag(theta);
    ## Set hyper-params for covariance params to one. In this application
    ## I have used a simple exponential distribution over the theta values so
    ## there is only a mean value required psi.
    psi   <- rep(1., length(theta));
    In    <- diag(1., N);          ## N x N dimensional identity matrix
    Ic    <- diag(1., Kc);         ## Kc x Kc dimensional identity matrix
    ## Create the covariance (kernel) matrix and add some small jitter on diagonal
    PHI <- computeKernel(X, X, con$sKernelType, theta) + In * con$nSmallNo;
    ## precompute the inverse matrices required
    invPHI <- chol2inv(chol(PHI + In));
    ## Ki = PHI (PHI + I)^-1, used below to map Y onto the posterior means M
    Ki <- PHI %*% invPHI;
    trace.Ki <- sum(diag(Ki));      
    logDetKi  <- safeLogDet(Ki);   
    logDetPHI <- safeLogDet(PHI);  
    ## Collect all the posterior mean values of the covariance params
    THETA <- matrix(theta, nrow=1, ncol=length(theta))
    ## Collect all the values of the lower-bound
    lowerBound <- G.LOWER.BOUND.DEFAULT;
    ## Monitor difference in marginal likelihood
    scan.diff <- G.DIFF.DEFAULT;
    ## Collect all values of the predictive likelihood
    PL <- NULL;
    ## Collect all values of the percentage predictions incorrect
    testErr <- NULL;    
    ## -------------------------------------------------------------------------
    ## Main loop
    ## -------------------------------------------------------------------------
    its     <- 0;         ## Initialise iteration number
    bconverged <- FALSE;
    while ((its < con$maxIts) && (! bconverged)) {
        its <- its + 1;
        printTrace(paste(its, "> update the columns of the M-matrix ",
            "- equation (8) of the paper"), con$sFILE.TRACE, con$InfoLevel - 1);
        ## - formula (4.6)
        for (k in 1:Kc) M[, k] <- Ki %*% Y[, k];
        printTrace(paste(its, "> update the rows of the Y-matrix",
            "- equation (5) & (6) of the paper "), con$sFILE.TRACE, con$InfoLevel - 1);
        scan.lower.bound <- 0.;
        scan.tm <- NULL;
        ## row-by-row truncated-Gaussian update; each safeLog(z) contributes
        ## the data term of the lower bound
        for (n in 1:N) {
            scan.tm <- tmean(M[n,], t.class[n], con$nSampsTG);
            if (! is.null(scan.tm)) {
                Y[n,] <- scan.tm$tm;
                scan.lower.bound <- scan.lower.bound + safeLog(scan.tm$z);
                rm(scan.tm); scan.tm <- NULL;
            } else stop("tmean error.....");
        }
        if (con$bThetaEstimate) {
            printTrace(paste(its, "> update the posterior mean estimates of the",
                "covariance function parameters and hyper-params"),
                con$sFILE.TRACE, con$InfoLevel - 1);
            ## - formula (4.7)  pag 1796 
            theta <- varphiUpdate(X, M, psi, con$nSampsIS, con$sKernelType);
            #Theta <- diag(theta);
            ## - formula (4.8)  pag 1797
            psi   <- (con$parGammaSigma + 1)/(con$parGammaTau + theta);
            if (con$bMonitor) THETA <- rbind(THETA, theta);
            if (con$bPlotFitting) {
                if (its == 1) par(mfrow=c(2, 2));                
                scov <- matrix(as.numeric(safeLog(THETA)), ncol=ncol(THETA),
                    nrow=nrow(THETA));
                plot(NULL, type="n", xlim=c(1,nrow(scov)), xlab="Iteration",
                     main="Covariance Params Posterior Mean Values",
                     ylim=c(min(scov)-1e-6, max(scov)+1e-6), ylab="log(theta)");
                for (kkk in 1:ncol(scov)) {
                    lines(scov[, kkk],  lty="dotdash", col=kkk);
                }
            }
            ## refresh all kernel-dependent quantities after the theta update
            PHI <- computeKernel(X, X, con$sKernelType, theta);
            invPHI <- chol2inv(chol(PHI + In));
            Ki <- PHI %*% invPHI;
            trace.Ki  <- sum(diag(Ki));
            logDetKi  <- safeLogDet(Ki);
            logDetPHI <- safeLogDet(PHI);
        }
        printTrace(paste(its, "> compute the lower-bound"), con$sFILE.TRACE,
            con$InfoLevel - 1);
        scan.lower.bound <- scan.lower.bound +
            - 0.5 * Kc * trace.Ki +
            - 0.5 * sum(diag(crossprod(M, invPHI) %*% M)) +
            - 0.5 * Kc * sum(diag(invPHI)) +
            - 0.5 * Kc * logDetPHI  +
            + 0.5 * Kc * logDetKi +
            - 0.5 * Kc * N * safeLog(2*pi) + 0.5*N*Kc + 0.5*N*safeLog(2*pi);
        ## update the development of the bound at every iteration
        lowerBound <- c(lowerBound, scan.lower.bound);
        ## replace the arbitrary starting value with the first real bound
        if (its == 2) lowerBound[1] <- lowerBound[2];
        if (con$bPlotFitting) {
            if (its == 1 && (!con$bThetaEstimate)) par(mfrow=c(1,3));
            plot(lowerBound, type="l", main="Lower Bound",
                lty="dotdash", xlab="Iteration", ylab="Lower bound");
        }
        ## Monitoring convergence: percentage change of the lower bound
        scan.diff <- abs(100*(scan.lower.bound - lowerBound[its])/lowerBound[its]);
        bconverged <- (scan.diff < con$Thresh);
        if (con$bMonitor || bconverged || its == con$maxIts || con$bPlotFitting) {
            ## --------------------------------------------------------------
            printTrace(paste(its, "> Compute the predictive posteriors on the test set"),
                con$sFILE.TRACE, con$InfoLevel - 1);
            ## Compute the predictive posteriors on the test set and
            ## the associated likelihood and test errors
            Ntest <- nrow(X.TEST);     ## Number of test points
            ## Create test covariance matrices required to obtain predictive
            ## mean and variance values
            if (!b.traintest) {
                PHItest <- computeKernel(X, X.TEST, con$sKernelType, theta);
                PHItestSelf <- computeKernel(X.TEST, X.TEST, con$sKernelType, theta);
            } else PHItest <- PHItestSelf <- PHI;
            ## ---------------------------------------------------------------
            ## Computes the predictive posteriors as defined in Section 4.5 of the paper.
            ## first two equations at page 1798
            Res <- t(crossprod(Y, invPHI)%*%PHItest);
            S <- (diag(PHItestSelf) - diag(crossprod(PHItest, invPHI)%*%PHItest));
            predictive.likelihood <- 0.;
            if (Kc > 2) {
                ## Monte Carlo estimate of the class membership probabilities
                Ptest <- matrix(1., nrow=Ntest, ncol=Kc);
                u     <- rnorm(con$nSampsTG);
                for (n in 1:Ntest) {
                    for (i in 1:Kc) {
                        pp <- rep(1., con$nSampsTG);
                        for (j in ((1:Kc)[-i])) {
                            pp <- pp * safeNormCDF(u + (Res[n, i] -
                                Res[n, j])/(sqrt(1.+S[n])));
                        }
                        Ptest[n, i] <- mean(pp);
                    }
                }
            } else {
                stop("Multinomial only code....")
            }
            Ptest <- t(apply(Ptest, 1, function(x) {x/sum(x)})); ## JUST IN CASE
            ## Computes the overall predictive likelihood
            ## (per test row: picks the probability assigned to the true class)
            predictive.likelihood <- sum(safeLog( apply(cbind(Ptest,
                t.class.TEST), 1, function(s){s[s[Kc+1]]} )));
            if (is.null(PL)) PL <- predictive.likelihood/Ntest
            else             PL <- c(PL, predictive.likelihood/Ntest);
            if (con$bPlotFitting) {
                plot(PL, type="l", main="Predictive Likelihood",
                    lty="dotdash", xlab="Iteration");
            }
            ## Compute the 0-1 error loss.
            fvals <- as.numeric(apply(Ptest, 1, which.max));
            scanTestErr  <- 100*(sum(fvals != t.class.TEST))/Ntest;
            if (is.null(testErr)) testErr <- scanTestErr
            else testErr <- c(testErr, scanTestErr);
            if (con$bPlotFitting) {
                plot((100-testErr), type="l", lty="dotdash", xlab="Iteration",
                    main="Out-of-Sample Percent Prediction Correct");
            }
            printTrace(paste(its, "> Value of Lower-Bound =", scan.lower.bound,
                ",Prediction Error = ", scanTestErr,
                ", Predictive Likelihood = ", predictive.likelihood/Ntest),
                con$sFILE.TRACE, con$InfoLevel);
            #Acc <- 100 - testErr[its];
            vbmultiprob.obj <- structure( list(
               Ptest=Ptest, X=X, invPHI=invPHI, Y=Y, Kc=Kc, M=M,
               sKernelType=con$sKernelType, theta=THETA, 
               lowerBound=lowerBound,testErr=testErr, PL=PL),  class="VBMP.obj");
        }
        ## NOTE(review): vbmultiprob.obj is assigned only inside the monitoring
        ## branch above; with bMonitor=FALSE, bPlotFitting=FALSE and a non-NULL
        ## tmpSave this save() can run before the object exists -- confirm.
        if (! is.null(con$tmpSave)) save(vbmultiprob.obj, file=con$tmpSave);
    }
    vbmultiprob.obj ;
}

