# subroutines for KernSec and KernSur
# written because much of the checking between the two functions
# was common and it seemed so much more elegant to have all data
# verification functions in a single file



# generates a suitable vector of bandwidths for dat
# from any allowed input from bandwidths which can be
# 1 - a vector of length equal to that of dat
# 2 - a single value
# 3 - nothing - if true bandwidths should be set to FALSE
bandwidthselect <- function(dat, bandwidths)
{
# one bandwidth is needed per case
cases <- length(dat)

# default behaviour: bandwidths == FALSE means auto-select a single
# plug-in bandwidth with dpik() and recycle it to every case
# BUG FIX: the original tested bandwidths == "FALSE", a vectorised
# comparison that is an error in modern R when a bandwidth vector is
# supplied; isFALSE() tests the scalar sentinel only
if(isFALSE(bandwidths))
	{
	# strip NAs before handing the data to dpik()
	dat2 <- dat[!is.na(dat)]
	# trap vectors of x likely to upset dpik()
	if(length(unique(dat2)) < 5){stop("too few unique values - no auto bandwidth possible")}
	# calculate a vector of bandwidths based on dat using dpik()
	bandwidths <- rep(dpik(dat2), cases)
	}

# user supplied a single value: recycle it to every case
if(length(bandwidths) == 1){bandwidths <- rep(bandwidths, cases)}

# user supplied a full vector of bandwidths: used as-is

# trap anything unexpected
if(length(bandwidths) != cases){stop("bandwidth vector of funny size")}
# BUG FIX: the original compared against the string "NA", which never
# detects missing values (NA == "NA" is NA, not TRUE)
if(any(is.na(bandwidths))){stop("bandwidth vector contains NA's")}
return(bandwidths)
}










# outputs a vector of ordinates at which to calculate the density
# users may specify the range of the extreme ordinates, give
# a vector of ordinates themselves, or nothing, in which the
# default is gridsize equally spaced points with deadspace times the
# maximum bandwidth at each end from the data
rangeselect <- function(dat, rnge, gridsize, bandwidth, deadspace)
{
lenrange <- length(rnge)

# default behaviour: anything shorter than two values (including the
# sentinel FALSE, and anything unintelligible) means build the range
# from the data
# BUG FIX: the original then tested rnge == "FALSE", a vectorised
# comparison that is an error in modern R whenever the user supplies
# a numeric range; branching on lenrange alone is equivalent and safe
if(lenrange < 2)
	{
	min.dat <- min(dat, na.rm=TRUE)
	max.dat <- max(dat, na.rm=TRUE)
	# make sure that all the values are not the same
	if((max.dat - min.dat) == 0){stop("all x's are the same - no auto range possible")}
	maxbw <- max(bandwidth, na.rm=TRUE)
	# pad the data range by deadspace times the largest bandwidth
	rnge <- c( (min.dat - (deadspace * maxbw)), (max.dat + (deadspace * maxbw)) )
	# calculate the vector of ordinates
	ords <- seq(rnge[1], rnge[2], length.out=round(gridsize))
	}

# user sent the two extreme limits of the range
if(lenrange == 2){ords <- seq(rnge[1], rnge[2], length.out=round(gridsize))}
# user specified their own ordinates
if(lenrange > 2){ords <- rnge}

# remove NA's which may have been lurking in a user supplied vector
# BUG FIX: the original tested any(ords == "NA"), which never detects
# missing values; plain logical subsetting drops them directly
ords <- ords[!is.na(ords)]

# repeated ordinate values from a user vector are an error
if(length(unique(ords)) != length(ords)){stop("repeated values in ordinates")}

return(ords)
}








# constructs a vector of correlations to use with KernSur
# users can:
# 1 - send a single correlation of their choice
# 2 - send a vector of correlations, one for each case
# 3 - send nothing, in which case a default correlation is established
correlationselect <- function(x, y, correlation, cases)
{
# default behaviour: correlation == FALSE means estimate it from the
# data
# BUG FIX: the original tested correlation == "FALSE", a vectorised
# comparison that is an error in modern R when a correlation vector is
# supplied; isFALSE() tests the scalar sentinel only
if(isFALSE(correlation))
	{
	min.x <- min(x, na.rm=TRUE)
	max.x <- max(x, na.rm=TRUE)
	min.y <- min(y, na.rm=TRUE)
	max.y <- max(y, na.rm=TRUE)

	# degenerate (constant) x or y gives a zero correlation,
	# otherwise use the complete-observations Pearson correlation
	if((min.x - max.x == 0) || (min.y - max.y == 0))
		{corre <- 0}
	else
		{corre <- cor(x, y, use="complete.obs")}

	correlation <- rep(corre, cases)
	}

# user sent a single value: recycle it to every case
if(length(correlation) == 1){correlation <- rep(correlation, cases)}

# clamp untenable values so downstream code never sees |r| >= 1
# (which() drops NAs, so NA correlations pass through to the caller's
# NA handling untouched)
if(length(correlation) == cases)
	{
	correlation[which(correlation > 0.999)] <- 0.999
	correlation[which(correlation < -0.999)] <- -0.999
	}

# trap bad vectors for the correlation
if(length(correlation) != cases){stop("wrong vector length for correlation")}

return(correlation)
}

# getlims.r return a vector of bin limits based on a vector of bin centres

getlims <- function(centres)
{
# number of bins (one limit more than bins is produced)
bins <- length(centres)

# bin centres must be distinct
if(length(unique(centres)) != bins){stop("repeated values in bin centres")}

# bin centres must be arranged in ascending order
# BUG FIX: the original length(unique(centres == sort(centres))) == 2
# trick misses vectors that are entirely out of order (e.g. c(2, 1),
# where every elementwise comparison is FALSE)
if(is.unsorted(centres)){stop("bin centre values not in ascending order")}

# define a vector of bin limits to be a vector with n+1 bins
limits <- rep(0, bins+1)

# calculate the bin limits in compiled code
outty <- .C("getlims", as.double(centres), as.double(limits), as.integer(bins))

# reassign the output
limits <- outty[[2]]
return(limits)
}








KernSec <- function(
		    x,
		    xgridsize=100,
		    xbandwidth,
		    range.x,
		    na.rm=FALSE
		   )
{
# univariate kernel density estimate of x evaluated at a vector of
# ordinates; returns a list with the ordinates (xvals) and the
# density (yden)

# multiplier for the xbandwidth which gives the tails at either end of
# the default range (Wand and Jones' 1.5h) - only used for defaults
deadspace <- 1.5
cases <- length(x)

# bandwidth selection: plug-in default via bandwidthselect() when
# missing, otherwise validate/recycle whatever the user supplied
if(missing(xbandwidth))
	{xbandwidth <- bandwidthselect(x, bandwidths=FALSE)}
else
	{xbandwidth <- bandwidthselect(x, xbandwidth)}

# NA handling: bundle into a data frame so na.fail()/na.omit() reject
# or drop whole cases at once
z <- data.frame(x, xbandwidth)
	# if NAs not allowed fail the function
	if(na.rm == FALSE){na.fail(z)}
	# get rid of NA cases
	if(na.rm == TRUE){z <- na.omit(z)}
# reassign the vectors with NAs removed
x <- z$x; xbandwidth <- z$xbandwidth

# ordinate selection: default range padded from the data when missing,
# otherwise honour the user's range or explicit ordinate vector
if(missing(range.x))
	{xvals <- rangeselect(x, rnge=FALSE, xgridsize, xbandwidth, deadspace)}
else
	{xvals <- rangeselect(x, range.x, xgridsize, xbandwidth, deadspace)}

# zero-filled vector to receive the density estimate
xordslen <- length(xvals)
est <- rep(0, xordslen)

# invoke the compiled estimator
out <- .C(
	 "GenKernSec",
	 as.double(x),
	 as.integer(length(x)),
	 as.double(xvals),
	 as.double(xbandwidth),
	 as.double(est),
	 as.integer(xordslen)
	 )

# assign the return values
yden <- out[[5]]

# BUG FIX: multiple-argument return() has been defunct since R 1.8.0;
# return the equivalent named list instead
return(list(xvals = xvals, yden = yden))
}

KernSur <- function(
		 x, 
		 y, 
		 xgridsize=100, 
		 ygridsize=100,	 
		 correlation, 
		 xbandwidth, 
		 ybandwidth, 
		 range.x,
		 range.y,
		 na.rm=FALSE
		 ) 

{
# bivariate (correlated) kernel density surface for x and y; returns a
# list with the two ordinate vectors (xvals, yvals) and the density
# matrix (zden, dimensioned ordinates-in-x by ordinates-in-y)

# multiplier for the bandwidths which gives the tails at either end of
# the default ranges - follows Wand and Jones' 1.5h - defaults only
deadspace <- 1.5
# number of cases to deal with
cases <- length(x)
if(cases != length(y)){stop("x and y vectors of unequal lengths")}

# vector of correlation coefficients: estimated from the data when
# missing, otherwise validated/recycled/clamped by correlationselect()
# BUG FIX: the original initialised its control flag to 0 instead of 1,
# so a user-supplied correlation was never passed through
# correlationselect() at all
if(missing(correlation))
	{correlation <- correlationselect(x, y, correlation=FALSE, cases)}
else
	{correlation <- correlationselect(x, y, correlation, cases)}

# bandwidth selection: plug-in defaults when missing, otherwise
# validate/recycle the user's values
if(missing(xbandwidth))
	{xbandwidth <- bandwidthselect(x, bandwidths=FALSE)}
else
	{xbandwidth <- bandwidthselect(x, xbandwidth)}
if(missing(ybandwidth))
	{ybandwidth <- bandwidthselect(y, bandwidths=FALSE)}
else
	{ybandwidth <- bandwidthselect(y, ybandwidth)}

# NA handling: bundle everything into a data frame so whole cases are
# failed or dropped together
z <- data.frame(x, y, correlation, xbandwidth, ybandwidth)
	# if NAs not allowed fail the function
	if(na.rm == FALSE){na.fail(z)}
	# get rid of NA cases
	if(na.rm == TRUE){z <- na.omit(z)}
# reassign the vectors with NAs removed
x <- z$x; y <- z$y; correlation <- z$correlation; xbandwidth <- z$xbandwidth; ybandwidth <- z$ybandwidth

# range selection and ordinate generation: defaults padded from the
# data when missing, otherwise honour the user's ranges/ordinates
if(missing(range.x))
	{xvals <- rangeselect(x, rnge=FALSE, xgridsize, xbandwidth, deadspace)}
else
	{xvals <- rangeselect(x, range.x, xgridsize, xbandwidth, deadspace)}
if(missing(range.y))
	{yvals <- rangeselect(y, rnge=FALSE, ygridsize, ybandwidth, deadspace)}
else
	{yvals <- rangeselect(y, range.y, ygridsize, ybandwidth, deadspace)}

# setup ordinate vector lengths
ordsinx <- length(xvals)
ordsiny <- length(yvals)
# reset cases with NAs removed
cases <- length(x)
# the C routine needs the squared correlations as well
correlationsq <- correlation ^ 2 
# zero-filled array to receive the kernel surface
corker <- rep(0, (ordsinx * ordsiny)) 

# invoke the compiled estimator
out <- .C( 
	"GenKernSur",
	 as.double(corker), 
	 as.integer(ordsinx), 
	 as.integer(ordsiny),
	 as.double(x), 
	 as.double(y), 
	 as.double(xvals), 
	 as.double(yvals), 
	 as.double(xbandwidth), 
	 as.double(ybandwidth), 
	 as.double(correlation), 
	 as.double(correlationsq),
	 as.integer(cases)
	 ) 
 
# reshape the returned vector into the density matrix
zden <- out[[1]] 
dim(zden) <- c(ordsinx, ordsiny) 

# BUG FIX: multiple-argument return() has been defunct since R 1.8.0;
# return the equivalent named list instead
return(list(xvals = xvals, yvals = yvals, zden = zden))
}
# function nearest which picks the index of a vector which is nearest to a value
# the vector doesn't have to be in any particular order this routine will just
# give the nearest numbers index
# you can specify whether to accept values which are outside the range of the vector
# the default is not to but if you do then you can be way outside the range and
# get a return value
# the only bug is that if you don't specify that values which are outside the
# range of the vector are acceptable then any value which is equal to the minimum
# or maximum of the vector will also return an error so if too many errors
# are returned call with the outside=T flag enabled
# this function is fairly bullet proof but don't expect it to get things right when
# there are many tied values in the vector - from experimenting with it, the function
# returns a vector of the indices of the tied values - so be prepared

nearest <- function(x, xval, outside=FALSE, na.rm=FALSE)
{
# returns the index (positions in the ORIGINAL vector, NAs included)
# of the element of x nearest in value to xval; ties on value may
# return a vector of indices

# keep a copy with NAs intact so returned indices refer to original
# positions; the data.frame trick lets na.fail()/na.omit() work
x1 <- x
z <- data.frame(x, x)
# if NAs not allowed fail the function
if(na.rm == FALSE){na.fail(z)}
# get rid of NA cases
if(na.rm == TRUE){z <- na.omit(z)}
# reassign the vector with NAs removed
x <- z$x

# xval outside the range of x is an error unless explicitly allowed
# (note the documented quirk: xval exactly equal to min(x) or max(x)
# also counts as outside - call with outside=TRUE to accept those)
if(outside == FALSE){if((max(x) <= xval) || (min(x) >= xval)) {stop("value outside vector range")}}

# when outside values are acceptable return the index of whichever
# extreme value lies closer to xval
if(outside == TRUE)
	{
	if((max(x) <= xval) || (min(x) >= xval))
		{
		sorx <- sort(x)
		if(abs(sorx[1] - xval) < abs(sorx[length(sorx)]- xval))
			{index <- 1; vally <- sorx[index]; index <- which(x1 == vally); return(index)}

		# BUG FIX: this comparison was a strict '>', so an exact
		# tie (e.g. a single-element vector) fell through both
		# branches and crashed in the interior code below; ties
		# now resolve to the maximum end
		if(abs(sorx[1] - xval) >= abs(sorx[length(sorx)]- xval))
			{index <- length(sorx); vally <- sorx[index]; index <- which(x1 == vally); return(index)}
		}
	}

# usual case: xval falls within the vector; find the nearest sorted
# neighbours above and below and return the closer one's index
sorx <- sort(x)
upp <- which(sorx >= xval)[1]
low <- which(sorx <= xval); low <- low[length(low)]
upp <- sorx[upp]; low <- sorx[low]

# indices are always looked up in x1 so positions refer to the
# original (pre-na.omit) vector
# BUG FIX: the exact-match branch indexed into the NA-stripped x
# rather than x1 like the other two branches; now consistent
if(upp == low) {index <- which(x1 == upp)}
if((abs(upp - xval)) >= (abs(low - xval))) {index <- which(x1 == low)}
if((abs(upp - xval)) < (abs(low - xval))) {index <- which(x1 == upp)}

return(index)
}


# per.r find the value of the ith percentage point of a probability distribution
# by assuming that if the vals vector has the same number of elements as the density
# then these must correspond to an approximation for bin centres, and interpolates by
# splitting the difference between the centres to find bin extremes
# if the vals vector is one element longer than the density vector then it assumes
# the vals vector represents an approximation to the bin extremes so leaves them alone
# NAs are allowed in the density - if na.rm is TRUE then it assumes that they're really
# zero values - if na.rm is FALSE then it fails with warning
# NAs are not allowed in the vals vector at all - plus there can be no repeated values
# in it, and they MUST be in ascending order - contravene any of these and the function
# will fail with warning
# normally the density cannot have negative values, but if neg.rm is set to TRUE
# then all negative values from the density will be assumed to be zero - this
# occasionally happens from various kde functions

per <- function(den, vals, point, na.rm=FALSE, neg.rm=FALSE)
{
# NA handling
# fail if NAs appear anywhere
if(na.rm == FALSE){na.fail(den); na.fail(vals)}
# treat NA density values as zero - NAs in the ordinates always fail
# BUG FIX: the original tested den == "NA", which never matches (NA
# compared with anything is NA), so the NAs were silently kept and
# poisoned the cumulative sums below
if(na.rm == TRUE){den[is.na(den)] <- 0; na.fail(vals)}

lenden <- length(den)
lenvals <- length(vals)

# vals must approximate either bin centres (same length as den) or
# bin limits (one element longer)
if((lenvals < lenden) || (lenvals > (lenden + 1))){stop("mismatch in vector lengths")}
# test to see whether there are any repeated ordinate values
if(length(unique(vals)) != lenvals){stop("repeated values in ordinates")}
# make sure the ordinates are arranged in ascending order
# BUG FIX: the original length(unique(...)) == 2 trick missed vectors
# that are entirely out of order
if(is.unsorted(vals)){stop("ordinate values not in ascending order")}

# negative density values
# fail if there are any density values less than zero
if(neg.rm == FALSE){if(any(den < 0)){stop("negative values in the density vector")}}
# convert all negative values of the density to zero if required
if(neg.rm == TRUE){den[which(den < 0)] <- 0}


# if vals is a vector approximating bin centres calculate the bin
# extremes in compiled code
	if(lenden == lenvals)
		{
		extremes <- rep(0, lenden + 1)
		outty <- .C("getlims", as.double(vals), as.double(extremes), as.integer(lenden))
		# reassign the bin extreme vector
		extremes <- outty[[2]]
		}

# if vals is a vector approximating bin extremes then use it directly
if(lenvals == (lenden + 1)){extremes <- vals}
		
#####################################################################################

lenextremes <- length(extremes)

# lower and upper extreme of every bin, so a width relating to each
# value of the density may be calculated
extremes1 <- extremes[1:(lenextremes - 1)]
extremes2 <- extremes[2:lenextremes]
# bin width times density gives the integrated area for each bin
volumes <- (abs(extremes2 - extremes1)) * den
# normalise so the bin areas sum to one
sumvolumes <- sum(volumes)
volumes <- volumes / sumvolumes

# find the first bin for which the cumulative area exceeds the
# requested point
culvolumes <- cumsum(volumes)
bin <- which(culvolumes > point)[1]

# proportion of the way across that bin at which the cumulative area
# equals point; the first bin is special because there is no previous
# cumulative value to subtract
if(bin == 1){prop <- point / culvolumes[bin]}
if(bin > 1){prop <- (point - culvolumes[bin-1]) / (culvolumes[bin] - culvolumes[bin -1])}

# prop cannot be greater than 1 - if it is then the algorithm is duff
if(abs(prop) > 1){stop("Bugger! - duff code again - notify developers")}

# linear interpolation within the bin: prop times the distance between
# the two bin extremes
xvalue <- extremes[bin] + ((extremes[bin + 1] - extremes[bin]) * prop)
return(xvalue)
}

# message so users don't get worried by the KernSmooth message when
# loading GenKern
# NOTE(review): this is legacy pre-namespace load code; the cat() and
# require() calls below run as side effects when the file is sourced

cat("\nLoading GenKern version 1.03\n")
cat("Copyright Lucy and Aykroyd 2000\n")
cat("last update August 2001\n")
cat("requires KernSmooth\n\n")

# required packages -  Wand and Jones' KernSmooth package mainly for 
# the dpik() function for the default h's, but it's useful to have anyway
require(KernSmooth)

cat("\nPackage GenKern installed\n\n")

# load up the c module - .First.lib is the historical package load
# hook (superseded by .onLoad in namespaced packages); library.dynam
# attaches the compiled GenKern shared library
.First.lib <- function(lib, pkg) library.dynam("GenKern", pkg, lib)
# library.dynam("libCorKern")


