# Package-name marker used by R's installer for pre-namespace packages;
# identifies this environment as belonging to the classifly package.
.packageName <- "classifly"
# Generate classification data.
# Given a model, this function generates points within
# the range of the data, classifies them, and attempts to locate boundaries 
# by looking at advantage.
# 
# If posterior probabilities of classification are available, then the
# \code{\link{advantage}} will be calculated directly.  If not, \code{\link{knn}}
# is used to calculate the advantage based on the number of neighbouring points
# that share the same classification.  Because knn is $O(n^2)$ this method is rather 
# slow for large (>20,000 say) data sets. 
# 
# By default, the boundary points are identified 
# as those below the 5th-percentile for advantage.  
# 
# @arguments classification model
# @arguments data set used in model
# @arguments number of points to generate
# @arguments method to use, currently either grid (an evenly spaced grid), random (uniform random distribution across cube), or nonaligned (grid + some random perturbation)
# @returns data.frame of classified data
# @keyword datagen 
generate_classification_data <- function(model, data, n, method, advantage) {
	v <- variables(model)

	# Fill the predictor space with simulated points and classify them.
	df <- generate_data(data[, v$predictors, drop=FALSE], n=n, method=method)
	post <- posterior(model, df)

	df[[".ADVANTAGE"]] <- NA
	if (is.null(post)) {
		# No posterior probabilities available: label points via classify(),
		# and (optionally) approximate advantage as the proportion of the
		# k = 5 nearest neighbours sharing the same label.
		df[[v$response]] <- classify(model, df)
		if (advantage) {
			# Rescale to a common range so no single predictor dominates
			# the knn distance computation.
			pred <- rescaler(df[, v$predictors], type="range")
			a <- knn(pred, pred, df[, v$response], prob=TRUE, k=5)

			df[[".ADVANTAGE"]] <- attr(a, "prob")
		}
	} else {
		# Posterior probabilities available: predicted class is the column
		# with the highest probability.
		df[[v$response]] <- factor(max.col(post), levels=1:ncol(post), labels=colnames(post))
		# `advantage` is a logical flag here; advantage(post) still resolves
		# to the advantage() function because R skips non-function bindings
		# when looking up a name used in call position.
		if (advantage) df[[".ADVANTAGE"]] <- advantage(post)
		df <- cbind(df, post)
	}
	df[[".TYPE"]] <- factor("simulated")

	df
}

# Classify
# Common interface to extract predict classification from a variety of classification objects
# 
# If the classification method can produce a matrix of posterior
# probabilities (see \code{\link{posterior}}), then that will be used to 
# calculate the \code{\link{advantage}}.  Otherwise, the classify method
# will be used and the advantage calculated using a k-nearest neighbours
# approach.
# 
# @arguments model object
# @arguments data set used in model
# @alias classify.rpart
# @keyword internal
# Generic dispatcher: extract class predictions from a fitted model.
classify <- function(model, data) {
	UseMethod("classify", model)
}
# rpart returns class labels directly when asked for type = "class".
classify.rpart <- function(model, data, ...) {
	predict(model, data, type="class")
}

# Posterior
# Common interface to extract posterior group probabilities
# 
# Every classification method seems to provide a slightly different 
# way of retrieving the posterior probability of group membership.  This 
# function provides a common interface to all of them
# 
# @arguments model object
# @arguments data set used in model
# @alias posterior.lda
# @alias posterior.qda
# @alias posterior.randomForest
# @alias posterior.nnet
# @alias posterior.svm
# @alias posterior.glm
# @alias posterior.default
# @keyword internal
# Generic: return a matrix of class-membership probabilities
# (rows = cases, columns = classes), or NULL when unavailable.
posterior <- function(model, data) UseMethod("posterior", model)
# Fallback: no posteriors; callers then derive advantage via knn instead.
posterior.default <- function(model, data) NULL
# MASS lda/qda return posteriors as a component of their predict() output.
posterior.qda <- posterior.lda <- function(model, data) predict(model, data)$posterior
# randomForest exposes class probabilities via type = "prob".
posterior.randomForest <- function(model, data) predict(model, data, type="prob")
# e1071 svm attaches probabilities as an attribute; requires the model to
# have been fitted with probability = TRUE.
posterior.svm <- function(model, data) attr(predict(model, data, probability = TRUE), "probabilities")
posterior.nnet <- function(model, data) {
	# nnet's predict() omits one class; append its implied probability so
	# the rows of the returned matrix sum to one.
	p <- predict(model, data)
	cbind(p, 1 - rowSums(p))
}
posterior.glm <- function(model, data) {
	# For a binomial glm, predict(type = "response") is the fitted
	# probability of the *second* factor level (the first level is the
	# "failure" baseline), so it belongs in the second column.  The
	# original code placed it first, swapping the class labels.
	p <- predict(model, data, type="response")
	probs <- cbind(1 - p, p)
	colnames(probs) <- levels(model$model[[variables(model)$response]])
	probs
}

# Advantage
# Calculate the advantage the most likely class has over the next most likely.
# 
# This is used to identify the boundaries between classification regions.
# Points with low (close to 0) advantage are likely to be near boundaries.
# 
# @arguments matrix of posterior probabilities
# @keyword classif
advantage <- function(post) {
	# Per row: gap between the largest and second-largest class probability.
	apply(post, 1, function(p) {
		top <- sort(p, decreasing=TRUE)[1:2]
		top[1] - top[2]
	})
}

# Variables
# Extract predictor and response variables for a model object.
# 
# Due to the way that most model objects are stored, you
# also need to supply the data set you used with the original
# data set.  It currently doesn't support model fitted without
# using a data argument. 
# 
# @alias variables.default
# @arguments model object
# @returns response variable
# @returns predictor variables
# @keyword attribute 
variables <- function(model) {
	UseMethod("variables", model)
}

variables.default <- function(model) {
	# Most fitted models store the formula in $terms: element 2 is the
	# response, element 3 the predictor expression.
	trms <- model$terms
	list(
		response = all.vars(trms[[2]]),
		predictors = all.vars(trms[[3]])
	)
}
# Simulate variable.
# Simulate observations from a vector
# 
# Given a vector of data this function will simulate
# data that could have come from that vector. 
# 
# There are three methods to choose from:
# 
# \itemize{
# 	\item nonaligned (default): grid + some random perturbation
# 	\item grid: grid of evenly spaced observations.  If a factor, 
# 		all levels in a factor will be used, regardless of n
# 	\item random: a random uniform sample from the range of the variable
# }
# 
# @arguments data vector
# @arguments desired number of points (will not always be achieved)
# @arguments simulation method
# @alias simvar.factor
# @alias simvar.numeric
# @keyword datagen 
simvar <- function(x, n=10, method="grid") UseMethod("simvar")
# "random": n draws with replacement from the observed values.
# Any other method: one observation per level, so every level is
# represented regardless of n.
# (The original switch() had a trailing empty alternative, which modern R
# rejects with "duplicate 'switch' defaults" when the default is taken.)
simvar.factor <- function(x, n=10, method="grid") {
	if (method == "random") {
		x[sample(length(x), n, replace=TRUE)]
	} else {
		factor(levels(x), levels=levels(x))
	}
}

# "random": n uniform draws across the observed range.
# Any other method: n evenly spaced values spanning the range.
# (The original switch() had a trailing empty alternative, which modern R
# rejects with "duplicate 'switch' defaults" when the default is taken.)
simvar.numeric <- function(x, n=10, method="grid") {
	bounds <- range(x)
	if (method == "random") {
		runif(n, bounds[1], bounds[2])
	} else {
		seq(bounds[1], bounds[2], length=n)
	}
}

# Generate data
# Generate new data from a data frame.
# 
# This method generates new data that fills the range of 
# the supplied datasets.
# 
# @arguments data frame
# @arguments desired number of new observations
# @arguments method to use, see \code{\link{simvar}}
# @keyword datagen 
generate_data <- function(data, n=10000, method="grid") {
	if (method == "random") {
		# Sample each column independently; lapply (not sapply) keeps
		# factor columns as factors instead of collapsing everything
		# into a single matrix of codes.
		data.frame(lapply(data, simvar, n=n, method=method))
	} else {
		# Aim for ~n points in total: n^(1/p) levels per dimension.
		n <- floor(n ^ (1/ncol(data)))
		df <- data.frame(expand.grid(lapply(data, simvar, n=n, method="grid")))
		if (method == "nonaligned") {
			# Perturb continuous columns by up to half a grid step.
			cont <- !vapply(df, is.factor, logical(1))
			# drop=FALSE keeps a data frame even when only one column
			# is continuous (otherwise lapply iterates over scalars).
			ranges <- lapply(df[, cont, drop=FALSE], function(x) diff(range(x)))
			jitter <- do.call(cbind, lapply(ranges, function(rng) runif(-rng/(2*n), rng/(2*n), n=nrow(df))))
			df[, cont] <- df[, cont, drop=FALSE] + jitter
		}
		df
	}
}
# Classify and explore a data set
# Classifly provides a convenient method to fit a classification function
# and then explore the results in the original high dimensional space.
# 
# This is a convenient function to fit a classification function and
# then explore the results using GGobi.  You can also do this in two
# separate steps using the classification function and then \code{\link{explore}}.
# 
# By default in GGobi, points that are not on the boundary (ie. that have an
# advantage greater than the 5th percentile) are hidden.  To show them, switch
# to brush mode and choose include shadowed points from the brush menu on
# the plot window.  You can then brush them yourself to explore how the 
# certainty of classification varies throughout the space
# 
# Special notes:
# 
# \itemize{
# 	\item You should make sure the response variable is a factor
# 	\item For SVM, make sure to include \code{probability = TRUE} in the arguments to \code{classifly}
# 
# }
# 
# @arguments Data set use for classification
# @arguments Classification formula, usually of the form response ~ predictors
# @arguments Function to use for the classification, eg \code{\link[MASS]{lda}}
# @arguments Other arguments passed to classification function.  For example, if you use \code{\link[e1071]{svm}} you need to use \code{probability = TRUE} so that posterior probabilities can be retrieved.
# @arguments Number of points to simulate.  To maintain the illusion of a filled solid this needs to increase with dimension.  10,000 points seems adequate for up to four or five dimensions, but if you have more predictors than that, you will need to increase this number.
# @arguments method to simulate points: grid, random or nonaligned (default).  See \code{\link{simvar}} for more details on the methods used.
# @arguments type of scaling to apply to data.  Defaults to common range.  See \code{\link[reshape]{rescaler}} for more details.
# @alias package-classifly
# @seealso \code{\link{explore}}, \url{http://had.co.nz/classifly}
# @keyword dynamic 
#X classifly(kyphosis, Kyphosis ~ . , lda)
#X classifly(kyphosis, Kyphosis ~ poly(Age,2) + poly(Number,2) + poly(Start,2) , lda)
#X classifly(kyphosis, Kyphosis ~ . , qda)
#X classifly(kyphosis, Kyphosis ~ . , rpart)
#X classifly(kyphosis, Kyphosis ~ . , knnf, k=3)
#X classifly(kyphosis, Kyphosis ~ . , glm, family="binomial")
#X 
#X classifly(kyphosis, Kyphosis ~ . , svm, probability=TRUE)
#X classifly(kyphosis, Kyphosis ~ . , svm, probability=TRUE, kernel="linear")
#X classifly(kyphosis, Kyphosis ~ . , best.svm, probability=TRUE, kernel="linear")
#X
#X # Also can use explore directly
#X bsvm <- best.svm(Species~., data = iris, gamma = 2^(-1:1), cost = 2^(2:+ 4), probability=TRUE)
#X explore(bsvm, iris)
classifly <- function(data, model, classifier, ..., n=10000, method="nonaligned", type="range") {
	# Rescale first so the fitted model and the simulated grid share a scale.
	scaled <- rescaler(data, type=type)
	fit <- classifier(model, data=scaled, ...)
	explore(fit, scaled, n=n, method=method, advantage=TRUE)
}

# Explore default
# Default method for exploring objects
# 
# The default method currently works for classification
# functions.
# 
# It generates a data set filling the design space, finds 
# class boundaries (if desired) and then displays in a new 
# ggobi instance.
# 
# @arguments classification object
# @arguments data set used with classifier
# @arguments number of points to generate when searching for boundaries
# @arguments method to generate points, see \code{\link{generate_data}}
# @arguments only display boundaries
# @keyword dynamic 
# @seealso \code{\link{generate_classification_data}}, \url{http://had.co.nz/classifly}
# @returns A \code{\link{invisible}} data frame of class \code{classifly} that contains all the simulated and true data.  This can be saved and then printed later to open with rggobi.
#X bsvm <- best.svm(Species~., data = iris, gamma = 2^(-1:1), cost = 2^(2:+ 4), probability=TRUE)
#X explore(bsvm, iris)
explore <- function(model, data, n=10000, method="nonaligned", advantage=TRUE, ...) {
	v <- variables(model)
	# Simulated points filling the design space, classified by the model.
	# The advantage flag is forwarded (it was previously hard-coded TRUE).
	grid <- generate_classification_data(model, data, n=n, method=method, advantage=advantage)
	# Original observations, tagged so they can be told apart from the grid.
	# (Spelled v$predictors in full: the old code relied on $ partial
	# matching of v$predictor.)
	actual <- data[, c(v$predictors, v$response)]
	actual[[".TYPE"]] <- factor("actual")

	data <- rbind.fill(grid, actual)
	class(data) <- c("classifly", class(data))
	attr(data, "variables") <- v
	data
}


# Print classifly object
# Opens with ggobi
# 
# @keyword internal 
print.classifly <- function(x, ...) {
	# "Printing" a classifly object opens it in a new GGobi instance.
	v <- attr(x, "variables")
	g <- ggobi(x)
	
	d <- g[1]
	# Colour by class; index shifted by 1 (colour 1 presumably reserved
	# for something else -- confirm against rggobi palette conventions).
	glyph_colour(d) <- as.numeric(x[[v$response]]) + 1
	# Glyph 1 for simulated points, glyph 6 for actual observations.
	glyph_type(d) <- ifelse(x[[".TYPE"]] == "simulated", 1, 6)
	# Shadow points far from class boundaries (high advantage).
	# NOTE(review): prose elsewhere says "5th percentile" but this uses
	# quantile 0.1 (the 10th) -- confirm the intended cutoff.
	excluded(d) <- !is.na(x[[".ADVANTAGE"]]) & x[[".ADVANTAGE"]] > quantile(x[[".ADVANTAGE"]], 0.1, na.rm=TRUE)
	invisible(d)	
}
# knn, with formula
# A wrapper function for \code{\link[class]{knn}} to allow use
# with classifly.
# 
# @arguments classification formula
# @arguments training data set
# @arguments number of neighbours to use
# @keyword classif
# @alias classify.knnf
knnf <- function(formula, data, k=2) {
	# knn has no training step, so the "model" is simply the training
	# data, the formula's terms, and the neighbourhood size.
	fit <- list(terms = terms(formula, data=data), data = data, k = k)
	class(fit) <- "knnf"
	fit
}

classify.knnf <- function(model, data, ...) {
	# Classify new points by k-nearest-neighbour vote over the stored
	# training data.
	vars <- variables(model)
	train <- model$data[, vars$predictors]
	labels <- model$data[, vars$response]
	knn(train, data[, vars$predictors], labels, k=model$k)
}
