% -*- mode: noweb; noweb-default-code-mode: R-mode; -*-
%\VignetteIndexEntry{Mathematical Details of Functions in dplR}
%\VignetteEngine{knitr::knitr}
%
% Using \Vignette* directives above as if this was a regular vignette.
% However, processing the document takes a long time (~ 50 minutes on
% an Intel i5-3470 CPU) and an external non-free program (MATLAB) is
% suggested (see below).  Therefore a static PDF is provided instead
% of compiling the document every time when R CMD build or R CMD check
% is run on the package.
%
% Build instructions (or go straight to build-math-dplR.R)
%
% 0. Rename this file to math-dplR.Rnw
%
% 1. Compile ("knit") with knitr (R prompt).
%   library(knitr)
%   knit("math-dplR.Rnw", encoding = "UTF-8")
%   purl("math-dplR.Rnw") # optional, extracts R code
% NOTE that files will be created in the current working directory and
% its subdirectories "cache" and "figures".
%
% 2. Compile the file to .pdf (R prompt).
%   tools::texi2pdf("math-dplR.tex")
%
% Additionally, the .pdf file may be compacted by running
%   tools::compactPDF("math-dplR.pdf", gs_quality = "ebook")
%
% Requirements
%
% 1. For knitting:
% - R packages "dichromat", "dplR" (this package), "knitr" and "tikzDevice"
% - R packages "graphics" and "stats" which should always be available
% - Suggested: MATLAB with Curve Fitting Toolbox (system() call to "matlab")
%
% NOTE: MATLAB is used for checking the equivalence of results from
% ffcsaps in dplR and csaps in MATLAB. If TRY_MATLAB (below) is FALSE,
% the comparison will be skipped which will be noted in the
% document. A flip of coin (any change) is needed in the
% "flip-to-retry" chunk below to invalidate the cached (possibly
% failed / skipped) result of the comparison.
%
% 2. For LaTeXing
% - a modern TeX distribution, e.g. TeX Live.
% - required packages are listed below (\usepackage). The xcolor
%   package is also needed. The required packages also have other
%   dependencies which increases the total number of packages
%   required. A TeX Live installation, for example, should have all of
%   the required packages.

\documentclass[a4paper]{article}

<<"try-matlab", echo=FALSE, results="hide">>=
TRY_MATLAB <- TRUE
@ 
<<"flip-to-retry", echo=FALSE, results="hide", cache=TRUE>>=
"tails"
@ 
<<"packages", echo=FALSE, results="hide">>=
library(dplR) # latexify(), latexDate()
latexify2 <- function(x) latexify(x, doublebackslash = FALSE)
library(dichromat)
library(graphics)
library(stats)
@ 

\usepackage[T1]{fontenc}
\usepackage{textcomp}
\usepackage[utf8]{inputenx}
\usepackage[english]{babel}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{hyperref}
\usepackage{tikz}
\usetikzlibrary{shapes.misc,patterns,decorations.pathreplacing}
\hypersetup{
  pdfauthor = {Mikko Korpela},
}
\makeatletter
\AtBeginDocument{
  \hypersetup{
    pdftitle = {\@title},
    pdfsubject = {Dendrochronology Program Library in R},
    pdfkeywords = {dendrochronology, dplR, R, Gini, Spline},
  }
}
\makeatother

\title{Mathematical Details of Functions in dplR}
\author{Mikko Korpela}
\date{\small Processed in \Sexpr{latexify2(R.version.string)} on
  \Sexpr{latexDate()}}

\begin{document}
\maketitle

% This initialization chunk is probably not interesting for people
% extracting the R code from the document. Therefore purl=FALSE.
<<"knitr-init", echo=FALSE, cache=FALSE, purl=FALSE>>=
## Use xcolor instead of color.
## This kludge removes the following warning:
##   Package xcolor Warning: Incompatible color definition on input line xx.
## where xx is a line number of the .tex file produced by knitr.
## Another solution is to put something like
##   \definecolor{fgcolor}{rgb}{0.345, 0.345, 0.345}
## after the line where tikz and xcolor (loaded by tikz) are loaded.
## This substitution of color with xcolor (although a bit ugly)
## looks neater to me.
knit_hooks$set(document = function(x) {
  sub("(\\usepackage(\\[[^]]*\\])?)\\{color\\}", "\\1{xcolor}", x)
})
opts_chunk$set(cache = 2)
opts_chunk$set(echo = FALSE)
opts_chunk$set(cache.path = "cache/") # default (at the time of writing)
@ 

% Figure sizes and device used may be interesting. Therefore we keep
% the default (TRUE) value of the purl option. Compare to the chunk
% above where purl=FALSE.
<<"knitr-init-fig", echo=FALSE, cache=FALSE>>=
PAGE_WIDTH <- 4.74
PAGE_HEIGHT <- 8.22
opts_template$set(myfigures=list(fig.path = "figure/", fig.pos = "tbp",
                  fig.align = "center", fig.lp = "fig:", dev = "tikz"))
@ 

<<"response-comp-init">>=
## Helper function used in ffcsaps2
inc <- function(from, to) {
    if (is.numeric(to) && is.numeric(from) && to >= from) {
        seq(from=from, to=to)
    } else {
        integer(length=0)
    }
}
## Copied from ffcsaps() in dplR/R/ffcsaps.R,
## with the following additions:
## - As an alternative to nyrs and f, smoothing parameter p
##   can be directly specified as an argument to the function
## - altP = TRUE activates a different (incompatible) formula
##   for computing p as a function of nyrs and f
ffcsaps2 <- function(y, x=seq_along(y), nyrs=length(y)/2, f=0.5,
                     p, altP = FALSE) {
### support functions
    ffppual <- function(breaks, c1, c2, c3, c4, x, left){
        if (left){
            ix <- order(x)
            x2 <- x[ix]
        } else{
            x2 <- x
        }

        n.breaks <- length(breaks)
        if (left) {
            ## index[i] is maximum of a and b:
            ## a) number of elements in 'breaks[-n.breaks]' that are
            ##    less than or equal to x2[i],
            ## b) 1
            index <- pmax(ffsorted(breaks[-n.breaks], x2), 1)
        } else {
            ## index[i] is:
            ## 1 + number of elements in 'breaks[-1]' that are
            ## less than x2[i]
            index <- ffsorted2(breaks[-1], x2)
        }

        x2 <- x2 - breaks[index]
        v <- x2 * (x2 * (x2 * c1[index] + c2[index]) + c3[index]) + c4[index]

        if (left)
            v[ix] <- v
        v
    }

    ffsorted <- function(meshsites, sites) {
        index <- order(c(meshsites, sites))
        which(index > length(meshsites)) - seq_along(sites)
    }

    ffsorted2 <- function(meshsites, sites) {
        index <- order(c(sites, meshsites))
        which(index <= length(sites)) - seq(from=0, to=length(sites)-1)
    }

    ## Creates a sparse matrix A of size n x n.
    ## The columns of B are set to the diagonals of A so that column k
    ## becomes the diagonal in position d[k] relative to the main
    ## diagonal (zero d[k] is the main diagonal, positive d[k] is
    ## above, negative is below the main diagonal).
    ## A value on column j in A comes from row j in B.
    ## This is similar in function to spdiags(B, d, n, n) in MATLAB.
    spdiags <- function(B, d, n) {
        n.d <- length(d)
        A <- matrix(0, n.d * n, 3)
        count <- 0
        for(k in seq_len(n.d)){
            this.diag <- d[k]
            i <- inc(max(1, 1 - this.diag), min(n, n - this.diag)) # row
            n.i <- length(i)
            if(n.i > 0){
                j <- i + this.diag                                 # column
                row.idx <- seq(from=count+1, by=1, length.out=n.i)
                A[row.idx, 1] <- i
                A[row.idx, 2] <- j
                A[row.idx, 3] <- B[j, k]
                count <- count + n.i
            }
        }
        A <- A[A[, 3] != 0, , drop=FALSE]
        A[order(A[, 2], A[, 1]), , drop=FALSE]
    }

### start main function

    y2 <- as.numeric(y)
    ## If as.numeric() does not signal an error, it is unlikely that
    ## the result would not be numeric, but...
    if(!is.numeric(y2)) stop("'y' must be coercible to a numeric vector")
    x2 <- as.numeric(x)
    if(!is.numeric(x2)) stop("'x' must be coercible to a numeric vector")

    n <- length(x2)
    ## quick error check
    if (n < 3) stop("there must be at least 3 data points")
    if (missing(p)) {
        if(!is.numeric(f) || length(f) != 1 || f < 0 || f > 1)
            stop("'f' must be a number between 0 and 1")
        if(!is.numeric(nyrs) || length(nyrs) != 1 || nyrs <= 1)
            stop("'nyrs' must be a number greater than 1")
    }

    ix <- order(x2)
    zz1 <- n - 1
    xi <- x2[ix]
    zz2 <- n - 2
    diff.xi <- diff(xi)

    ## quick error check
    if (any(diff.xi == 0)) stop("the data abscissae must be distinct")

    yn <- length(y2)

    ## quick error check
    if (n != yn)
        stop("abscissa and ordinate vector must be of the same length")

    arg2 <- -1:1
    odx <- 1 / diff.xi
    R <- spdiags(cbind(c(diff.xi[-c(1, zz1)], 0),
                       2 * (diff.xi[-1] + diff.xi[-zz1]),
                       c(0, diff.xi[-c(1, 2)])),
                 arg2, zz2)
    R2 <- spdiags(cbind(c(odx[-zz1], 0, 0),
                        c(0, -(odx[-1] + odx[-zz1]), 0),
                        c(0, 0, odx[-1])),
                  arg2, n)
    R2[, 1] <- R2[, 1] - 1
    forR <- matrix(0, zz2, zz2)
    forR2 <- matrix(0, zz2, n)
    forR[R[, 1] + (R[, 2] - 1) * zz2] <- R[, 3]
    forR2[R2[, 1] + (R2[, 2] - 1) * zz2] <- R2[, 3]
    if (!missing(p)) {
        ## NEW: give value of p directly as an argument
        p.inv <- 1 / p
    } else if (altP) {
        ## NEW: what if the value of p was computed with the formula
        ## from Cook and Kairiukstis (1990).
        p.inv <- (1 - f) * (cos(2 * pi / nyrs) + 2) /
            (6 * (cos(2 * pi / nyrs) - 1) ^ 2) / f
        p <- 1 / p.inv
    } else {
        ## The following order of operations was tested to be relatively
        ## accurate across a wide range of f and nyrs
        p.inv <- (1 - f) * (cos(2 * pi / nyrs) + 2) /
            (12 * (cos(2 * pi / nyrs) - 1) ^ 2) / f + 1
        p <- 1 / p.inv
    }
    yi <- y2[ix]
    mplier <- 6 - 6 / p.inv # slightly more accurate than 6*(1-1/p.inv)
    ## forR*p is faster than forR/p.inv, and a quick test didn't
    ## show any difference in the final spline
    u <- solve(mplier * tcrossprod(forR2) + forR * p,
               diff(diff(yi) / diff.xi))
    yi <- yi - mplier * diff(c(0, diff(c(0, u, 0)) / diff.xi, 0))
    test0 <- xi[-c(1, n)]
    c3 <- c(0, u / p.inv, 0)
    x3 <- c(test0, seq(from=xi[1], to=xi[n], length = 101))
    cc.1 <- diff(c3) / diff.xi
    cc.2 <- 3 * c3[-n]
    cc.3 <- diff(yi) / diff.xi - diff.xi * (2 * c3[-n] + c3[-1])
    cc.4 <- yi[-n]
    to.sort <- c(test0, x3)
    ix.final <- order(to.sort)
    sorted.final <- to.sort[ix.final]
    tmp <-
        unique(data.frame(sorted.final,
                          c(ffppual(xi, cc.1,cc.2,cc.3,cc.4, test0, FALSE),
                            ffppual(xi, cc.1,cc.2,cc.3,cc.4, x3, TRUE))[ix.final]))
    ## get spline on the right timescale - kludgy
    tmp2 <- tmp
    tmp2[[1]] <- round(tmp2[[1]], 5) # tries to deal with identical() issues
    res <- tmp2[[2]][tmp2[[1]] %in% x2]
    ## deals with identical() issues via linear approx
    if(length(res) != n)
        res <- approx(x=tmp[[1]], y=tmp[[2]], xout=x2)$y
    res
}
@ 

<<"response-init">>=
##  Cook, E. R. and Kairiukstis, L. A. (1990) Methods of
##  Dendrochronology: Applications in the Environmental Sciences.
##  Cook, E. R. and Peters, K. (1981) The smoothing spline: a new
##  approach to standardizing forest interior tree-ring width series
##  for dendroclimatic studies
##  (altP = TRUE)
pCook <- function(nyrs, f = 0.5) {
    p.inv <- (1 - f) * (cos(2 * pi / nyrs) + 2) /
        (6 * (cos(2 * pi / nyrs) - 1) ^ 2) / f
    p <- 1 / p.inv
    p
}
## Frequency response according to Cook and Kairiukstis (citing Cook
## and Peters)
respCook <- function(f, p) {
    pif2 <- 2 * pi * f
    1 - 1 / (1 + (p * (cos(pif2) + 2)) / (6 * (cos(pif2) - 1)^2))
}
@ 

\section{Introduction}

This document presents mathematical details about the Dendrochronology
Program Library in R (dplR)~\cite{Bunn2008115,Bunn2010251} which is an
add-on package for R~\cite{Rman}.  Section~\ref{sec:ffcsaps} deals
with the spline smoothing function \texttt{ffcsaps} whereas
section~\ref{sec:gini.coef} covers the computation of Gini
coefficients in \texttt{gini.coef}.

The original implementations of the functions covered here were not
written by the author of this document.  Therefore the functions were
analyzed with a reverse engineering approach.  At the time of writing,
dplR was at version 1.6.0.  Although any changes affecting the
mathematical details of the functions are unlikely, the reader is
advised to check that the document file originated from a current
version of dplR.

\section{Spline Smoothing Parameters in ffcsaps}
\label{sec:ffcsaps}

The \texttt{ffcsaps} function fits a cubic smoothing spline to a given
data vector.  In the manual (Rd file) of the function~\cite{dplRman},
it is stated that the frequency response of the spline is \texttt{f}
at a wavelength (period) of \texttt{nyrs} years\footnote{assuming that
  the sampling rate is once per year}, where these two are parameters
of the function.  We aim to clarify how they relate to the single
smoothing parameter of the spline and what that parameter stands for.
The smoothing parameter is denoted by \texttt{p} in the source code of
the function.

The manual of the \texttt{ffcsaps} function
cites~\cite{cook1990methods}. On page 111, they give the following
frequency (amplitude) response function for the spline:
\begin{equation}
\label{eq:response}
u(f)=1-\frac{1}{1 + \frac{p(\cos (2\pi f) +2)}{6(\cos (2\pi f) -1)^2}}
\quad ,
\end{equation}
where \(f\) is frequency and \(p\) is stated to be the Lagrange
multiplier of the spline, the single parameter that determines the
frequency response.  However, the exact definition of the optimization
problem is absent.  Neither is it given in~\cite{cook1981smoothing},
the reference used by~\cite{cook1990methods}.  I did not find a copy
of~\cite{peters1981cubic} when trying to follow the chain of
references further.

Note that the relationship between frequency and period using mixed
notation of \texttt{ffcsaps} and~\eqref{eq:response} is \(f =
1/\mathtt{nyrs}\).  Setting parameters \texttt{f} and \texttt{nyrs} in
\texttt{ffcsaps} is equivalent to the following directive: set the
smoothing parameter to a value that fulfills \(u(1/\mathtt{nyrs}) =
\mathtt{f}\).  By making the variable substitutions and
rearranging~\eqref{eq:response} we get the following equation for
\(p\):
\begin{equation}
\label{eq:p.book}
p = \frac{6 \mathtt{f} (\cos (2\pi / \mathtt{nyrs}) -1)^2}{(1 -
  \mathtt{f})(\cos (2\pi / \mathtt{nyrs}) +2)}
\end{equation}
or for the inverse of \(p\):
\begin{equation}
\label{eq:pinv.book}
\frac{1}{p} = \frac{(1 - \mathtt{f})(\cos (2\pi / \mathtt{nyrs})
  +2)}{6 \mathtt{f} (\cos (2\pi / \mathtt{nyrs}) -1)^2} \quad .
\end{equation}

The source code of \texttt{ffcsaps} contains code lines that
correspond to the equation
\begin{equation}
\label{eq:pinv.code}
\mathtt{p.inv} = \frac{1}{\mathtt{p}} = \frac{(1 - \mathtt{f})(\cos
  (2\pi / \mathtt{nyrs}) +2)}{12 \mathtt{f} (\cos (2\pi /
  \mathtt{nyrs}) -1)^2} + 1 \quad ,
\end{equation}
where \(\mathtt{p}\) and its inverse \(\mathtt{p.inv}\) are variables
used in the code.  We find that~\eqref{eq:pinv.book}
and~\eqref{eq:pinv.code} are connected by
\begin{equation}
\label{eq:pp.inv}
\frac{1}{p} = 2 \left(\frac{1}{\mathtt{p}} - 1\right)
\end{equation}
or equivalently
\begin{equation}
\label{eq:pp}
\frac{\mathtt{p}}{1 - \mathtt{p}} = 2 p \quad .
\end{equation}

<<"response-comp", message=FALSE, dependson="response-comp-init", cache.vars=c("response1", "response2", "NYRS", "nFreq")>>=
N <- 1536
K <- 500
NYRS <- c(4, 16, 64)
nFreq <- N / 2 + 1
halfseq <- seq_len(nFreq)

ratio1 <- array(NA_real_, c(nFreq, K, length(NYRS)))
ratio2 <- array(NA_real_, c(nFreq, K, length(NYRS)))

if (!exists(".Random.seed", globalenv(), mode="numeric")) {
    foo <- sample(TRUE)
}
seed <- get(".Random.seed", globalenv())
rng <- RNGversion("2.15.0")
set.seed(123)

## Because this takes a long time, progress messages will be printed
updates <- round(c(0.002, 0.02, seq_len(9)/10) * K)
updates <- updates[updates >= 1]
upIdx <- 1
time0 <- Sys.time()
message(sprintf("Starting spline frequency response test at %s",
                format(time0, "%X")))
message("Progress messages will be printed along the way.")
for (k in seq_len(K)) {
    x <- rnorm(N)
    fftx <- abs(fft(x))[halfseq]
    for (j in seq_along(NYRS)) {
        nyrs <- NYRS[j]
        spline1 <- ffcsaps2(x, nyrs = nyrs, altP = FALSE)
        spline2 <- ffcsaps2(x, nyrs = nyrs, altP = TRUE)
        fft1 <- abs(fft(spline1))[halfseq]
        fft2 <- abs(fft(spline2))[halfseq]
        ratio1[, k, j] <- fft1 / fftx
        ratio2[, k, j] <- fft2 / fftx
    }
    if (length(updates) >= upIdx && k == updates[upIdx]) {
        upIdx <- upIdx + 1
        timeNow <- Sys.time()
        timeElapsed <- difftime(timeNow, time0, units = "mins")
        timePerRound <- timeElapsed / k
        roundsLeft <- K - k
        timeLeft <- roundsLeft * timePerRound
        timeAtFinish <- timeNow + timeLeft
        message(sprintf(paste0("%4.1f%% done. ",
                               "Estimated completion at %s (%.0f mins left)"),
                        k / K * 100,
                        format(timeAtFinish, "%X"),
                        as.numeric(timeLeft)))
    }
}
message("Finished.")

RNGkind(rng[1], rng[2])
assign(".Random.seed", seed, globalenv())

response1 <- matrix(NA_real_, nFreq, 3)
response2 <- matrix(NA_real_, nFreq, 3)
colnames(response1) <- NYRS
colnames(response2) <- NYRS
for (j in seq_along(NYRS)) {
    response1[, j] <- rowMeans(ratio1[, , j])
    response2[, j] <- rowMeans(ratio2[, , j])
}
@ 

<<"ffcsaps-caption", cache=FALSE>>=
FFCSAPS_CAPTION <-
    paste("Theoretical frequency response of spline filter vs response",
    "with i.i.d. normal series of 1536 samples (mean of 500 repeats)",
    "using \\texttt{ffcsaps}.  The legend on the bottom panel applies to",
    "all panels.  The blue circles were obtained by",
    "using~\\eqref{eq:pinv.code} for computing (inverse) \\texttt{p} in",
    "\\texttt{ffcsaps}.  The orange crosses show the results",
    "when~\\eqref{eq:pinv.book} is used instead.")
@ 
<<"response", opts.label="myfigures", fig.width=PAGE_WIDTH, fig.height=PAGE_HEIGHT-0.95, fig.cap=FFCSAPS_CAPTION, dependson=c("response-init", "response-comp"), cache.vars=character(0)>>=
op <- par(mfcol = c(3, 1), mgp = c(2, 0.75, 0), mar = par("mar") - 1)

COLOR_1 <- colorschemes$Categorical.12[10]
COLOR_2 <- colorschemes$Categorical.12[2]
COLOR_LINE <- colorschemes$Categorical.12[6]
LWD <- 3
PCH_1 <- 1
PCH_2 <- 4
fftFreq <- seq(from = 0, to = 0.5, length.out = nFreq)
for (j in seq_along(NYRS)) {
    plot(fftFreq, response1[, j], type = "n",
         xlab = "Frequency (1 / year)", ylab = "Amplitude response",
         main = sprintf("\\texttt{nyrs} = %d, \\texttt{f} = 0.5", NYRS[j]))
    points(fftFreq, response2[, j], pch = PCH_2, col = COLOR_2)
    points(fftFreq, response1[, j], pch = PCH_1, col = COLOR_1)
    lines(fftFreq, respCook(fftFreq, pCook(NYRS[j])), col = COLOR_LINE,
          lwd = LWD)
    abline(h = 0.5, lty = "dashed")
    abline(v = 1 / NYRS[j], lty = "dashed")
    text(0.35, 0.5, "50\\% response", pos = 1, offset=1)
    text(1 / NYRS[j], 0.6,
         sprintf("%d yr period", NYRS[j]),
         pos = 4, srt = 90, offset=1)
}
legend("topright", bg = "white",
       legend = c("Simulation (\\texttt{p} from \\texttt{ffcsaps()})",
       "Simulation (\\texttt{p} from Cook and Peters)",
       "Theoretical (Cook and Peters))"),
       col = c(COLOR_1, COLOR_2, COLOR_LINE),
       lty = c(NA, NA, "solid"), pch = c(PCH_1, PCH_2, NA),
       lwd = c(1, 1, LWD))
par(op)
@ 
Figure~\ref{fig:response} shows the results of a test where the
frequency response of \texttt{ffcsaps} (blue circles) is compared to
the theoretical result (green line) obtained using \eqref{eq:response}
and~\eqref{eq:p.book}.  We see that theory meets practice very well,
particularly for low frequencies.  It must be noted that the
theoretical result does not take into account the effect of having a
series of finite length. The orange crosses show what happens if one
pretends that \(p\) and \texttt{p} are the same quantity,
forgetting~\eqref{eq:pp.inv} and~\eqref{eq:pp}.

<<"smoothed-R", dependson="response-comp-init", cache.vars=c("smoothed.R", "y")>>=
if (!exists(".Random.seed", globalenv(), mode="numeric")) {
    foo <- sample(TRUE)
}
seed <- get(".Random.seed", globalenv())
rng <- RNGversion("2.15.0")
set.seed(234)

## Sine wave with added noise
y <- 5 * sin(seq(from = 0, to = 6*pi, length.out = 101)[-101]) + rnorm(100)

RNGkind(rng[1], rng[2])
assign(".Random.seed", seed, globalenv())

## Smoothing parameter used with csaps and ffcsaps modified to accept p
## 0, 0.01, 0.02, ..., 0.98, 0.99, 1
P <- seq(0, 100) / 100

## Columns of the matrices correspond to elements of P
smoothed.R <- matrix(0, length(y), length(P))
for (i in seq_along(P)) {
    smoothed.R[, i] <- ffcsaps2(y, p = P[i])
}
@ 
<<"smoothed-matlab", dependson=c("smoothed-R", "flip-to-retry"), cache.vars=c("matlabValue", "matlabVersion", "smoothed.matlab")>>=
if (isTRUE(TRY_MATLAB)) {
    fnames <- tempfile(pattern=c("a", "b", "c"), fileext=".txt")
    fname1 <- fnames[1]  # input series y
    fname2 <- fnames[2] # smoothed series from MATLAB
    fname3 <- fnames[3] # MATLAB version
    writeLines(as.character(y), fname1)

    ## System call to MATLAB.
    ## Requirement: MATLAB with Curve Fitting Toolbox.
    matlabCall <-
        paste0("matlab -nodisplay -nojvm ",
               shQuote(paste0("-r \"",
                              "x=1:100;",
                              "P=(0:100)/100;",
                              "fname1 = '", fname1, "';",
                              "y=load(fname1);",
                              "Y=zeros(100,101);",
                              "try,",
                              "for i=1:101, Y(:,i) = csaps(x,y,P(i),x); end,",
                              "catch e, exit(1), end;",
                              "fname2 = '", fname2, "';",
                              "fname3 = '", fname3, "';",
                              "save(fname2, 'Y', '-ascii');",
                              "fid=fopen(fname3, 'w', 'n', 'UTF-8');",
                              "fprintf(fid, '%s\\n', version);",
                              "fclose(fid);",
                              "exit\"")))
    matlabValue <-
        system(matlabCall, ignore.stdout = TRUE, ignore.stderr = TRUE)
    if (matlabValue != 0) {
        smoothed.matlab <- NULL
        matlabVersion <- NULL
    } else {
        smoothed.matlab <- as.matrix(read.table(fname2))
        con <- file(fname3, "r", encoding="UTF-8")
        matlabVersion <- readLines(con)
        close(con)
    }
    unlink(fnames)
} else {
    matlabValue <- NULL
    smoothed.matlab <- NULL
    matlabVersion <- "8.3.0.532 (R2014a)" # tested ok on 2014-05-12
}
@ 
<<"R-matlab-compare", cache=FALSE, error=FALSE>>=
if (isTRUE(TRY_MATLAB) && matlabValue == 0) {
    stopifnot(identical(as.numeric(dim(smoothed.R)), c(100, 101)),
              identical(as.numeric(dim(smoothed.matlab)), c(100, 101)))

    ## Compare Matlab and R results with all.equal, one column (value of
    ## smoothing parameter from P) at a time
    allEqual <- logical(101)
    for (i in seq_len(101)) {
        allEqual[i] <- isTRUE(all.equal(smoothed.matlab[, i], smoothed.R[, i]))
    }
    ## A difference in spline smoothing results between dplR and MATLAB
    ## (when results from MATLAB are available) will stop the document
    ## from compiling.
    stopifnot(all(allEqual))
}
@ 

<<"smoothed-caption", cache=FALSE>>=
SMOOTHED_CAPTION <-
    paste("Spline with different values of smoothing parameter",
          "\\texttt{p} fitted to a noisy sine wave")
@ 
<<"smoothed", opts.label="myfigures", fig.width=PAGE_WIDTH, fig.height=PAGE_WIDTH, fig.cap=SMOOTHED_CAPTION, dependson="smoothed-R", cache.vars=character(0)>>=
## Plot the input series and a few output series
COLORS <- c("black", colorschemes$Categorical.12[c(10, 2, 6, 8)])
mar <- par("mar")
mar <- mar - 1.5
mar[1] <- mar[1] - 0.3
mar[3] <- mar[3] + 0.3
op <- par(lwd = 3, mgp = c(2, 0.75, 0), xpd = NA, mar = mar)
plot(smoothed.R[, 101], ylab = "", col = COLORS[1])
LTY <- c("solid", "solid", "dashed", "solid")
lines(smoothed.R[, 91], col = COLORS[2], lty=LTY[1])
lines(smoothed.R[, 51], col = COLORS[3], lty=LTY[2])
lines(smoothed.R[, 11], col = COLORS[4], lty=LTY[3])
lines(smoothed.R[, 1], col = COLORS[5], lty=LTY[4])
usr <- par("usr")
legend(usr[1], usr[4], xjust = 0, yjust = 0, cex = 0.85,
       legend = paste("\\texttt{p} =",
       c("1 (input)", "0.9", "0.5", "0.1", "0")),
       col = COLORS,
       lty = c(NA, LTY),
       pch = c(1, rep.int(NA, 4)), ncol = 3,
       bty = "n")
par(op)
@ 

<<"matlab-version", cache=FALSE>>=
if (isTRUE(TRY_MATLAB) && matlabValue == 0) {
    matlabVersionText <- paste0("(version ", latexify2(matlabVersion), ")")
}
@ 

<<"matlab-note", cache=FALSE, message=FALSE>>=
matlabNoteText <- if (!isTRUE(TRY_MATLAB)) {
    message(paste("Set TRY_MATLAB=TRUE and re-knit the document to repeat the comparison.",
                  "MATLAB with Curve Fitting Toolbox required.", sep="\n"))
    ""
} else if (matlabValue != 0) {
    if (matlabValue == 127) {
        msg <- "MATLAB could not be run."
        LaTeXmsg <- "\\textsc{matlab} could not be run."
    } else if (matlabValue == 1) {
        msg <- "Function csaps in MATLAB could not be run."
        LaTeXmsg <-
            "Function \\texttt{csaps} in \\textsc{matlab} could not be run."
    } else {
        msg <- "Unexpected problem with system(\"matlab...\")."
        LaTeXmsg <- "Unexpected problem with \\texttt{system(\"matlab...\")}."
    }    
    message(msg)
    sprintf(paste0("\\textbf{",
                   "A problem occurred when the document was compiled:",
                   "} \\textcolor{red}{%s}"), LaTeXmsg)
} else {
    "The result was reproduced when this document was compiled."
}
@ 

\textsc{matlab} \Sexpr{matlabVersionText} contains a function called
\texttt{csaps} in the ``Curve Fitting Toolbox''.  The name bears a
resemblance to \texttt{ffcsaps}.  The smoothing parameter of
\texttt{csaps} is called \texttt{p} which makes it a namesake of the
internal variable of \texttt{ffcsaps} derived from the parameters
\texttt{nyrs} and \texttt{f}.  To compare the results of the two
functions, I modified \texttt{ffcsaps} slightly so that it can take
\texttt{p} as an argument and omit \texttt{nyrs} and \texttt{f}.  I
created a segment of a noisy sine wave and smoothed it with both
functions using all values of \texttt{p} in the set \(\{0, 0.01, 0.02,
\dots, 1\}\), covering the whole range of useful
values~\cite{csaps}\footnote{The Mathworks web site openly provides access
  to the documentation of the latest Matlab version.  Older documents
  are available after a login to the website or by running \texttt{doc
    csaps} in the command prompt of a particular Matlab version.  The
  author has checked that the documentation of \texttt{csaps} agrees
  between versions 2012b, 2013a, 2013b, 2014a and 2014b.}.
Using the R function \texttt{all.equal} to compare each pair of
smoothed series, I found that the results of \texttt{csaps} and
\texttt{ffcsaps} always matched.  \Sexpr{matlabNoteText}
Figure~\ref{fig:smoothed} shows the input series and four smoothed
series of the experiment.

According to the documentation of \texttt{csaps}~\cite{csaps}, it fits
the spline by minimizing the following sum, presented here in a
simplified form:
\begin{equation}
\label{eq:sum}
\mathtt{p} \times \text{Error} + (1 - \mathtt{p}) \times
\text{Roughness} \quad .
\end{equation}
Having demonstrated that the results of \texttt{csaps} and
\texttt{ffcsaps} match when using any chosen value of \texttt{p}, we
can safely say that~\eqref{eq:sum} is also the function minimized by
\texttt{ffcsaps}, with the same definitions of Error and Roughness,
details of which are omitted here.  A more direct analysis would
require one to completely reverse engineer the scarcely documented
source code of \texttt{ffcsaps}.  Following from~\eqref{eq:pp}
and~\eqref{eq:sum}, the splines described in~\cite{cook1990methods}
seem to be the result of minimizing
\begin{equation}
\label{eq:cooksum}
2 p \times \text{Error} + \text{Roughness} \quad .
\end{equation}

\section{Formulation of Gini Coefficient in gini.coef}
\label{sec:gini.coef}

The \texttt{gini.coef} function computes the Gini coefficient (Gini
index) of a given data vector.  The manual (Rd file) of the function
has a reference to~\cite{biondi2008inequality} which uses the
following formula for the Gini coefficient (\(G\)):
\begin{equation}
  \label{eq:rmd}
  G = \frac{1}{2 n \sum_{i=1}^{n} x_i} \sum_{i=1}^{n} \sum_{j=1}^{n}
  \left| x_i - x_j \right| \quad .
\end{equation}
In~\eqref{eq:rmd}, the Gini coefficient is defined in terms of
pairwise differences between all pairs of observations (\(x_i\text{,
}i \in 1, \dots, n\)). More specifically, the Gini coefficient is one
half of the relative mean difference, which is defined as the mean of
the absolute pairwise distances divided by the mean of the
observations.

The C source code of the \texttt{gini.coef} function uses the
following formula for the Gini index:
\begin{equation}
  \label{eq:lorenz}
  G = \left(X_n (n - 1) - 2 \sum_{i=1}^{n-1}X_i\right) / (X_n n) \quad ,
\end{equation}
where \(n\) is the number of observations and \(X_i\) is the
\(i\text{:th}\) cumulative sum
\begin{equation}
  \label{eq:cumsum}
  X_i = \sum_{j=1}^{i} x_j
\end{equation}
of sorted observations \(x_j\):
\begin{equation}
  \label{eq:sorted}
  \forall i: i < j \Rightarrow x_i \leq x_j \quad .
\end{equation}
\eqref{eq:lorenz} can be reformulated as
\begin{equation}
  \label{eq:lorenz2}
  G = 1 - \frac{1}{n} - \frac{2}{X_n n} \sum_{i=1}^{n-1}X_i
\end{equation}
or as
\begin{equation}
  \label{eq:lorenz3}
  G = \left(\frac{1}{2} - \left(\frac{1}{2n} + \frac{1}{X_n n}
      \sum_{i=1}^{n-1}X_i\right)\right) / \frac{1}{2} \quad .
\end{equation}
When we assign
\begin{equation}
  \label{eq:AB}
  A + B = \frac{1}{2}
\end{equation}
and
\begin{equation}
  \label{eq:B}
  B = B_1 + B_2 = \frac{1}{2n} + \frac{1}{X_n n} \sum_{i=1}^{n-1}X_i
  \quad ,
\end{equation}
\eqref{eq:lorenz3} becomes
\begin{equation}
  \label{eq:lorenzsymb}
  G = A / (A + B)
\end{equation}
or equivalently
\begin{equation}
  \label{eq:lorenzsymb2}
  G = 1 - 2 B \quad .
\end{equation}

Figure~\ref{fig:lorenz} is a graphical representation of the Gini
coefficient using an example data set of the following six observed
values: \(\{0.2, 0.4, 0.75, 0.95, 1.2, 2.5\}\). It shows the
definition of the Gini coefficient as the ratio of the area above the
Lorenz curve~\cite{lorenz1905} to the total area of the
triangle~\cite{xu2003has}.  The Lorenz curve is defined by the
cumulative distribution function of the empirical probability
distribution of the observations.  The sides of the triangle
corresponding to the axes are normalized to length 1.

Comparing Figure~\ref{fig:lorenz} to \eqref{eq:B}, \(B_2 =
\sum_{i=1}^{n-1}X_i / (X_n n)\) is obviously the sum of the areas of
the \textcolor{cyan}{cyan} bars.  Summing the areas of the
\textcolor{teal}{teal} triangles, we get
\begin{equation}
\label{eq:B1}
\sum_{i=1}^{n}\left( \frac{1}{2} \frac{1}{n} \frac{x_i}{X_n} \right) =
\frac{1}{2 n X_n}\sum_{i=1}^{n} x_i = \frac{1}{2 n} = B_1 \quad .
\end{equation}
Note that \(B_1\) only depends on the number of observations, not on
their values.  From~\eqref{eq:B} and~\eqref{eq:lorenzsymb2} we find
that the value of the Gini coefficient at maximum inequality (winner
takes all) is \(G_{\text{max}}(n)=1 - 1 / n\).  When all observed
values are equal, the Lorenz curve matches the line of equality
(Figure~\ref{fig:lorenz}), and the Gini coefficient is
\(G_{\text{min}}=0\).  We have assumed that all values \(x_i\) are
non-negative.

\begin{figure}
  \centering
  \begin{tikzpicture}
    % define points
    \coordinate (origin) at (0,0);
    \coordinate (A) at (1,0.2);
    \coordinate (B) at (2,0.6);
    \coordinate (C) at (3,1.35);
    \coordinate (D) at (4,2.3);
    \coordinate (E) at (5,3.5);
    \coordinate (F) at (6,6);
    % fills
    \fill[pink] (origin) -- (A) -- (B) -- (C) -- (D) -- (E) -- (F);
    \fill[teal] (origin) -- +(1,0) -- (A);
    \fill[teal] (A) -- +(1,0) -- (B);
    \fill[teal] (B) -- +(1,0) -- (C);
    \fill[teal] (C) -- +(1,0) -- (D);
    \fill[teal] (D) -- +(1,0) -- (E);
    \fill[teal] (E) -- +(1,0) -- (F);
    \fill[cyan] (1,0) -- (A) -- +(1,0) -- (B) -- +(1,0) -- (C) --
    +(1,0) -- (D) -- +(1,0) -- (E) -- +(1,0) -- (6,0);
    % triangle
    \draw (0,0) -- (6,0) -- (6,6) -- cycle;
    % x axis ticks
    \foreach \x in {1,2,3,4,5,6} \draw (\x,-1pt) -- (\x,1pt);
    % draw points
    \foreach \coord in {(A),(B),(C),(D),(E),(F)}
    \draw \coord circle [radius=2pt];
    % draw lines
    \draw (origin) -- (A) -- (B) -- (C) -- (D) -- (E) -- (F);
    \draw (1,0) -- (A) -- +(1,0) -- (B) -- +(1,0) -- (C) -- +(1,0) --
    (D) -- +(1,0) -- (E) -- +(1,0);
    % legend 
    \draw (0.5,5.5) node[fill=pink] {}; \draw(1, 5.5) node {\(A\)};
    \draw (0.5,5) node[fill=teal] {};   \draw(1, 5) node {\(B_1\)};
    \draw (0.5,4.5) node[fill=cyan] {}; \draw(1, 4.5) node {\(B_2\)};
    \draw [decoration={brace},decorate] (1.4,5.2) -- (1.4,4.3);
    \draw(1.8, 4.75) node {\(B\)};
    % axis tick labels
    \draw (0,-7pt) node[above=-6pt] {\(0\)};
    \draw (1,-7pt) node {\(1/6\)};
    \draw (2,-7pt) node {\(2/6\)};
    \draw (3,-7pt) node {\(3/6\)};
    \draw (4,-7pt) node {\(4/6\)};
    \draw (5,-7pt) node {\(5/6\)};
    \draw (6,-7pt) node[above=-6pt] {\(1\)};
    \draw (6cm+7pt,0) node[rotate=90] {\(0\)};
    \draw (6cm+7pt,6) node[rotate=90] {\(1\)};
    % axis titles
    \draw (3,-26pt) node[align=center]
    {Cumulative portion of population\\(ordered from lowest to highest
    value)};
    \draw (6cm+22pt,3) node[align=center,rotate=90]
    {Cumulative sum of values\\divided by total};
    % text for Lorenz curve
    \draw (D)+(-7pt,7pt) node[rotate=47] {Lorenz curve};
    % text for Line of equality
    \draw (3,3)+(-7pt,7pt) node[rotate=45] {Line of equality};
  \end{tikzpicture}
  \caption{Graphical representation of the Gini coefficient based on
    areas defined by the Lorenz curve (\(n=6\)).  See
    equations~\eqref{eq:AB}, \eqref{eq:B}, \eqref{eq:lorenzsymb}
    and~\eqref{eq:lorenzsymb2}. }
  \label{fig:lorenz}
\end{figure}

The equivalence of different definitions of the Gini coefficient is
reviewed in~\cite{xu2003has}.  One of the results shown in the paper
is that the geometric definition~\eqref{eq:lorenzsymb} used by the
\texttt{gini.coef} function is equivalent to the definition based on
the relative mean difference~\eqref{eq:rmd}.  This can be
experimentally verified by comparing the results of the following R
function to those of \texttt{gini.coef}.
<<gini-rmd, echo=TRUE, tidy=FALSE, cache=FALSE>>=
## Gini index is one half of relative mean difference.
## x should not have NA values.
gini.rmd <- function(x) {
    mean(abs(outer(x, x, "-"))) / mean(x) * 0.5
}
@ 

\bibliography{math-dplR}
\bibliographystyle{abbrv}
\end{document}
