# package name marker (pre-namespace style R packaging)
.packageName <- "lsa"
### cosine.R
###
### 2005-11-21:
###   * added lazy calculation:
###     calc only below diagonal; diag = 1; add t(co)
###   * sqrt() over crossprod(x) and crossprod(y)
### 2005-11-09:
###   * bugfix cosvecs
###   * integrated cosvecs into cosine by doing type-dependent processing
### 2005-08-26:
###   * rewrote cosvecs function to crossprod
### 

cosine <- function( x, y=NULL ) {
    
    # Compute the cosine similarity either between all pairs of columns
    # of a matrix x (when y is NULL) or between the two vectors x and y.
    #
    # Matrix case: lazy calculation — only the cells below the diagonal
    # are computed, the upper triangle is obtained via t(co), and the
    # diagonal is set to 1.
    #
    # Returns a symmetric similarity matrix, or (vector case) a single
    # similarity value as a 1x1 matrix (as produced by crossprod).
    
    if ( is.matrix(x) && is.null(y) ) {
        
        co = array(0,c(ncol(x),ncol(x)))
        f = colnames( x )
        dimnames(co) = list(f,f)
        
        # guard: with a single column, 2:ncol(x) would count down
        # (2, 1) and index co[2,1] out of bounds
        if (ncol(x) >= 2) {
            for (i in 2:ncol(x)) {
                for (j in seq_len(i-1)) {
                    co[i,j] = cosine(x[,i], x[,j])
                }
            }
        }
        co = co + t(co)
        diag(co) = 1
        
        return (as.matrix(co))
        
    } else if ( is.vector(x) && is.vector(y) ) {
        # cos(theta) = <x,y> / (||x|| * ||y||)
        return ( crossprod(x,y) / sqrt( crossprod(x)*crossprod(y) ) )
    } else {
        stop("argument mismatch. Either one matrix or two vectors needed as input.")
    }
    
}
### -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
### dimcalc.r
### -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
### 
### HISTORY
### 
### 2005-11-22
###    * removed dimcalc() function, rewrote dimcalc_* to
###      be generating functions
### 2005-08-11
###    * integrated all three functions into one joint
###      generating function. dimcalc() returns a
###      function source to the caller which
###      contains only one parameter (the diagonal values)
###      to be executed by the calling (higher-level) function.
###      To call directly, use
###          e.g. dimcalc(method="share", share=0.3)(mydiags)
###          with mydiags being the diagonal values as a vector.
###      The original three functions will stay (maybe not forever).
### 2005-08-26
###    * removed slope / turning point skeletons (to be
###      included later, maybe)
### 2005-08-25
###    * replaced max() with length() in ndocs
###      now if the first factor is already > ndocs
###      dimcalc_ndocs returns 1 not -Inf
### 

dimcalc_share <- function ( share=0.5) {
    
    # Generating function: returns a function that, given the singular
    # values s, yields the position at which their cumulative
    # (normalised) sum first exceeds the given share (e.g. 0.5 = 50%
    # of the summed up singular values).
    function ( s ) {
        below = which(cumsum(s/sum(s))<=share)
        # if already the first singular value exceeds the share,
        # max(integer(0)) would yield -Inf (with a warning);
        # fall back to at least one dimension
        if (length(below) == 0) {
            return( 1 )
        }
        return( max(below) + 1 )
    }
    
}

dimcalc_kaiser <- function() {

    # Generating function: returns a function that applies the Kaiser
    # criterion to the singular values s, i.e. keeps all dimensions
    # with a singular value of at least 1.
    function ( s ) {
        kept = which(s>=1)
        # guard: without any singular value >= 1, max(integer(0))
        # would yield -Inf (with a warning); keep at least 1 dimension
        if (length(kept) == 0) {
            return( 1 )
        }
        return( max(kept) )
    }

}

dimcalc_ndocs <- function(ndocs) {
    
    # Generating function: the returned function determines the first
    # position at which the cumulated singular values exceed ndocs.
    
    if (missing(ndocs)) {
        stop("[dimcalc] - parameter ndocs is missing")
    }
    function ( s ) {
        # count positions still at or below the threshold, plus one
        sum(cumsum(s) <= ndocs) + 1
    }
}

dimcalc_raw <- function() {
    
    # For completeness: the returned function simply yields the maximum
    # possible number of dimensions, i.e. the number of singular values.
    function ( s ) length(s)
    
}

### -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
### lsa.R
### -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
### 
### 2005-11-22:
###    functions renamed to lsa(), as.textmatrix() and fold-in()
###    added warning to lsa(), when 'empty' singular values exist
###    chose NOT to replace solve() by 1/...
###    added a routine to as.textmatrix to convert matrices to textmatrices
### 2005-11-08: modified
###    design decision: weighting schemes will not be integrated.
###                     reason: happens before the LSA core operation / is optional
###    design decision: pre-processing also will stay part of textmatrix()
###                     for the same reasons
###    
### 2005-08-29: created.
### 

### lsa (textmatrix, dims) -> LSAspace($u, $v, $d)
### as.textmatrix (LSAspace) -> textmatrix
### fold-in (textmatrix, LSAspace) -> textmatrix

lsa <- function( x, dims=dimcalc_share() ) {
    
    # Create an LSA space from the textmatrix x: calculate the singular
    # value decomposition and truncate it to the requested number of
    # dimensions. 'dims' may be a fixed number or one of the dimension
    # calculation functions (see dimcalc_*).
    #
    # Returns a list of class "LSAspace" with:
    #   $tk  truncated term vectors (left singular vectors)
    #   $dk  truncated document vectors (right singular vectors)
    #   $sk  truncated singular values
    
    # do the singular value decomposition
    SVD = svd(x)
    
    # if dims is a function, then calculate the number of dims
    if (is.function(dims)) {
        dims = dims(SVD$d)
    }
    if (dims < 2) dims=2
    
    # never request more dimensions than singular values exist:
    # a larger value would cause an out-of-bounds subscript below
    if (dims > length(SVD$d)) dims = length(SVD$d)
    
    if (any(SVD$d<=sqrt(.Machine$double.eps))) {
        warning("[lsa] - there are singular values which are zero.");
    }
    
    # prepare the return value
    space = NULL
    space$tk = SVD$u[,1:dims]
    space$dk = SVD$v[,1:dims]
    space$sk = SVD$d[1:dims]
    rownames(space$tk) = rownames(x)
    rownames(space$dk) = colnames(x)
    class(space) = "LSAspace"
    
    # return the LSA space
    return ( space )
    
}

# as.textmatrix: 
# - when given an LSAspace, recalc a textmatrix of 
#   the original format, name it and return it
# - when given a normal matrix, return a textmatrix

as.textmatrix <- function (LSAspace) {
    
    # Convert either an LSA space back into a textmatrix of the original
    # format (by multiplying the truncated partial matrices), or tag a
    # plain matrix as a textmatrix (class + attached environment).
    
    if (inherits(LSAspace,"LSAspace")) {
        
        # recalculate the textmatrix from the partial matrices
        Y = LSAspace$tk %*% diag(LSAspace$sk) %*% t(LSAspace$dk)
        rownames(Y)=rownames(LSAspace$tk)
        colnames(Y)=rownames(LSAspace$dk)
        class(Y) = "textmatrix"
        environment(Y) = new.env()
        return(Y)
        
    }
    
    if (inherits(LSAspace, "matrix")) {
        
        # plain matrix: just attach class and environment
        class(LSAspace) = "textmatrix"
        environment(LSAspace) = new.env()
        return(LSAspace)
        
    }
    
    stop("[as.textmatrix] - input has to be an LSAspace (or a matrix).")
    
}

fold_in <- function( docvecs, LSAspace ) {
    
    # Fold additional document vectors into an existing LSA space:
    # map the raw document vectors into the latent space
    # (dq = t(docvecs) %*% tk %*% sk^-1) and project them back into
    # the term-by-document format of the space.
    
    Sk = diag(LSAspace$sk)
    
    # document coordinates in the latent space
    dqs = t(docvecs) %*% LSAspace$tk %*% solve(Sk)
    
    # back-projection into term space
    dtm = LSAspace$tk %*% Sk %*% t(dqs)
    
    rownames(dtm) = rownames(LSAspace$tk)
    colnames(dtm) = colnames(docvecs)
    
    environment(dtm) = new.env()
    class(dtm) = "textmatrix"
    
    return (dtm)
    
}
### -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
### query.R
### -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
### dependencies: library("RStem")
### 
### 2005-11-12: bugfix for regexp
### 2005-11-08: pseudo_dtm.R renamed to query.R
### 2005-11-08: removed pseudo_svd (now in lsa.R, renamed
###             to foldinLSAspace())
### 2005-11-08: removed pseudo_docs function (can be done
###             with textmatrix, now!)
### 2005-08-25: added "\\[|\\]|\\{|\\}" to gsub
### 

query <- function( qtext, termlist, stemming=FALSE, language="german" ) {
    
    # Create a one-column query textmatrix from a whitespace-separated
    # query string by counting how often every term of termlist occurs
    # in the query. Terms not in termlist are ignored.
    #
    # qtext: string with the query words, whitespace separated
    # termlist: list of allowed terms
    # stemming/language: optional stemming via wordStem()
    #                    (external dependency, see file header)
    
    dtm = NULL
    
    q = strsplit( gsub('[[:space:]]|[[:punct:]]+', ' ', tolower(qtext) ), " ")[[1]]
    vec = vector( mode="numeric", length(termlist) )
    for ( word in q ) {
        if (stemming) word = wordStem(word, language=language)
        if (word != "") {
            pos = match(word, termlist)
            # bugfix: a word not contained in termlist yields NA from
            # match() and would break the subscripted assignment
            # ("NAs are not allowed in subscripted assignments");
            # such words are skipped instead
            if (!is.na(pos)) {
                vec[pos] = vec[pos] + 1
            }
        }
    }
    
    dtm = as.matrix(vec)
    colnames(dtm) = toupper(qtext)
    rownames(dtm) = termlist
    
    environment(dtm) = new.env()
    class(dtm) = "textmatrix"
    
    return ( dtm )
    
}

### -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
### textmatrix
### -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
### dependencies: library("RStem")
### 
### 2005-11-22: chose NOT to integrate separator lines (would splash the handling!)
###             changed summary.textmatrix from matrix to vector output
### 2005-11-11: integrated the vocabulary order/sort functions...
### 2005-11-08: added print and summary functions
### 2005-11-08: added vocabulary filter to both functions
### 2005-10-04: added nchar(..., type="chars") to count characters, not bytes
### 2005-08-25: added "\\[|\\]|\\{|\\}" to gsub
### 2005-08-26: renamed dt_triples to textvector and dt_matrix to textmatrix

# Read one document file and convert it into a term-frequency vector,
# returned as a data.frame of (docs, terms, Freq) triples, suitable
# for cross-tabulation in textmatrix().
#
#   file          - path of the text file to read (via scan())
#   stemming      - if TRUE, terms are stemmed with wordStem()
#                   (external dependency, see file header: RStem)
#   language      - language passed on to wordStem()
#   minWordLength - terms with fewer characters are dropped
#   minDocFreq    - terms occurring less often than this are dropped
#   stopwords     - optional vector of terms to be removed
#   vocabulary    - optional vector of allowed terms (others removed)
textvector <- function (file, stemming=FALSE, language="german", minWordLength=2, minDocFreq=1, stopwords=NULL, stopwords=NULL, vocabulary=NULL) {
    
    txt = scan(file, what = "character", quiet = TRUE)
    # replace punctuation and some typographic special characters
    # (including a few non-ASCII ones) by blanks
    txt = gsub( "\\.|:|\\(|\\)|\\[|\\]|\\{|\\}|,|;|\\?|-|\\!|\"|\'|\`|\\^|\=|\’|\–|\„|\”|\/", " ", txt)
    # collapse runs of whitespace into single blanks
    txt = gsub("[[:space:]]+", " ", txt)
    txt = tolower(txt)
    txt = unlist(strsplit(txt, " ", fixed=TRUE))
    
    # stopword filtering?
    if (!is.null(stopwords)) txt = txt[!txt %in% stopwords]
    
    # vocabulary filtering?
    if (!is.null(vocabulary)) txt = txt[txt %in% vocabulary]
    
    # tabulate
    tab = sort(table(txt), decreasing = TRUE)
    
    # with threshold minDocFreq
    tab = tab[tab >= minDocFreq]
    
    # wordLength filtering?
    # (type="chars" counts characters, not bytes — see file history)
    tab = tab[nchar(names(tab), type="chars") >= minWordLength]
    
    # stemming?
    # NOTE(review): stemming happens after tabulation, so terms that
    # stem to the same form keep separate rows here
    if (stemming) names(tab) = wordStem(names(tab), language=language)
    
    return( data.frame( docs=basename(file), terms = names(tab), Freq = tab, row.names = NULL) )
    
}

textmatrix <- function( mydir, stemming=FALSE, language="german", minWordLength=2, minDocFreq=1, stopwords=NULL, vocabulary=NULL ) {
    
    # Build a term-by-document matrix over all files in mydir by
    # applying textvector() to every file and cross-tabulating the
    # resulting (docs, terms, Freq) triples. All filtering parameters
    # are passed through to textvector().
    
    triples = lapply( dir(mydir, full.names=TRUE), textvector, stemming, language, minWordLength, minDocFreq, stopwords, vocabulary)
    dtm = t(xtabs(Freq ~ ., data = do.call("rbind", triples)))
    
    if (!is.null(vocabulary)) {
        # expand to the full given vocabulary: rows for terms that
        # never occurred stay zero, all others are copied over
        expanded = matrix(0, nrow=length(vocabulary), ncol=ncol(dtm))
        rownames(expanded) = vocabulary
        expanded[rownames(dtm),] = dtm[rownames(dtm),]
        colnames(expanded) = colnames(dtm)
        dtm = expanded
        gc()
    }
    
    environment(dtm) = new.env()
    class(dtm) = "textmatrix"
    
    return ( dtm )
    
}

# Print a textmatrix: small matrices are printed completely; larger
# ones are condensed into a 3x3 arrangement of sub-blocks
# (top/middle/bottom rows x left/middle/right columns) with numbered
# row labels and a legend mapping shortened column labels ("D1", ...)
# to the full document names.
#
#   x         - the textmatrix to print
#   bag_lines - number of rows per sub-block
#   bag_cols  - number of columns per sub-block
#   ...       - unused; present for print() generic compatibility
print.textmatrix <- function ( x, bag_lines = 12, bag_cols = 10, ... ) {
    
    nc = ncol(x);
    nr = nrow(x);    
    
    # small enough to be shown completely?
    if (nc <= (3*bag_cols) && nr <= (3*bag_lines)) {
        
        # strip the textmatrix attributes so that print.default()
        # shows a plain matrix
        y = x;
        attr(y,"class") = NULL;
        attr(y,"call") = NULL;
        environment(y) = NULL;
        print.default(y);
        invisible(x);
        
    } else {
        
        # condensed view: sample bag_lines x bag_cols blocks from the
        # top/middle/bottom rows and left/middle/right columns
        redx = matrix(ncol = (3*bag_cols), nrow = (3*bag_lines));
        mid = round(nrow(x)/2)
        midc = round(ncol(x)/2)
        
        # top
        redx[1:bag_lines, 1:bag_cols] = x[1:bag_lines, 1:bag_cols]
        redx[1:bag_lines, (bag_cols+1):(bag_cols+bag_cols)] = x[1:bag_lines, midc:(midc+bag_cols-1)]
        redx[1:bag_lines, (2*bag_cols+1):(3*bag_cols)] = x[1:bag_lines, (ncol(x)-bag_cols+1):ncol(x)]
        
        # mid
        redx[(bag_lines+1):(bag_lines*2), 1:bag_cols] = x[mid:(mid+bag_lines-1), 1:bag_cols]
        redx[(bag_lines+1):(bag_lines*2), (bag_cols+1):(bag_cols+bag_cols)] = x[mid:(mid+bag_lines-1), midc:(midc+bag_cols-1)]
        redx[(bag_lines+1):(bag_lines*2), (2*bag_cols+1):(3*bag_cols)] = x[mid:(mid+bag_lines-1), (ncol(x)-bag_cols+1):ncol(x)]
        
        # bottom
        redx[(bag_lines*2+1):(bag_lines*3), 1:bag_cols] = x[(nrow(x)-bag_lines+1):nrow(x), 1:bag_cols]
        redx[(bag_lines*2+1):(bag_lines*3), (bag_cols+1):(bag_cols+bag_cols)] = x[(nrow(x)-bag_lines+1):nrow(x), midc:(midc+bag_cols-1)]
        redx[(bag_lines*2+1):(bag_lines*3), (2*bag_cols+1):(3*bag_cols)] = x[(nrow(x)-bag_lines+1):nrow(x), (ncol(x)-bag_cols+1):ncol(x)]
                
        # dimnames: rows keep "position. term", columns get short
        # "D<position>" labels explained in the legend below
        rownames(redx) = c( paste(1:bag_lines,rownames(x)[1:bag_lines],sep=". "), paste(mid:(mid+bag_lines-1),rownames(x)[(mid):(mid+bag_lines-1)],sep=". "), paste((nrow(x)-bag_lines+1):nrow(x), rownames(x)[(nrow(x)-bag_lines+1):nrow(x)], sep=". "))
        colnames(redx) = paste("D", c( 1:bag_cols, midc:(midc+bag_cols-1), (ncol(x)-bag_cols+1):ncol(x) ), sep="")
        docnames = paste( colnames(redx), c( colnames(x)[1:bag_cols], colnames(x)[midc:(midc+bag_cols-1)], colnames(x)[(ncol(x)-bag_cols+1):ncol(x)] ), sep=" = ")
        
        ret = NULL
        ret$matrix = round(redx,2);
        ret$legend = docnames;
        
        print(ret);
        invisible(x);
        
    }
    
}

summary.textmatrix <- function ( object, ... ) {
    
    # Compact overview of a textmatrix: vocabulary size, number of
    # documents, number of non-zero frequencies, longest term length
    # and the number of terms containing non-alphanumeric characters
    # (German umlauts and sharp s count as alphanumeric).
    
    stats = as.numeric(c(
        length(rownames(object)),
        length(colnames(object)),
        length(which(object>0)),
        max(nchar(rownames(object),type="chars")),
        length(which(gsub("[[:alnum:]]|[ÄÖÜäöüß]", "", rownames(object)) != ""))
    ))
    names(stats) = c("vocabulary", "documents", "freqs not '0'",
                     "max term length", "non-alphanumerics in terms")
    class(stats) = "summary.textmatrix"
    stats
    
}

### -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
### triples.r v0.2
### -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
### 
### 2005-11-22:
###   * changed setTriple warning() to stop(), added more text
### 2005-11-08:
###   * tried to add with(environment(M), { ... })
###     to getTriple, setTriple and delTriple (does not work)
###   * added error handling to setTriple (produces warning
###     if no environment exists (by checking the class,
###     throws an error if input parameters are wrong)
### 
### 2005-08-26:
###   * added garbage collection to delTriple
###   * removed useless create environment from setTriple

# -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
# convert input subject (column names or 
# column positions) to column position
# -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
getSubjectId <- function(M, subject) {
    # Map a subject given as column name(s) or column position(s) of M
    # to column position(s). Unknown names yield integer(0); positions
    # beyond ncol(M) yield NULL.
    if (is.character(subject)) {
        return( which(colnames(M) %in% subject) );
    }
    if (is.numeric(subject) && !any(subject>ncol(M)) ) {
        return( subject );
    }
    return( NULL );
}

# -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
# getTriple: return the value (=object) of the 
# requested triple(s) from the environment variables 
# "triples$S/P/O" the given matrix M. Leave out
# predicate to get all triples of the specified 
# subject. Leave out subject, to get all triples.
# -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
getTriple <- function(M, subject, predicate) {
    
    # Triples are stored as three parallel factors "triples$S/P/O"
    # inside the environment attached to the textmatrix M.
    if ( exists("triples$S",envir=environment(M)) ) {
        
        if ( ! missing(subject) ) {
            
            # positions of all triples belonging to this subject
            spos = which( get("triples$S", envir=environment(M)) == getSubjectId(M,subject) );
            
            if ( ! missing(predicate)) {
                # subject + predicate given: return only the objects
                ppos = which( get("triples$P", envir=environment(M))[spos] == tolower(predicate) );
                objects = as.vector(get("triples$O", envir=environment(M))[spos][ppos]);
            } else {
                # only subject given: return its predicates and objects
                objects = list( as.vector(get("triples$P", envir=environment(M))[spos]), as.vector(get("triples$O", envir=environment(M))[spos]) );
            }
            
        } else {
            # no subject given: return all stored triples (S, P, O)
            if ( length(get("triples$S",envir=environment(M))) == 0) {
                return( NULL );
            } else {
                return ( list( as.vector(get("triples$S", envir=environment(M))), as.vector(get("triples$P", envir=environment(M))), as.vector(get("triples$O", envir=environment(M))) ) );
            }
        }
        
        # empty results are normalised to NULL
        if ( length(objects)==0 ) {
            return( NULL );
        } else return( objects );
            
    } else return( NULL ); # if no triples exist -> return NULL
    
}

# -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
# setTriple: enter a new triple into the environment 
# variables "triples$S/P/O" of the given matrix M.
# -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
setTriple <- function(M, subject, predicate, object) {
    
    # Store triple(s) (subject, predicate, object) in the parallel
    # factors "triples$S/P/O" inside the environment attached to M.
    # Subjects are stored as column ids (see getSubjectId()),
    # predicates lower-cased.
    
    # be aware: the environment must already be
    # created outside this function! If you use
    # your own object (not created by textmatrix, 
    # please do so by:
    #    environment(M) = new.env();
    #    class(M) = "textmatrix";
    
    if ( ! inherits(M, "textmatrix") ) {
        stop("[setTriple] - You are using a matrix which has not been generated with the \nlsa package. Therefore, you have to manually add an environment to the \nmatrix you use and set the class to 'textmatrix' to be able to save triples.\nAlternatively, you can use as.textmatrix() to convert a matrix\nto a textmatrix (be aware that it temporarilty\nneeds twice the amount of memory of the input matrix).\n");
    }
    
    # if input is vectors, check if they 
    # have the same number of elements (else break)
    if (length(subject) != length(predicate) || length(predicate)!=length(object) || length(subject)!=length(object) ) {
        stop("[setTriple] - Input vectors are not of the same length!");
    }
    
    if ( ! exists("triples$S",envir=environment(M)) ) {
        
        # if not yet existing, add 'triples$S/P/O' to 
        # environment of M and insert first triple
        assign("triples$S", factor(getSubjectId(M,subject)), envir=environment(M));
        assign("triples$P", factor(tolower(predicate)), envir=environment(M));
        assign("triples$O", factor(object), envir=environment(M));
        
    } else {
        
        # duplicate check: append only when none of the given objects
        # is already stored for this (subject, predicate) combination
        # NOTE(review): with vector input this skips the whole batch
        # as soon as one object already exists — confirm intended
        if ( !any( is.na( match(getTriple(M, subject, predicate), object)) == FALSE)  ) {
            # triple does not exist, so append:
            # extend the factor levels first, then append the values
            striples = get("triples$S", envir=environment(M));
            levels(striples) = unique(c(levels(striples), getSubjectId(M,subject)));
            striples[(length(striples)+1):(length(striples)+length(subject))] = getSubjectId(M,subject);
            assign("triples$S", striples, envir=environment(M));
            
            ptriples = get("triples$P", envir=environment(M));
            levels(ptriples) = unique(c(levels(ptriples), tolower(predicate)));
            ptriples[(length(ptriples)+1):(length(ptriples)+length(predicate))] = tolower(predicate);
            assign("triples$P", ptriples, envir=environment(M));
            
            otriples = get("triples$O", envir=environment(M));
            levels(otriples) = unique(c(levels(otriples), object));
            otriples[(length(otriples)+1):(length(otriples)+length(object))] = object;
            assign("triples$O", otriples, envir=environment(M));
        }
        
    } # insert triple(s)
    
} # # setTriple

# -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
# retractTriple: remove specific triple(s) from
# environment of M. Currently not very memory sensitive ;)
# -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
delTriple <- function(M, subject, predicate, object) {
    
    # Remove the triple(s) matching (subject, predicate, object) from
    # the parallel factors "triples$S/P/O" in the environment of M.
    # Unused factor levels are not cleaned up.
    
    # find position: spos are global indices of the subject's triples;
    # ppos indexes into spos, opos into ppos — so the chain below maps
    # the matches back to global positions
    spos = which( get("triples$S",envir=environment(M)) == getSubjectId(M,subject) )
    ppos = which( get("triples$P",envir=environment(M))[spos] == tolower(predicate) )
    opos = which( get("triples$O",envir=environment(M))[spos][ppos] == object )
    origppos = ppos[opos]
    origspos = spos[origppos]
    
    # retract: drop the matched positions from all three factors
    assign("triples$S", get("triples$S",envir=environment(M))[-origspos], envir=environment(M))
    assign("triples$P", get("triples$P",envir=environment(M))[-origspos], envir=environment(M))
    assign("triples$O", get("triples$O", envir=environment(M))[-origspos], envir=environment(M))
    
    # garbage collection
    gc()
        
} # delTriple


### -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
### weightings.r
### -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  

# -  -  -  -  -  -  -  -  -  -  -  -  
# local weightings

# what the hell ;)
lw_tf <- function(m) {
    # identity local weighting: the raw term frequencies
    m
}

# log'ed termfrequency
lw_logtf <- function(m) {
    # dampened local weighting: logarithm of term frequency plus one
    log(m + 1)
}

# binary termfrequency
lw_bintf <- function(m) {
    # binary local weighting: 1 where a term occurs, 0 elsewhere
    (m > 0) * 1
}

# -  -  -  -  -  -   -  -  -  -  -  -  
# global weightings: 
# Dumais (1992), same in Nakov (2001)

# normalisation
gw_normalisation <- function(m) {
    # global weighting: normalisation factor 1 / euclidean row norm
    row_norms = sqrt( rowSums((m*m), na.rm = TRUE) )
    1 / row_norms
}

# inverse document frequency
# from Dumais (1992), Nakov (2001) uses log not log2
gw_idf <- function(m) {
    # inverse document frequency, Dumais (1992) variant using log2
    # (Nakov (2001) uses the natural log);
    # df = number of documents a term occurs in
    df = rowSums((m > 0) * 1, na.rm=TRUE)
    log2(ncol(m) / df) + 1
}

# global frequency * inverse document frequency
# from Nakov (2001)
gw_gfidf <- function(m) {
    # global frequency divided by document frequency, Nakov (2001)
    gf = rowSums(m, na.rm = TRUE)
    df = rowSums((m > 0) * 1, na.rm=TRUE)
    gf / df
}

# real entropy from Shannon (1948)
entropy <- function (m) {
    # Shannon (1948) entropy of every term over the documents,
    # normalised by log(ndocs).
    gf = rowSums(m, na.rm = TRUE)
    # dividing by gf recycles it down the columns, i.e. each element
    # (i, j) is divided by gf[i]
    p = m / gf
    ndocs = ncol(m)
    # exception: iff p=0, 0*log(0) is treated as 0 — NaN terms are
    # dropped via rowSums(..., na.rm = TRUE)
    - rowSums( (p*log(p)) / log(ndocs), na.rm = TRUE )
}

# entropy as in Dumais(1992), Nakov(2001):
# global weighting = 1 + entropy
gw_entropy <- function(m) {
    # global entropy weight as in Dumais (1992), Nakov (2001):
    # one plus the normalised entropy
    1 + entropy(m)
}

