pbat                  package:pbatR                  R Documentation

_P_B_A_T _G_r_a_p_h_i_c_a_l _a_n_d _C_o_m_m_a_n_d _L_i_n_e _I_n_t_e_r_f_a_c_e

_D_e_s_c_r_i_p_t_i_o_n:

     The following routines are for the graphical and command line pbat
     interface.  The command line interfaces are listed in an order of
     suggested usage.  Most users of the command line will only want to
     use 'pbat.m'.

     'pbat' runs a GUI (Graphical User Interface) for pbat.

     'pbat.last' returns an object of class 'pbat' of the last command
     file run from running 'pbat()'. Note this is also returned from
     'pbat'. However, this command is provided because rerunning a
     command in pbat can be a very time-consuming process.

     'pbat.last.rawResults' prints out the raw text file of the output
     (particularly useful in the unexpected event that the output of
     pbat could not be parsed correctly). This should work even with
     the new option of not loading the output in.

     'pbat.m' runs pbat according to an expression, from 'phe' class
     (phenotype information), 'ped' class (pedigree information), and
     various options.

     'pbat.obj' runs pbat with a 'ped' class object (pedigree
     information), a `phe' class object (phenotype information), and
     various other options.

     'pbat.files' runs pbat according to a set of filenames and
     commands.

     'pbat.create.commandfile' creates a command file for Christoph
     Lange's pbat software with respect to two files on disk (.phe,
     .ped).

     Some options are only available for the respective pbat-gee (G),
     pbat-pc (P), pbat-logrank (L). If a parameter is `R'equired for a
     specific version, it will be denoted, for example, by (G-R).

_U_s_a_g_e:

     pbat()

     pbat.last()

     pbat.last.rawResults()

     pbat.m( formula, phe, ped, fbat="",
             max.pheno=1, min.pheno=1,
             null="no linkage, no association", alpha=0.05,
             trans.pheno="none", trans.pred="none", trans.inter="none",
             scan.pred="all", scan.inter="all",
             scan.genetic="additive",
             offset="gee",
             screening="conditional power", distribution="default",
             logfile="",
             max.gee=1,
             max.ped=14, min.info=20,
             incl.ambhaplos=TRUE, infer.mis.snp=FALSE,
             sub.haplos=FALSE, length.haplos=2, adj.snps=TRUE,
             overall.haplo=FALSE, cutoff.haplo=FALSE,
             output="normal",
             max.mating.types=10000,
             commandfile="",
             future.expansion=NULL,
             LOAD.OUTPUT=TRUE,
             monte=0,
             mminsnps=NULL, mmaxsnps=NULL,
             mminphenos=NULL, mmaxphenos=NULL,
             env.cor.adjust=FALSE,
             gwa=FALSE,
             snppedfile=FALSE,
             extended.pedigree.snp.fix=FALSE )
      
     pbat.obj( phe, ped, file.prefix, phenos="", offset="gee", LOAD.OUTPUT=TRUE, ... )

     pbat.files( pedfile, phefile, fbat="gee",
                 commandfile="",
                 logrank.outfile="",
                 preds="", preds.order="",
                 max.pheno=1,
                 LOAD.OUTPUT=TRUE,
                 ... )

     pbat.create.commandfile( pedfile, phefile="",
            snps="",
            phenos="", time="", # (only one of 'phenos' and 'time' can be set)
            preds="", preds.order="",
            inters="",
            groups.var="", groups="",
            fbat="gee",
            censor="",
            max.pheno=1, min.pheno=1,
            null="no linkage, no association", alpha=0.05,
            trans.pheno="none", trans.pred="none", trans.inter="none",
            scan.pred="all", scan.inter="all",
            scan.genetic="additive",
            offset="gee",
            screening="conditional power", distribution="default",
            logfile="",
            max.gee=1,
            max.ped=7, min.info=20,
            haplos=NULL, incl.ambhaplos=TRUE, infer.mis.snp=FALSE,
            sub.haplos=FALSE, length.haplos=2, adj.snps=TRUE,
            overall.haplo=FALSE, cutoff.haplo=FALSE,
            output="normal",
            max.mating.types=10000,
            commandfile="",
            future.expansion=NULL,
            LOGFILE.OVERRIDE=TRUE,
            monte=0,
            mminsnps=NULL, mmaxsnps=NULL,
            mminphenos=NULL, mmaxphenos=NULL,
            env.cor.adjust=FALSE,
            gwa=FALSE,
            snppedfile=FALSE,
            extended.pedigree.snp.fix=FALSE )

_A_r_g_u_m_e_n_t_s:

 formula: Symbolic expression describing what should be processed.  See
          `examples' for more information.

     phe: `phe' object as described in 'write.phe'. If you do not have
          a phe file set this to NULL (i.e. when you are only using
          AffectionStatus from the pedigree).

     ped: `ped' object as described in 'write.ped'.

file.prefix: Prefix of the output datafile (phe & ped must match)

 pedfile: Name of the pedigree file (.ped) in PBAT-format (extension
          `.ped' is optional).

 phefile: Name of the phenotype file (.phe) in PBAT-format. The default
          assumes the same prefix as that in 'pedfile'. Leave empty or
          set to the empty string "" if you do not have a phenotype
          file (i.e. you are only using AffectionStatus). In the case
          of no phenotype file, one must be created; it will be in
          'empty_phe.phe', and requires loading in the pedigree file
          into R.

     ...: Options in higher level functions to be passed to
          'pbat.create.commandfile'.

    fbat: Selects the fbat statistic used in the data analysis.

          '"gee"' = The FBAT-GEE statistic simplifies to the standard
          univariate FBAT-statistic. If several phenotypes are
          selected, all phenotypes are tested simultaneously, using
          FBAT-GEE. The FBAT-GEE statistic can handle any type of
          multivariate data.

          '"pc"' = FBAT extension for longitudinal phenotypes and
          repeated measurements.

          '"logrank"' = FBAT-extensions of the classical LOGRANK and
          WILCOXON tests for time-to-onset data. Kaplan-Meier plots for
          the analyzed data set will be generated and plotted. 

max.pheno: (G,P) The maximum number of phenotypes that will be analyzed
          in the FBAT-statistic.

min.pheno: (G,P) The minimum number of phenotypes that will be analyzed
          in the FBAT-statistic.

    null: Specification of the null-hypothesis.

          '"no linkage, no association"' = Null-hypothesis of no
          linkage and no association.

          '"linkage, no association"' = Null-hypothesis of linkage, but
          no association. 

   alpha: Specification of the significance level.

trans.pheno: Transformation of the selected phenotypes.

          '"none"' = no transformation

          '"ranks"' = transformation to ranks

          '"normal score"' = transformation to normal score

          The default choice is '"none"', although it is recommended to
          use transformation to normal scores for quantitative
          phenotypes. 

trans.pred: Transformation of the selected predictor
          variables/covariates:

          '"none"' = no transformation

          '"ranks"' = transformation to ranks

          '"normal score"' = transformation to normal score

          The default choice is '"none"', although it is recommended to
          use transformation to normal scores for quantitative
          covariates. 

trans.inter: Transformation of the selected interaction variables

          '"none"' = no transformation

          '"ranks"' = transformation to ranks

          '"normal score"' = transformation to normal score

          The default choice is '"none"', although it is recommended to
          use transformation to normal scores for quantitative
          interaction variables. 

scan.pred: (G,P) Computation of all covariate sub-models:

          '"all"' = The selected FBAT statistic is computed with
          adjustment for all selected covariates/predictors.

          '"subsets"' = The selected FBAT statistic is computed for all
          possible subsets of the selected covariates/predictor
          variables.  The command is particularly useful to examine the
          dependence of significant results on the selection of a
          covariate model. 

scan.inter: (G,P) Computation of all interaction sub-models:

          '"all"' = The selected FBAT statistic is computed including
          all selected interaction variables.

          '"subsets"' = The selected FBAT statistic is computed for all
          possible subsets of the interaction variables. 

scan.genetic: Specification of the mode of inheritance:

          '"additive"' = Additive model

          '"dominant"' = Dominant model

          '"recessive"' = Recessive model

          '"heterozygous advantage"' = Heterozygous advantage model

          '"all"' = The FBAT-statistics are computed for all 4 genetic
          models 

  offset: Specification of the covariate/predictor variables
          adjustment:

          '"none"' = No adjustments for covariates/predictor variables.
          You need to select this for dichotomous traits.

          '"max power"' = Offset (=FBAT adjustment for covariates and
          interaction variables) that maximizes the power of the
          FBAT-statistic (computationally slow, efficiency dependent on
          the correct choice of the mode of inheritance)

          '"gee + marker score"' = Offset (=FBAT adjustment for
          covariates and interaction variables) based on standard
          phenotypic residuals obtained by GEE-estimation including the
          expected marker score (E(X|H0)), all covariates and
          interaction variables.

          '"gee"' = Offset (=FBAT adjustment for covariates and
          interaction variables) based on standard phenotypic residuals
          obtained by GEE-estimation including all covariates and
          interaction variables. (default - most of the time, with the
          exception of selecting from the gui interface (not the
          command line) AffectionStatus)

          '(numeric value)' = This only works for AffectionStatus; set
          a numeric value (i.e. `0.13' without the `' marks) to this. 

screening: Specification of the screening methods to handle the
          multiple comparison problem for multiple SNPs/haplotypes and
          a set of phenotypes.

          '"conditional power"' = Screening based on conditional power
          (parametric approach)

          '"wald"' = Screening based on Wald-tests (non-parametric
          approach) 

distribution: Screening specification of the empirical phenotypic
          distribution

          '"default"'

          '"jiang"' = Approach by Jiang et al (2006)

          '"murphy"' = Approach by Murphy et al (2006)

          '"naive"' = Naive allele freq estimator

          '"observed"' = Observed allele frequencies 

 logfile: Specification of the log-file. By default, PBAT selects a
          unique file-name for the log-file, i.e. "pbatlog...".

 max.gee: (G) Specification of the maximal number of iterations in the
          GEE-estimation procedure.

 max.ped: Specification of the maximal number of probands in one
          extended pedigree. 

min.info: Specification of the minimum number of informative families
          required for the computation of the FBAT-statistics.

incl.ambhaplos: This command defines the handling of ambiguous
          haplotypes in the haplotypes analysis. Choices: 

          'TRUE' = Ambiguous haplotypes (phase can not be inferred) are
          included in the analysis and are weighted according to their
          estimated frequencies in the probands.

          'FALSE' = Ambiguous haplotypes are excluded from the
          analysis. 

infer.mis.snp: Handling of missing genotype information in the
          haplotypes analysis.

          'FALSE' = Individuals with missing genotype information are
          excluded from the analysis. This is the analysis also
          implemented in the HBAT option of the FBAT-program.

          'TRUE' = Individuals with missing genotype information are
          included in the analysis. The algorithm of Horvath et al
          (2004) is applied to all individuals, even if they have
          missing genotype information. This results in more ambiguous
          haplotypes. 

sub.haplos: 'FALSE' = The haplotypes defined by all the SNPs given in
          the haplotype-block definition are analyzed.

          'TRUE' = All haplotypes are analyzed that are defined by any
          subset of SNPs in the haplotypes block definition. 

length.haplos: Defines the haplotype length when sub.haplos='TRUE'.

adj.snps: Takes effect when sub.haplos='TRUE'.

          'FALSE' = All sub-haplotypes are analyzed

          'TRUE' = Only the sub-haplotypes are analyzed for which the
          first constituting SNPs are adjacent. 

overall.haplo: Specification of an overall haplotypes test. When this
          command is included in the batch-file, only one level of the
          '"groups"' variable can be specified.

          'FALSE' = no overall test

          'TRUE' = an overall test is computed testing all haplotypes
          defined by the same set of SNPs simultaneously. This option
          can not be applied when sub.haplos='TRUE'. 

cutoff.haplo: The minimum haplotype frequency so that a haplotype is
          included in the overall test.

  output: '"normal"' = Normal PBAT output.

          '"short"' = Shorter output.  This is mostly for use in
          conjunction with 'gwa', where there is a lot of output.

          '"detailed"' = Detailed output for each family is created. 

max.mating.types: Maximal number of mating types in the haplotype
          analysis.

commandfile: Name of the temporary command file that will be created to
          send to the pbat.  It is suggested to leave this blank, and
          an appropriate name will be chosen with a time stamp.

future.expansion: (Only included for future expansion of pbat.) A
          vector of strings for extra lines to write to the batchfile
          for pbat. 

logrank.outfile: (L) Name of the file to store the R source code to
          generate the plots for logrank analysis.

    snps: Vector of strings for the SNPs to process. Default processes
          all of the SNPs.

  phenos: (G,P) Vector of strings for the phenotypes/traits for the
          analysis.  If none are specified, then all are analyzed. 
          (Note: this _must be left empty for logrank analysis_;
          instead specify the time to onset with the time variable.) 

    time: (L-R) Time to onset variable.  `phenos' cannot be specified
          when this is used, but it _must be set for logrank_.

   preds: Vector of strings for the covariates for the test statistic.

preds.order: Vector of integers indicating the order of 'preds' - the
          order for the vector of covariates for the test statistic.

  inters: Vector of strings for the interaction variables.

groups.var: String for the grouping variable.

  groups: Vector of strings corresponding to the groups of the grouping
          variable ('groups.var').

  censor: (L-R) String of the censoring variables. In the corresponding
          data, this variable has to be binary.

  haplos: List of string vectors representing the haplotype blocks for
          the haplotype analysis. For example, 'list(
          block1=c("m1","m2"), block2=c("m3","m4") )' defines 2 
          haplotype-blocks where the first block is defined by SNPs m1
          and m2, and the second by SNPs m3 and m4. 

LOGFILE.OVERRIDE: When using the 'sym' option in read.ped and read.phe,
          when this is set to TRUE (default), the PBAT logfile is put
          in the current working directory; if FALSE, then it is put in
          the same directory as the datafile.

LOAD.OUTPUT: When TRUE, loads the output into R (generally
          recommended). When FALSE, it leaves it in the output left
          from PBAT (in case output is too large to load into memory).

   monte: When this is nonzero, monte-carlo based methods are used to
          compute the p-values instead, according to the number of
          iterations supplied. 1000 iterations is suggested.

mminsnps: Multi-marker multi-phenotype tests: the minimum number of
          snps to be tested.

mmaxsnps: Multi-marker multi-phenotype tests: the maximum number of
          snps to be tested.

mminphenos: Multi-marker multi-phenotype tests: the minimum number of
          phenotypes to be tested.

mmaxphenos: Multi-marker multi-phenotype tests: the maximum number of
          phenotypes to be tested.

env.cor.adjust: Whether to adjust for environmental correlation.

     gwa: Whether to use (g)enome (w)ide (a)cceleration mode.  This is
          faster for genome-wide association tests, and has slightly
          less output.

snppedfile: Whether the pedigree file contains just snps. When this is
          true, it employs a more optimal storage technique and uses
          much less memory. It is especially advantageous for
          genome-wide studies.

extended.pedigree.snp.fix: Set to TRUE when you are using a dataset
          with large extended pedigrees. This will not work with any
          mode but `single' mode currently [see pbat.set(...)]. This is
          also sometimes necessary for multi-allelic markers (i.e. not
          binary markers). 

_D_e_t_a_i_l_s:

     These commands require `pbatdata.txt' to be in the working
     directory; if not found, the program will attempt to (1) copy the
     file from the directory where pbat is, (2) copy it from anywhere
     in the path, or (3) complain and die.

     Linux warning: the file `pbatdata.txt' appears not to have shipped
     with the current (as of writing this) linux version; to fix this
     just download the windows version as well and copy the file from
     there to the same directory as pbat.

     It is recommended to set 'LOAD.OUTPUT' to 'FALSE' when dealing
     with very large numbers of SNPs.

     These commands will also generate a lot of output files in the
     current working directory when interfacing with pbat.  These files
     will be time-stamped so concurrent analysis in the same directory
     can be run. _Race condition_: if two logrank analyses finish at
     _exactly the same time_, then the plots for one might be lost
     and/or get linked to the wrong analysis.  This should be a rather
     rare occurrence, and is an unpreventable result of pbat always
     sending this output to only one filename.  Workaround to race
     condition: create another directory and use that as your current
     working directory instead.

     Note that multi-marker / multi-phenotype mode is not supported in
     parallel at this time, so if you are having problems try running
     the command 'pbat.setmode("single")', or setting it to single from
     the graphical interface before running these tests.

     WARNING: Note the 'extended.pedigree.snp.fix' option, which is
     important for getting more accurate results in very extended
     pedigrees.  It uses a slower but more accurate pedigree
     reconstruction method.

_V_a_l_u_e:

     `pbat', `pbat.last', `pbat.m', `pbat.obj', and `pbat.files' return
     an object of class 'pbat'.  Methods supported by  this include
     'plot(...)', 'summary(...)', and 'print(...)'.  Follow the first
     three links in the 'see also' section of this file for more
     details.

_R_e_f_e_r_e_n_c_e_s:

     This was taken with only slight modification to accommodate the
     interface from Christoph Lange's description of the commands for
     the pbat program, (which was available with the software at the
     time of this writing), available at the PBAT webpage: <URL:
     http://www.biostat.harvard.edu/~clange/default.htm>

     P2BAT webpage: <URL:
     http://www.people.fas.harvard.edu/~tjhoffm/pbatR.html>

     FBAT webpage (lists a lot of references in relation to both of
     these programs): <URL: http://biosun1.harvard.edu/~fbat/fbat.htm>

     More pbat references:

     Hoffmann, T. and Lange, C. (2006) P2BAT: a massive parallel
     implementation of PBAT for genome-wide association studies in R.
     Bioinformatics. Dec 15;22(24):3103-5.

     Jiang, H., et al. (2006) Family-based association test for
     time-to-onset data with time-dependent differences between the
     hazard functions. Genet. Epidemiol, 30, 124-132.

     Laird, N.M. and Lange, C. (2006) Family-based designs in the age
     of large-scale gene-association studies. Nat. Rev. Genet, 7.

     Lange, C., et al. (2003) Using the noninformative families in
     family-based association tests: a powerful new testing strategy.
     Am. J. Hum. Genet, 73, 801-811.

     Lange, C., et al. (2004a) A family-based association test for
     repeatedly measured quantitative traits adjusting for unknown
     environmental and/or polygenic effects. Stat. Appl. Genet. Mol.
     Biol, 3.

     Lange, C., et al. (2004b) Family-based association tests for
     survival and times-to-onset analysis. Stat. Med, 23, 179-189.

     Van Steen, K., et al. (2005) Genomic screening and replication
     using the same data set in family-based association testing. Nat.
     Genet, 37, 683-691.

_S_e_e _A_l_s_o:

     'summary.pbat', 'plot.pbat', 'print.pbat',

     'as.ped', 'as.pedlist', 'read.ped'

     'as.phe', 'read.phe',

     'top'

_E_x_a_m_p_l_e_s:

     ##########################
     ## pbat.m(...) examples ##
     ##########################

     ## Not run: 

     ## Note, when you run the example (or anything else) you will generally
     ##  get a warning message that the column headers were guessed.
     ## This means they were guessed, and while I've tried to catch most
     ##  cases, the warning stands for ones I might have missed.

     ## These cannot be run verbatim, and are just meant to be examples.

     ##############################
     ## Further formula examples ##
     ##############################

     # load in the data
     # Here we assume that:
     #  data.phe contains 'preds1', 'preds2', 'preds3', 'time',
     #                     'censor', 'phenos1', ... 'phenos4'
     #  data.ped contains 'snp1', 'snp2', 'snp3',
     #                     'block1snp1','block1snp2',
     #                     'block2snp1','block2snp2'
     data.phe <- read.phe( "data" )
     data.ped <- read.ped( "data" )

     # This model does just the affection status (always given as
     #  AffectionStatus) as the phenotype, no predictor covariates, and all
     #  the snps for a snps analysis.
     # Since affection status is dichotomous, we additionally set
     #  distribution='categorical'
     #  offset='none'
     # NONE is a special keyword to indicate none, and can be only used in
     #  this case (note that it is _case_ _sensitive_);
     #  otherwise one specifies values from the phenotype object, after and
     #  including AffectionStatus.
     res <- pbat.m( AffectionStatus ~ NONE, phe, ped, fbat="gee",
                    distribution='categorical', offset='none', ... )
     summary( res )
     res  # equivalent to print(res)

     # basic model with one phenotype, does all snps (if none specified)
     pbat.m( phenos1 ~ preds1, phe, ped, fbat="gee" )

     # same model, but with more phenotypes; here we test them all at once
     pbat.m( phenos1 + phenos2 + phenos3 ~ preds1, phe, ped, fbat="gee" )

     # same model as just before, but now supposing that these phenotypes are
     #  instead from a longitudinal study
     pbat.m( phenos1 + phenos2 + phenos3 ~ preds1, phe, ped, fbat="pc" )

     # like our second model, but the mi() tells it should be a marker
     #  interaction
     pbat.m( phenos1 ~ mi(preds1), phe, ped, fbat="gee" )

     # logrank analysis - fbat need not be set
     # uses more than one predictor variable
     res <- pbat.m( time & censor ~ preds1 + preds2 + preds3, phe, ped )
     plot( res )

     # single snp analysis (because each snp is separated by a vertical bar
     #  '|'), and stratified by group (presence of censor auto-indicates
     #  log-rank analysis).  Note that the group is at the end of the
     #  expression, and _must_ be at the end of the expression
     res <- pbat.m( time & censor ~ preds1^3 + preds2 | snp1 | snp2 |
              snp3 / group, temp )
     plot( res )

     # haplotype analysis, stratified by group
     res <- pbat.m( time & censor ~ preds1^2 + preds2^3 | block1snp1
                    + block1snp2 | block2snp1 + block2snp2 / group, temp )

     # set any of the various options
     res <- pbat.m( phenos ~ preds, phe, ped, fbat="pc",
                    null="linkage, no association", alpha=0.1 )

     ## New multimarker test (as described above)
     # mmaxphenos and mmaxsnps are set to the minimum if not specified
     res <- pbat.m( phenos1 + phenos2 + phenos3 ~ preds | m1 | m2 | m3 | m4,
                    phe, ped, fbat="pc", mminphenos=2, mminsnps=2 )

     ## And the top markers by conditional power
     top( res )
     ## End(Not run)

     ############################
     ## pbat.obj(...) examples ##
     ############################

     ## Not run: 
     # These will not function; they only serve as examples.

     # ... just indicates there are various options to be put here!
     res <- pbat.obj("pedfile", snps=c("snp1,snp2"), preds="pred1", ... ) 
     summary(res)
     res

     # plot is only available for "logrank"
     res <- pbat.obj(..., fbat="logrank")
     plot( res )
     ## End(Not run)

     ##############################
     ## pbat.files(...) examples ##
     ##############################

     ## Not run: 
     # These will not function, but only serve as examples.

     # Note in the following example, both "pedfile.ped" and "pedfile.phe"
     #  must exist.  If the names differed, then you must specify the
     #  option 'phe="phefile.phe"', for example.
     res <- pbat.files( "pedfile", phenos=c("phenos1","phenos2"),
                        screening="conditional power" )
     summary(res)
     res
     ## End(Not run)

