#!/usr/bin/perl

#Ce programme etiquette les haplotypes malades (ajout d'un G) ou tmoins (ajout d'un C), en fonction d'un seuil de malade et tmoins qui le porte.
#Le programme prend en entre un fichier .paup, et redonne un autre .paup

use strict;
use diagnostics;
use warnings;
use Getopt::Long; # qw(:config permute);
use Pod::Usage;
#use Getopt::Std;

our($opt_h,$opt_i, $opt_o, $opt_e, $opt_p, $opt_t, $opt_l, $opt_j, $opt_g);

our $VERSION;
$VERSION = sprintf "0.%03d", q$Revision: 116 $ =~ /(\d+)/g;

sub DefineAncDer {
    my $data_type=shift;
    if ($data_type == 0) {
	my $tem=0;
	my $mal=1;
	return ($tem, $mal);
    } elsif ($data_type == 1) {
	my $tem="C";
	my $mal="G";
	return ($tem, $mal);	
    }
}

sub ReadCorrespond 
{
    my($name_correspond) =shift;
    my($ligne, @tableau);
    my(%correspondance);
    open (CORRESP, '<', $name_correspond) || die "Unable to open file $name_correspond: $!\n";
    while ($ligne=<CORRESP>) {
	chomp($ligne);
	if ($ligne =~ /^$/) {
	    next;
	}
	@tableau=split(/\s+/, $ligne);
	if ($#tableau != 2) {
	    die "error in $name_correspond: not 3 columns at line '$ligne'\n";
	} else {
	    $tableau[2]=~ s/c//;
	    $tableau[1]=~ s/m//;
	    if ($tableau[1] =~ /c/ || $tableau[2] =~ /m/) {
		die "You have probably exchanged the order of cases and controls in file $name_correspond. It should be: haplo_name m_case_number c_control_number\n";
		}
	    $correspondance{$tableau[0]}->{"case"}=$tableau[1]+0;
	    $correspondance{$tableau[0]}->{"control"}=$tableau[2]+0;	
	}
    }
    my($clefs);
    #DEBUG
   # foreach $clefs (keys %correspondance) {
   # print "$clefs case: ", $correspondance{$clefs}->{"case"}, "\n";
   # 	print "$clefs, control: ",$correspondance{$clefs}->{"control"}, "\n";
   # }
    return(\%correspondance);
}

sub travail
{
    
    my($seuil)=shift;
    my($data_type)=shift;
    my($proportion_malades)=shift;
    my $low =shift;
    my $name_correspond=shift;
    my $outgroup = shift;
    my($ligne);
    my($temoin, $malade, $sequence, $nom, $debut, $ancetre);
    my($anc, $num_car, $prop_mal, $prop_tem);
    my(@tableau, @tab2);
    my($tem, $mal)=DefineAncDer($data_type);
    my($correspondance)=ReadCorrespond($name_correspond);

   #foreach my $clefs (keys %{$correspondance}) {
#	print "$clefs case ", $correspondance->{$clefs}->{"case"}, "\n";
#    	print "$clefs, control ",$correspondance->{$clefs}->{"control"}, "\n";
#    }
    my $found_outgroup=0;
    my $ici = 0;
    my $phylo_prog= "Phylip";
    my $compteur=0;
    while ($ligne=<STDIN>){
	chomp($ligne);
	$compteur++;
	my $diese='#';
	if ($ligne =~ /^\s*$diese[N|n]exus\s*$/) {
	    $phylo_prog="PAUP";
	} 	
	if ($phylo_prog eq "PAUP") {
	    if ($ligne =~ /^\s*matrix\s*$/) {
		print $ligne, "\n";
		$ici=1;
		next
	    }
	    if ($ligne =~ /^\s*;\s*$/) {
		
		$ici=0;
	    }
	} else {
	    if ($compteur>1) {
		$ici=1;
	    } 
	}
	    
	if ($ici==0 && $phylo_prog eq "PAUP") {
	    if ($ligne =~ /dimension ntax=([0-9]+) nchar=([0-9]+);/) {
		$num_car=$2+1;
		print "dimension ntax=$1 nchar=", $num_car, ";\n";
	    }elsif ($ligne =~ /format symbols=\"([0-9ATGCU]+)\"/) {
		my($format)=$1;
		my($format_old)=$1;
		$format =~ tr/GC//d;
		$ligne =~ s/$format_old/${format}CG/;
		print $ligne, "\n";
	    } elsif ($ligne =~ /ancstates\s+\*anc\s+vector\s*=\s*([0-9ATCG]+)\s*;/) {
		$anc=$1;
		$anc=$anc."?";
		$ligne =~ s/$1/$anc/;
		print $ligne,"\n";
	    } elsif ($ligne=~ /begin paup;/) {
		print $ligne,"\n";
		print "exclude $num_car; \n";
	    } elsif ($ligne=~ /\s*describetrees/) {
		print "include $num_car;\n";
		print $ligne,"\n";
	    } elsif ($ligne =~ /^\s*([0-9]+)\s+([0-9]+)$/) {
		$num_car=$2+1;
		print "$1\t$num_car\n";
	    } else {
		print $ligne, "\n";
	    }
	} elsif ($ici==0 && $phylo_prog eq "Phylip") {
	    if ($ligne =~ /^\s*([0-9]+)\s+([0-9]+)\s*$/) {
		print $1, "\t", $2+1, "\n";
	    } else {
		die "Strange line $ligne in Phylip file\n";
	    }
	} elsif ($ici==1) {
	    if ($ligne =~ /^\s+$/) {
		next;
	    }  elsif ($ligne =~ /^\s*\[.+\]$/) {
	#	print STDERR "TTTTT\n";
		next;
	    } 	    else {
		@tableau=split(/\s+/, $ligne);
		$sequence=$tableau[1];
		$nom=$tableau[0];
		if ($nom eq $outgroup){
		    #	$ancetre=$sequence."?";
		    print "$nom  $sequence?\n";
		    $found_outgroup++;
		    next;
		}
		#    } else {
		#if ($debut =~ /^\s*H[0-9]{3}_m[0-9]{3}_c[0-9]{3}/) {
		#   @tab2=split(/_/,$debut);
		# print $tab2[0],"\n";
		#  $temoin=$tab2[2];
		# $temoin =~ s/c//;
		# print STDERR "temoin=$temoin\n";
		# $malade=$tab2[1];
		# $malade =~ s/m//;
		# if ($malade =~ /c/ || $temoin =~ /m/) {
		#	die "You have probably interverti cases and controls in file correspond.txt. It should be: haplo_name m_case_number c_control_number\n";
		#   }
#		if ($debut eq $anc_name){
#		    $found_anc++;
#		}	
		#$nom=$tableau[0];
		#	print "$nom\n";
		if (not exists ($correspondance->{$nom})){
		    print STDERR "$nom is not found in the file $name_correspond. Assuming it is the outgroup.\nThe number of cases and controls affected to this sequence is set to 0\n";
		  $correspondance->{$nom}->{"case"}=0;
		  $correspondance->{$nom}->{"control"}=0;  
		}
		$malade=$correspondance->{$nom}->{"case"};
		$temoin=$correspondance->{$nom}->{"control"};
		
		if ($malade == 0 && $temoin == 0) {
		    print STDERR "$nom is carried by 0 cases and 0 controls. The state ? has been attributed to the S character\n";
		    $sequence.="?";
		    print "$nom  ", $sequence, "\n";
		    next;
		}
		
		#	print " $nom mal=$malade\n";   
		$prop_mal=$malade/($malade+$temoin);
		$prop_tem=$temoin/($malade+$temoin);
		#	print "M=$malade T=$temoin\n";
		#print "test=$test\n";
		#	if ($test==0) { # test: difference |mal-tem| >=seuil 
		#    if ($malade > $temoin && $malade-$temoin>=$seuil) {
		#$sequence.="G";
		#    } elsif ($malade < $temoin && $temoin-$malade>=$seuil) {
		#	$sequence.="C";
		#    } else {
		#	$sequence.="?";
		#    }
		#} elsif ($test==1) {
		#  if ($malade+$temoin==1) {
		#$sequence.="?";
		if ($low !=0 && $malade+$temoin==1) {
		    $sequence.="?";
		}  else {
		    if ($prop_mal>$proportion_malades+
			$seuil*sqrt($prop_mal*$prop_tem/($malade+$temoin))) {
			$sequence.=$mal;
		    } elsif ($prop_mal<$proportion_malades-
			     $seuil*sqrt($prop_mal*$prop_tem/($malade+$temoin))) {
			$sequence.=$tem;
		    } else {
			$sequence.="?";
		    }   
			#}
		}
		
		print "$nom  ", $sequence, "\n";
		#"_m$malade", "_c$temoin\t", $sequence, "\n";
	    }
	    
	} 
    }
    #print "anc? $found_anc\n";
   if ($found_outgroup==0 && $outgroup ne "nooutgroup") {
	die "outgroup not found in the file\n";
    } elsif ($found_outgroup ==1 && $outgroup eq "noanc") {
	die " false outgroup found\n";
    } elsif ($found_outgroup > 1) {
	die "Too many outgroups found ($found_outgroup outgroup)";
    }
}

sub usage {
    my $msg =shift;
    my($progname) =shift;
    print STDERR "Error! ".$msg;
    print STDERR "usage :$progname [options]\n";
    print STDERR " Options :\n";
    print STDERR "    [-h]  this help\n";
    print STDERR "     -i   input file\n";
    print STDERR "     -j   input2 file (correspond.txt)\n";
    print STDERR "     -o   output file\n";
    print STDERR "     -t   data type: SNP or DNA\n";
# ancienne option -t test: 0= mal-tem>seuil 1= seuil proportion0+/-sqr(pq/n)]
    print STDERR "     -p   proportion of cases in the whole data set\n";
    print STDERR "     -e   epsilon parameter\n";
    print STDERR "    [-g]  name of the outgroup\n"; 
    print STDERR "    [-l]  if -l is present, it forces the state of S to be ? for haplotypes carried by only one individual\n"; 
}

sub main
{
    my($progname);
    my($seuil, $test, $proportion);
    
    my %options= (
    	"first-input-file" => \$opt_i,
	"second-input-file" => \$opt_j,
	"output-file" => \$opt_o,
	"epsilon" => \$opt_e,
	"data-type" => \$opt_t,
	"proportion" => \$opt_p,
	"outgroup" => \$opt_g,
	"low" => \$opt_l,
	);
    	
    #getopts('ho:i:j:e:t:p:l');
    GetOptions (\%options,
		"version",
		"short-help|h",
		"help",
		"man",
		"first-input-file|i=s",
                "second-input-file|j=s",
                "output-file|o=s",
		"epsilon|e=s",
		"data-type|t=s",
		"proportion|p=s",
		"outgroup|g=s",
		"low|l!",
		) or pod2usage(2);
    if (defined($options{"version"})) {
	print $0, " version ", $VERSION, "\n";
	print "(Perl version ", $], ")\n";
	exit 0;
    }
    if (defined($options{"short-help"})) {
	pod2usage(-exitstatus => 0, -verbose => 0);
    }
    if (defined($options{"help"})) {
	pod2usage(-exitstatus => 0, -verbose => 1);
    }
    if (defined($options{"man"})) {
	pod2usage(-exitstatus => 0, -verbose => 2);
    }
    
    if ($opt_i) {
	open(STDIN, '<', $opt_i) or die "Impossible to open $opt_i : $!" ;
    }
    
    my $correspond_name;
    if ($opt_j) {
	$correspond_name=$opt_j;
    } else {
	$correspond_name="correspond.txt"
	}
    if ($opt_o) {
	open(STDOUT, '>', $opt_o) or die "Impossible to open $opt_o : $!" ;
    }
    if ($opt_e) {
	$seuil=$opt_e;
    } else {
	usage("The epsilon parameter is not defined!!\n", $progname);
    } 
    my($data_type);
    if (defined($opt_t)) {
	if ($opt_t =~ /[Dd][Nn][Aa]/) {
	    $data_type=1;
	} elsif ($opt_t =~ /[Ss][Nn][Ps]/) {
	    $data_type=0;
	} else {
	    usage("The data type (SNP or DNA) is missing\n", $progname);
	}
	# if (defined($opt_t)) {
	#	if ($opt_t==0) {
	#	    $test=0;
	#	} elsif ($opt_t==1) {
	#	    $test=1;
	#	} else {
	#	    $test=-999;
	#	    print STDERR "illegal value of opt_t\n";
	#	}
	#  } else {
	#	die "manque le numero du test: 0: mal-tem=seuil  1: seuil=P0+/-sqr(pq/n)!!\n";
	# }    
	if ($opt_p) {
	    $proportion=$opt_p;
	} else {
	    usage("The proportion of cases in the sample is missing!\n", $progname);
	}
	my($low);
	# Si $low !=0, if only one case or one control, then the state of S is ?
	if ($opt_l) {
	    $low=1;
	} else {
	    $low=0;
	}
	
	my $outgroup="nooutgroup";
	if ($opt_g) {
	    $outgroup = $opt_g;
	}    
	travail($seuil, $data_type,  $proportion, $low, $correspond_name, $outgroup);
	
    }
}


main;

__END__
    
=head1 NAME

altree-add-S - Title...

=head1 SYNOPSIS

altree-add-S [options]

 Options:
    --version             program version
    --short-help|h        brief help message
    --help                help message with options descriptions
    --man                 full documentation
    --first-input-file|i  input file 1
    --second-input-file|j input file 2 nb cases/controls per haplotypes
    --output-file|o       output file
    --epsilon|e           epsilon value
    --data-type|t         data type: SNP or DNA
    --proportion|p        proportion of cases in the sample
    --ancestor|a          name of ancestral haplotype
    [--outgroup|g]        name of the outgroup
    [--low|l]             disease statut for haplotype carried by one individual
    

=head1 OPTIONS

=over 8

=item B<--version>

Print the program version and exits.

=item B<--short-help>

Print a brief help message and exits.

=item B<--help>

Print a help message with options descriptions and exits.

=item B<--man>

Prints the manual page and exits.

=item B<--first-input-file|i>

Input file 1 (paup or phylip file)

=item B<--second-input-file|j>

Input file 2, contains the number of times a given haplotypes is carried by  case and control individuals

=item B<--output-file|o>

Output file

=item B<--epsilon|e>

espilon parameter (see formula in the documentation)

=item B<--data-type|t>  

Type of data: DNA (ATGCU) or SNP (0-1)

=item B<--proportion|p>

Proportion of case individuals in the sample

=item B<--outgroup|g>

Name of the outgroup (if it is not in the file containing the number of cases and controls per haplotype)

=item B<--low|l>

If this option is present, it forces the state of the character S to be "?" for haplotypes carried by only one individual

=back

=head1 DESCRIPTION

B<This program> adds a new character (called "character S") to each haplotype in the input file according to the number of cases and controls carrying it.  

=cut
