#! /usr/bin/perl -w

# Copyright (c) 1999 University of Cambridge.
# See the file NOTICE for conditions of use and distribution.

# Perl script to generate statistics from one or more Exim log files.

# Usage: eximstats [<options>] <log file> <log file> ...

# 1996-05-21: Ignore lines not starting with valid date/time, just in case
#               these get into a log file.
# 1996-11-19: Add the -h option to control the size of the histogram,
#               and optionally turn it off.
#             Use some Perl 5 things; it should be everywhere by now.
#             Add the Perl -w option and rewrite so no warnings are given.
#             Add the -t option to control the length of the "top" listing.
#             Add the -ne, -nt options to turn off errors and transport
#               information.
#             Add information about length of time on queue, and -q<list> to
#               control the intervals and turn it off.
#             Add count and percentage of delayed messages to the Received
#               line.
#             Show total number of errors.
#             Add count and percentage of messages with errors to Received
#               line.
#             Add information about relaying and -nr to suppress it.
# 1997-02-03  Merged in some of the things Nigel Metheringham had done:
#               Re-worded headings
#               Added received histogram as well as delivered
#               Added local senders' league table
#               Added local recipients' league table
# 1997-03-10  Fixed typo "destinationss"
#             Allow for intermediate address between final and original
#               when testing for relaying
#             Give better message when no input
# 1997-04-24  Fixed bug in layout of error listing that was depending on
#               text length (output line got repeated).
# 1997-05-06  Bug in option decoding when only one option.
#             Overflow bug when handling very large volumes.
# 1997-10-28  Updated to handle revised log format that might show
#               HELO name as well as host name before IP number
# 1998-01-26  Bugs in the function for calculating the number of seconds
#               since 1970 from a log date
# 1998-02-02  Delivery to :blackhole: doesn't have a T= entry in the log
#               line; cope with this, thereby avoiding undefined problems
#             Very short log line gave substring error
# 1998-02-03  A routed delivery to a local transport may not have <> in the
#               log line; terminate the address at white space, not <
# 1998-09-07  If first line of input was a => line, $thissize was undefined;
#               ensure it is zero.
# 1998-12-21  Adding of $thissize from => line should have been adding $size.
#             Oops. Should have looked more closely when fixing the previous
#               bug!
# 1999-11-12  Increased the field widths for printed integers; numbers are
#               bigger than originally envisaged.


use integer;


##################################################
#             Static data                        #
##################################################

@tab62 =
  (0,1,2,3,4,5,6,7,8,9,0,0,0,0,0,0,     # 0-9
   0,10,11,12,13,14,15,16,17,18,19,20,  # A-K
  21,22,23,24,25,26,27,28,29,30,31,32,  # L-W
  33,34,35, 0, 0, 0, 0, 0,              # X-Z
   0,36,37,38,39,40,41,42,43,44,45,46,  # a-k
  47,48,49,50,51,52,53,54,55,56,57,58,  # l-w
  59,60,61);                            # x-z

@days_per_month = (0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334);

@queue_times = (60, 5*60, 15*60, 30*60, 60*60, 3*60*60, 6*60*60,
                12*60*60, 24*60*60);



##################################################
#                   Subroutines                  #
##################################################

# For handling very large volumes, the number of gigabytes is
# passed as a separate argument.

sub print_volume_rounded {
my($g) = pop @_;
my($x) = pop @_;

while ($x > $gig)
  {
  $g++;
  $x -= $gig;
  }

# Values < 1 GB

if ($g <= 0)
  {
  if ($x < 10000)
    {
    printf("%6d", $x);
    }
  elsif ($x < 10000000)
    {
    printf("%4dKB", ($x + 512)/1024);
    }
  else
    {
    printf("%4dMB", ($x + 512*1024)/(1024*1024));
    }
  }

# Values between 1GB and 10GB are printed in MB

elsif ($g < 10)
  {
  printf("%4dMB", ($g * 1024) + ($x + 512*1024)/(1024*1024));
  }

# Handle values over 10GB

else
  {
  printf("%4dGB", $g + ($x + $gig/2)/$gig);
  }
}


sub format_time {
my($t) = pop @_;
my($s) = $t % 60;
$t /= 60;
my($m) = $t % 60;
$t /= 60;
my($h) = $t % 24;
$t /= 24;
my($d) = $t % 7;
my($w) = $t/7;
my($p) = "";
$p .= "$w"."w" if $w > 0;
$p .= "$d"."d" if $d > 0;
$p .= "$h"."h" if $h > 0;
$p .= "$m"."m" if $m > 0;
$p .= "$s"."s" if $s > 0 || $p eq "";
$p;
}


# Given a log date/time, compute seconds since 1970

sub seconds {
my($y,$mo,$d,$h,$mi,$s) = (pop @_) =~
  /(\d\d\d\d)-(\d\d)-(\d\d)\s(\d\d):(\d\d):(\d\d)/;
my($leap) = ($y % 4)? 0 : 1;
$d += $days_per_month[$mo-1];
$d-- unless ($leap && $mo > 2);
$y -= 1970;
$d += $y * 365 + ($y+1)/4;                 # NB This relies on use integer.
$s += 60 * ($mi + 60 * ($h + $d * 24));
$s;
}


# Given a message id, compute seconds

sub id_seconds {
my($id) = substr((pop @_), 0, 6);
my($s) = 0;
my(@c) = split(//, $id);
while($#c >= 0) { $s = $s * 62 + $tab62[ord(shift @c) - ord('0')] }
$s;
}


# Print time on queue information

sub print_queue_times {
no integer;
my($string,$array) = @_;

$printed_one = 0;
$cumulative_percent = 0;
$queue_unknown += keys %arrival_time;

$queue_total = $queue_more_than;
for ($i = 0; $i <= $#queue_times; $i++) { $queue_total += $$array[$i] }

my($temp) = "Time spent on the queue: $string";
printf ("%s\n%s\n\n", $temp, "-" x length($temp));

for ($i = 0; $i <= $#queue_times; $i++)
  {
  if ($$array[$i] > 0)
    {
    $percent = ($$array[$i] * 100)/$queue_total;
    $cumulative_percent += $percent;
    printf("%s %4s   %6d %5.1f%%  %5.1f%%\n",
      $printed_one? "     " : "Under",
      &format_time($queue_times[$i]),
      $$array[$i], $percent, $cumulative_percent);
    $printed_one = 1;
    }
  }

if ($queue_more_than > 0)
  {
  $percent = ($queue_more_than * 100)/$queue_total;
  $cumulative_percent += $percent;
  printf("Over  %4s   %6d %5.1f%%  %5.1f%%\n",
    &format_time($queue_times[$#queue_times]),
    $queue_more_than, $percent, $cumulative_percent);
  }

#printf("Unknown   %6d\n", $queue_unknown) if $queue_unknown > 0;
print "\n";
}


# Print histogram

sub print_histogram {
my($text) = shift;
my(@interval_count) = @_;
my($maxd) = 0;

for ($i = 0; $i < $hist_number; $i++)
  { $maxd = $interval_count[$i] if $interval_count[$i] > $maxd; }

$scale = int(($maxd + 25)/50);
$scale = 1 if $scale == 0;

if ($text eq "Deliveries")
  {
  $type = ($scale == 1)? "delivery" : "deliveries";
  }
else
  {
  $type = ($scale == 1)? "message" : "messages";
  }

my($temp) = sprintf("$text per %s (each dot is $scale $type)",
  ($hist_interval == 60)? "hour" :
  ($hist_interval == 1)?  "minute" : "$hist_interval minutes");

printf("%s\n%s\n\n", $temp, "-" x length($temp));

$hour = 0;
$minutes = 0;
for ($i = 0; $i < $hist_number; $i++)
  {
  $c = $interval_count[$i];

  # If the interval is an hour (the maximum) print the starting and
  # ending hours as a label. Otherwise print the starting hour and
  # minutes, which take up the same space.

  if ($hist_opt == 1)
    {
    printf("%02d-%02d", $hour, $hour + 1);
    $hour++;
    }
  else
    {
    if ($minutes == 0)
      { printf("%02d:%02d", $hour, $minutes) }
    else
      { printf("  :%02d", $minutes) }
    $minutes += $hist_interval;
    if ($minutes >= 60)
      {
      $minutes = 0;
      $hour++;
      }
    }

  printf(" %6d %s\n", $c, "." x ($c/$scale));
  }
print "\n";
}



# Print league table

sub print_league_table {
my($text) = $_[0];
my($m_count) = $_[1];
my($m_data) = $_[2];
my($name) = ($topcount == 1)? "$text" : "$topcount ${text}s";
my($temp) = "Top $name by message count";
printf ("%s\n%s\n\n", $temp, "-" x length($temp));

$count = 1;
foreach $key (sort
               {
               $$m_count{$b} <=> $$m_count{$a} ||
               $$m_data{$b}  <=> $$m_data{$a}  ||
               $a cmp $b
               }
             keys %{$m_count})
  {
  printf("%7d %10d   %s\n", $$m_count{$key}, $$m_data{$key}, $key);
  last if $count++ >= $topcount;
  }

$temp = "Top $name by volume";
printf ("\n%s\n%s\n\n", $temp, "-" x length($temp));

$count = 1;
foreach $key (sort
               {
               $$m_data{$b}  <=> $$m_data{$a}  ||
               $$m_count{$b} <=> $$m_count{$a} ||
               $a cmp $b
               }
             keys %{$m_count})
  {
  printf("%7d %10d   %s\n", $$m_count{$key}, $$m_data{$key}, $key);
  last if $count++ >= $topcount;
  }

print "\n";
}




##################################################
#                 Main Program                   #
##################################################

$delayed_count = 0;
$relayed_unshown = 0;
$show_errors = 1;
$show_relay = 1;
$show_transport = 1;
$queue_more_than = 0;
$queue_unknown = 0;
$topcount = 50;
$local_league_table = 1;
$hist_opt = 1;
$begin = "9999-99-99 99:99:99";
$end = "0000-00-00 00:00:00";
$gig = 1024 * 1024 * 1024;

# Decode options

while (@ARGV > 0 && substr($ARGV[0], 0, 1) eq '-')
  {
  if    ($ARGV[0] =~ /^\-h(\d+)$/) { $hist_opt = $1 }
  elsif ($ARGV[0] =~ /^\-ne$/)     { $show_errors = 0 }
  elsif ($ARGV[0] =~ /^\-nr(.?)(.*)\1$/)
    {
    if ($1 eq "") { $show_relay = 0 } else { $relay_pattern = $2 }
    }
  elsif ($ARGV[0] =~ /^\-q([,\d\+\-\*\/]+)$/)
    {
    @queue_times = split(/,/, $1);
    foreach $q (@queue_times) { $q = eval($q) + 0 }
    @queue_times = sort { $a <=> $b } @queue_times;
    @queue_times = () if ($#queue_times == 0 && $queue_times[0] == 0);
    }
  elsif ($ARGV[0] =~ /^\-nt$/)     { $show_transport = 0 }
  elsif ($ARGV[0] =~ /^\-t(\d+)$/) { $topcount = $1 }
  elsif ($ARGV[0] =~ /^\-tnl$/) { $local_league_table = 0 }
  else
    {
    print "Eximstats: Unknown or malformed option $ARGV[0]\n"
      unless $ARGV[0] =~ /^\-help$/;
    print "  Valid options are:\n";
    print "    -h<number>   histogram divisions per hour\n";
    print "                 default is 1, 0 suppresses histograms\n";
    print "    -ne          don't display error information\n";
    print "    -nr          don't display relaying information\n";
    print "    -nr/pattern/ don't display relaying information that matches\n";
    print "    -nt          don't display transport information\n";
    print "    -q<list>     list of times for queuing information\n";
    print "                 single 0 item suppresses\n";
    print "    -t<number>   display top <number> sources/destinations\n";
    print "                 default is 50, 0 suppresses top listing\n";
    print "    -tnl         omit local sources/destinations in top listing\n";
    exit 1;
    }
  shift;
  }

# Initialize slots for queue times

for ($i = 0; $i <= $#queue_times; $i++)
  {
  $queue_bin[$i] = 0;
  $remote_queue_bin[$i] = 0;
  }

# Compute the number of slots for the histogram

if ($hist_opt > 0)
  {
  if ($hist_opt > 60 || 60 % $hist_opt != 0)
    {
    print "Eximstats: -h must specify a factor of 60\n";
    exit 1;
    }
  $hist_interval = 60/$hist_opt;
  $hist_number = (24*60)/$hist_interval;
  @received_interval_count = (0) x $hist_number;
  @delivered_interval_count = (0) x $hist_number;
  }

# Scan the input files and collect the data

$received_data_total = 0;
$received_data_gigs = 0;
$received_count_total = 0;

$delivered_data_total = 0;
$delivered_data_gigs = 0;
$delivered_count_total = 0;

while (<>)
  {
  next if ! /^[\d]{4}\-[\d]{2}\-[\d]{2}\s[\d]{2}:[\d]{2}:[\d]{2}/;
  $tod = substr($_, 0, 19);
  $id = substr($_, 20, 16);

  $begin = $tod if $tod lt $begin;
  $end = $tod if $tod gt $end;

  next if length($_) < 38;
  $flag = substr($_, 37, 2);

  if ($flag eq "<=")
    {
    ($thissize) = /\sS=([0-9]+)( |$)/;
    $thissize = 0 if !defined($thissize);
    $size{$id} = $thissize;
    ($host) = /\sH=(\S+)/;
    if (defined $host)
      {
      if ($show_relay)                   # Save incoming information
        {                                # in case it becomes interesting
        ($from) = /^.{40}(\S+)/;         # later, when delivery lines are read
        ($ip) = /\sH=\S+(?:(?=\s\()\s\S+)?(\s\[[^]]*\])/;
        $ip = "" if !defined $ip;
        $from_host{$id} = "$host$ip";
        $from_address{$id} = $from;
        }
      }
    else
      {
      $host = "local";
      ($user) = /\sU=(\S+)/;
      if (defined $user)
        {
        $received_count_user{$user}++;
        $received_data_user{$user} = 0 if !defined $received_data_user{$user};
        $received_data_user{$user} += $thissize;
        }
      }

    $received_count{$host}++;
    $received_data{$host} = 0 if !defined $received_data{$host};
    $received_data{$host} += $thissize;
    $received_count_total++;
    $received_data_total += $thissize;
    if ($received_data_total > $gig)
      {
      $received_data_gigs++;
      $received_data_total -= $gig;
      }

    $arrival_time{$id} = $tod if $#queue_times >= 0;
    if ($hist_opt > 0)
      {
      ($hour,$minute) = $tod =~ /^\S+ ([0-9]{2}):([0-9]{2}):/;
      $received_interval_count[($minute+$hour*60)/$hist_interval]++;
      }
    }

  elsif ($flag eq "=>")
    {
    $size = $size{$id};
    $size = 0 if !defined $size;
    ($host) = /\sH=(\S+)/;
    if (defined $host)
      {
      $remote_delivered{$id} = 0 if !defined($remote_delivered{$id});
      $remote_delivered{$id}++;

      # Determine relaying address if either only one address listed,
      # or two the same. If they are different, it implies a forwarding
      # or aliasing, which is not relaying. Note that for multi-aliased
      # addresses, there may be a further address between the first
      # and last.

      if ($show_relay && defined $from_host{$id})
        {
        ($ip) = /\sH=\S+(?:(?=\s\()\s\S+)?(\s\[[^]]*\])/;
        $ip = "" if !defined $ip;
        if (/^.{40}(\S+)(?:\s+\([^)]\))?\s+<([^>]+)>/)
          { $old = $1; $new = $2 }
        else
          { $old = $new = ""; }

        if ("\L$new" eq "\L$old")
          {
          ($old) = /^.{40}(\S+)/ if $old eq "";
          $key = "H=\L$from_host{$id}\E A=\L$from_address{$id}\E => " .
            "H=\L$host\E$ip A=\L$old\E";
          if (!defined $relay_pattern || $key !~ /$relay_pattern/o)
            {
            $relayed{$key} = 0 if !defined $relayed{$key};
            $relayed{$key}++;
            }
          else { $relayed_unshown++ }
          }
        }
      }
    else
      {
      $host = "local";
      my($rest) = substr($_,40);
      if (($user) = split(($rest =~ /</)? ' <' : ' ', $rest))
        {
        if ($user =~ /^[\/|]/)
          {
          my($parent) = $_ =~ /(<[^@]+@?[^>]*>)/;
          $user = "$user $parent" if defined $parent;
          }
        $delivered_count_user{$user}++;
        $delivered_data_user{$user} = 0 if !defined $delivered_data_user{$user};
        $delivered_data_user{$user} += $size;
        }
      }

    $delivered_count{$host}++;
    $delivered_data{$host} = 0 if !defined $delivered_data{$host};
    $delivered_data{$host} += $size;

    $delivered_count_total++;
    $delivered_data_total += $size;

    if ($delivered_data_total > $gig)
      {
      $delivered_data_gigs++;
      $delivered_data_total -= $gig;
      }

    if ($show_transport)
      {
      ($transport) = /\sT=(\S+)/;
      $transport = ":blackhole:" if !defined $transport;
      $transported_data{$transport} = 0
        if !defined $transported_data{$transport};
      $transported_data_gigs{$transport} = 0
        if !defined $transported_data_gigs{$transport};
      $transported_count{$transport}++;
      $transported_data{$transport} += $size;
      if ($transported_data{$transport} > $gig)
        {
        $transported_data_gigs{$transport}++;
        $transported_data{$transport} -= $gig;
        }
      }
    if ($hist_opt > 0)
      {
      ($hour,$minute) = $tod =~ /^\S+ ([0-9]{2}):([0-9]{2}):/;
      $delivered_interval_count[($minute+$hour*60)/$hist_interval]++;
      }
    }

  elsif ($flag eq "==" && defined($size{$id}) && !defined($delayed{$id}))
    {
    $delayed_count++;
    $delayed{$id} = 1;
    }

  elsif ($flag eq "**")
    {
    $had_error{$id} = 1 if defined ($size{$id});
    if ($show_errors)
      {
      ($error) = substr($_, 40);
      $errors_count{$error}++;
      }
    }

  elsif (/Completed$/)
    {
    if ($#queue_times >=0)
      {
      $arrival = $arrival_time{$id};
      delete($arrival_time{$id});
      $queued = &seconds($tod) -
        (defined($arrival)? &seconds($arrival) : &id_seconds($id));
      for ($i = 0; $i <= $#queue_times; $i++)
        {
        if ($queued < $queue_times[$i])
          {
          $queue_bin[$i]++;
          $remote_queue_bin[$i]++ if $remote_delivered{$id};
          last;
          }
        }
      $queue_more_than++ if $i > $#queue_times;
      }

    if ($show_relay)
      {
      delete($from_host{$id});
      delete($from_address{$id});
      }
    delete($size{$id});
    }
  }

if ($begin eq "9999-99-99 99:99:99")
  {
  print "**** No valid log lines read\n";
  exit 1;
  }

print "\nExim statistics from $begin to $end\n";

# Print grand totals

print "\nGrand total summary";
print "\n-------------------";
print "\n                                                       At least one address";
print "\n  TOTAL               Volume    Messages    Hosts      Delayed       Failed";
print "\n  Received            ";
&print_volume_rounded($received_data_total, $received_data_gigs);

  {
  no integer;
  $failed_count = keys %had_error;
  printf("      %6d     %4d  %6d %4.1f%% %6d %4.1f%%",
    $received_count_total,
    scalar(keys %received_data),
    $delayed_count,
    ($received_count_total == 0)?
      0 : ($delayed_count*100)/$received_count_total,
    $failed_count,
    ($received_count_total == 0)?
      0 : ($failed_count*100)/$received_count_total);
  }

print "\n  Delivered           ";
&print_volume_rounded($delivered_data_total, $delivered_data_gigs);
printf("      %6d     %4d\n", $delivered_count_total,
  scalar(keys %delivered_data));
print "\n";

# Print totals by transport if required

if ($show_transport)
  {
  print "Deliveries by transport\n";
  print "-----------------------";
  print "\n                      Volume    Messages";

  foreach $key (sort keys %transported_data)
    {
    printf( "\n  %-16s    ", $key);
    &print_volume_rounded($transported_data{$key},
      $transported_data_gigs{$key});
    printf( "      %6d", $transported_count{$key});
    }
  print "\n\n";
  }

# Print the deliveries per interval as a histogram, unless configured not to.
# First find the maximum in one interval and scale accordingly.

if ($hist_opt > 0)
  {
  &print_histogram("Messages received", @received_interval_count);
  &print_histogram("Deliveries", @delivered_interval_count);
  }

# Print times on queue if required

if ($#queue_times >= 0)
  {
  &print_queue_times("all messages", \@queue_bin);
  &print_queue_times("messages with at least one remote delivery",
    \@remote_queue_bin);
  }

# Print relay information if required

if ($show_relay)
  {
  if (scalar(keys %relayed) > 0 || $relayed_unshown > 0)
    {
    $shown = 0;
    $spacing = "";
    print "Relayed messages\n";
    print "----------------\n\n";

    foreach $key (sort keys %relayed)
      {
      $count = $relayed{$key};
      $shown += $count;
      $key =~ s/[HA]=//g;
      ($one,$two) = split(/=> /, $key);
      printf("%7d %s\n      => %s\n", $count, $one, $two);
      $spacing = "\n";
      }
    print "${spacing}Total: $shown (plus $relayed_unshown unshown)\n";
    }
  else
    {
    print "No relayed messages\n";
    print "-------------------\n";
    }
  print "\n";
  }

# If the topcount is zero, print no league tables

if ($topcount > 0)
  {
  &print_league_table("sending host", \%received_count, \%received_data);
  &print_league_table("local sender", \%received_count_user,
    \%received_data_user) if $local_league_table;
  &print_league_table("destination", \%delivered_count, \%delivered_data);
  &print_league_table("local destination", \%delivered_count_user,
    \%delivered_data_user) if $local_league_table;
  }

# Omit error statistics if configured out

if ($show_errors)
  {
  $total_errors = 0;

  if (scalar(keys %errors_count) != 0)
    {
    print "List of errors\n";
    print "--------------\n\n";

    foreach $key (sort keys %errors_count)
      {
      chop($text = $key);
      printf("%5d ", $errors_count{$key});
      $total_errors += $errors_count{$key};
      while (length($text) > 65)
        {
        ($first,$rest) = $text =~ /(.{50}\S*)\s+(.+)/;
        last if !$first;
        printf("%s\n      ", $first);
        $text = $rest;
        }
      printf("%s\n\n", $text);
      }
    }

  $temp = "Errors encountered: $total_errors";
  printf("%s\n%s\n\n", $temp, "-" x length($temp));
  }

# End of eximstats
