#! /usr/bin/perl -w
#
# Copyright 1999  August Hoerandl (hoerandl@elina.htlw1.ac.at)
# Copyright 1999  August Hoerandl (august.hoerandl@gmx.at)
#                 http://elina.htlw16.ac.at/~hoerandl
#
# parts from search.pl which is
# Copyright 1999 Jose M. Vidal
# Jose M. Vidal, vidal@multiagent.com, http://jmvidal.ece.sc.edu
#
# This program is free software.  You can redistribute it and/or modify
# it under the terms of the GNU General Public License
#
# create database with meta info 
#
# actions
#  read database
#    read to @allRecords
#  read metadata
#    read to $meta{$URL}, $date{$URL}, $error{$URL} 
#  check all (expired) urls
#    based on $date{$URL}
#  dump metadata 
#  dump database with meta info
#  dump errordata
#
# History
#  0.1 
#  4. Sept. 1999 - first version
#  0.2
#  12. Sept. 1999 - little bug with multiple line meta corrected
#                   added option & getoptions
#  0.3
#  21. Sept. 1999 - merged back file detection from Jose
#
#  0.4
#  26. Oct 1999 - added code for: meta Classification
#  31. Oct 1999 - added noproxy, showrecheck, $time, error.html
#
#  0.5 
#  12. Nov 1999 - added  checkheadline
#                 strip metainfo (multiple blanks, html)
#                 convert iso

$version = "0.5";

$database   = "/usr/local/httpd/cgi-bin/urls.db";      # Name of db file.
$newdb = "";             # result: database
$metadata = "meta.db";   # result: metainfo
$errordata = "error.db"; # result: urls with error

$error_recheck = 10; # number of days for recheck after error
$age_recheck = 40;  # number of days for recheck 
# add some randomness - so we won't fetch all urls on the same day 
$random = .5;   # random number of days added (factor)

$dumpfreq = 10;  # when to dump metadata (number of urls read) 
$maxurls = 2;   # max url to fetch recursive (frames)
$maxcheck = 1000; # max number of urls to check in one run

$sleep = 2; # time to sleep between url requests
$debug = 0; # set to 1 for debug output
$showrecheck = 0; #  set to 1 to see next recheck

$checkerror = 0; # check all urls with errorcount > 0
$reseterror = 0; # reset all error counters - only with $checkerror
$checknometa = 0; # check all urls with meta data = ""

$checkheadline = 0; # add <h1>, <h2>, ... to meta info - see $whichheadline
$whichheadline = "1-2"; # which headlines to check

$noproxy = 0; # dont use proxy from environment

$test1 = 0; # simple test
$test2 = 0; # more tests

# ------------------------------------------------------------------
# no user serviceable parts below
# ------------------------------------------------------------------

use LWP;
use HTTP::Request::Common;
use Getopt::Long;

# ------------------------------------------------------------------

$numlines_bk2site = 9;   #  number of lines - standard
$numlines_metadata = 10; #  number of lines - with metadata

# ------------------------------------------------------------------

$countmeta = 0;
$counthead = 0;

# ------------------------------------------------------------------

sub read_database {
  open(DB,"<$database") ||
    die ("Can't open $database; $!");
  $db_line1 = <DB>;
  chop($db_line1);
  if ($db_line1 eq "#bk2site urls.db generated file"){
    printf "bk2site urls.db generated file\n" if $debug;
    $numlines = $numlines_bk2site;
  }
  else {
    printf "bk2site urls.db has already metadata\n" if $debug;
    $numlines = $numlines_metadata;
  };
  $db_line2 = <DB>;
  $db_line3 = <DB>;
  $db_line4 = <DB>;
  @allRecords = <DB>; # there are numlines lines per record,
                      # so mutiply index by 8

  close(DB);
  ## 0 is the type
  ## 1 is the ParentTitle
  ## 2 is the url or relative dir
  ## 3 is the Title
  ## 4 is the comment
  ## 5 is the creation time
  ## 6 is the last modified time
  ## 7 is the last visit time
  ## 8 is the number of user hits
  ## 9 is the meta info

  $typeN = 0;
  #  $parentTitleN = 1;
  $urlN = 2;
  #  $titleN = 3;
  #  $commentN = 4;
  #  $creationtimeN = 5;
  #  $modifiedtimeN = 6;
  #  $visittimeN = 7;
  #  $urlhitsN = 8;
  
#  $numlines = 10; ##0-9, as above

  #  my $numRecords = 15;  # test only
  my $numRecords = ($#allRecords + 1)/$numlines;

  my $i;
  for ($i=0; $i < $numRecords; ++$i){ #initialize meta
    my $cr = ($i * $numlines);
    next if (($allRecords[$cr + $typeN]) eq "FOLDER\n");
    my $url =    $allRecords[$cr + $urlN];
    chop($url);
    print "url $url added\n" if $debug;
    $error{$url} = 0;
    $date{$url} = 0;
    $meta{$url} = "";
  }
}

sub dump_database {
  print "\ndumping database\n";
  open(DB,">$newdb") ||
    die ("Can't open $newdb; $!");
#  print DB "#bk2site urls.db generated file metadata";
  if ($db_line1 eq "#bk2site urls.db generated file") {
    print DB "#bk2site urls.db generated file metadata\n";
  }
  else {
    print DB "$db_line1\n";
  }
  print DB "$db_line2";
  print DB "$db_line3";
  print DB "$db_line4";

  my $numRecords = ($#allRecords + 1)/$numlines;
  for ($i=0; $i < $numRecords; $i++) {
    my $cr = ($i * $numlines);
    my $url = $allRecords[$cr + $urlN];
    chop($url);
    for($j = 0; $j <  $numlines_bk2site; $j++) {
      print DB "$allRecords[$cr + $j]";
    }
    if (defined $error{$url} && $error{$url} > 0) {
      print DB "\n";
    } elsif (defined $meta{$url}) {
      print DB "$meta{$url}\n";
    } else {
      print DB "\n";
    }
  }
  close DB;
}

# ------------------------------------------------------------------
# add url + filename
sub make_url {
  local($url, $file) = @_;
  if ( $file =~ /^\//) {  # filename starting with /
    $url =~ /(.*\/\/[^\/]*)\//; # get only host part
    return $1 . $file;
  }
  if ( $url =~ /\/$/ ) { # url ends with /
    return $url . $file;
  } else {
    return $url . "/" . $file;
  }
}

sub conv_iso {
  local($line) = @_;
  $line =~ s/&AElig;//ig;
  $line =~ s/&Aacute;//ig;
  $line =~ s/&Acirc;//ig;
  $line =~ s/&Agrave;//ig;
  $line =~ s/&Atilde;//ig;
  $line =~ s/&Ccedil;//ig;
  $line =~ s/&Eacute;//ig;
  $line =~ s/&Egrave;//ig;
  $line =~ s/&Euml;//ig;
  $line =~ s/&Iacute;//ig;
  $line =~ s/&Icirc;//ig;
  $line =~ s/&Igrave;//ig;
  $line =~ s/&Iuml;//ig;
  $line =~ s/&Ntilde;//ig;
  $line =~ s/&Oacute;//ig;
  $line =~ s/&Ocirc;//ig;
  $line =~ s/&Ograve;//ig;
  $line =~ s/&Oslash;//ig;
  $line =~ s/&Uacute;//ig;
  $line =~ s/&Ugrave;//ig;
  $line =~ s/&Yacute;//ig;
  $line =~ s/&aacute;//ig;
  $line =~ s/&acirc;//ig;
  $line =~ s/&acute;//ig;
  $line =~ s/&aelig;//ig;
  $line =~ s/&agrave;//ig;
  $line =~ s/&aring;//ig;
  $line =~ s/&atilde;//ig;
  $line =~ s/&brvbar;//ig;
  $line =~ s/&ccedil;//ig;
  $line =~ s/&cedil;//ig;
  $line =~ s/&cent;//ig;
  $line =~ s/&copy;//ig;
  $line =~ s/&curren;//ig;
  $line =~ s/&deg;//ig;
  $line =~ s/&eacute;//ig;
  $line =~ s/&ecirc;//ig;
  $line =~ s/&egrave;//ig;
  $line =~ s/&euml;//ig;
  $line =~ s/&frac12;//ig;
  $line =~ s/&frac14;//ig;
  $line =~ s/&frac34;//ig;
  $line =~ s/&iacute;//ig;
  $line =~ s/&icirc;//ig;
  $line =~ s/&igrave;//ig;
  $line =~ s/&iexcl;//ig;
  $line =~ s/&iquest;//ig;
  $line =~ s/&iuml;//ig;
  $line =~ s/&laquo;/\/ig;
  $line =~ s/&macr;//ig;
  $line =~ s/&micro;//ig;
  $line =~ s/&middot;//ig;
  $line =~ s/&nbsp;//ig;
  $line =~ s/&not;//ig;
  $line =~ s/&ntilde;//ig;
  $line =~ s/&oacute;//ig;
  $line =~ s/&ocirc;//ig;
  $line =~ s/&ograve;//ig;
  $line =~ s/&ordf;//ig;
  $line =~ s/&ordm;//ig;
  $line =~ s/&oslash;//ig;
  $line =~ s/&otilde;//ig;
  $line =~ s/&para;//ig;
  $line =~ s/&pound;//ig;
  $line =~ s/&plusmn;//ig;
  $line =~ s/&laquo;/\/ig;
  $line =~ s/&reg;//ig;
  $line =~ s/&sect;//ig;
  $line =~ s/&shy;//ig;
  $line =~ s/&sup1;//ig;
  $line =~ s/&sup2;//ig;
  $line =~ s/&sup2;//ig;
  $line =~ s/&uacute;//ig;
  $line =~ s/&ucirc;//ig;
  $line =~ s/&ugrave;//ig;
  $line =~ s/&uml;//ig;
  $line =~ s/&yacute;//ig;
  $line =~ s/&yen;//ig;
  $line =~ s/&Auml;//ig;
  $line =~ s/&auml;//ig;
  $line =~ s/&Ouml;//ig;
  $line =~ s/&ouml;//ig;
  $line =~ s/&Uuml;//ig;
  $line =~ s/&uuml;//ig;
  $line =~ s/&szlig;//ig;
  $line =~ s/&sect;//ig;
  $line =~ s/&para;//ig;
  $line =~ s/&copy;//ig;
  $line =~ s/&iexcl;//ig;
  $line =~ s/&iquest;//ig;
  $line =~ s/&cent;//ig;
  $line =~ s/&pound;//ig;
  $line =~ s/&times;//ig;
  $line =~ s/&plusmn;//ig;
  $line =~ s/&divide;//ig;
  $line =~ s/&not;//ig;
  $line =~ s/&mu;//ig;
  $line =~ s/&Ae;//ig;
  $line =~ s/&ae;//ig;
  $line =~ s/&Oe;//ig;
  $line =~ s/&oe;//ig;
  $line =~ s/&Ue;//ig;
  $line =~ s/&ue;//ig;
  $line =~ s/&sz;//ig;

#  while ($line =~ /&\#([0-9a-f]*);/i) {
#    my $c = chr($1);
#    $line =~ s/&\#[0-9a-f]*;/$c/i;
#  }
  
  return $line;
}


sub get_meta {
  local($url,$count) = @_;

  print "getting $url\n" if $debug;

  local($ua) = LWP::UserAgent->new;

  if (! $noproxy) {
    $ua->env_proxy();
  }
  local($page) = $ua->request(GET "$url");
  local($metainfo) = "";
  
  $error = "";
  if ($page->is_error) {
    $error = $page->status_line;
    return "";
  }

  $refresh = $page->freshness_lifetime;
  local($old) = "";
  local(@lines) = split(/^/,$page->content());
  for (@lines) {
    last if (!$checkheadline && /body/i);
    $_ = $old . " ". $_ if ($old ne ""); # add old unfinished line
    print "checking [$_]\n" if $debug; 
    if ( /meta/i && 
	 (/keywords/i || /description/i ||
	  /classification/i || /page-topic/i )) {
      if (/content *= *[\"\']([^\"\']*)[\"\']/i) {
	print "meta = $1\n" if $debug;
	$metainfo .= "$1 ";
	$old = "";
	$countmeta++;
      } else {
	# line with meta but no content - keep & merge with next line
	$old = $_;
	chop($old);
	print "no content: $old\n" if $debug;
      }
    } elsif ($checkheadline && /<h[$whichheadline]/i) {
      print "head?: $_" if $debug;
      if (/<h([$whichheadline])[^>]*>(.*)<\/h[$whichheadline]>/i) { 
	print "headline $1 [$2]\n" if $debug;
	$metainfo .= "$2 ";
	$old = "";
	$counthead++;
      } else {
	$old = $_;
	chop($old);
	print "no headline: $old\n" if $debug;
      }
    } else {
      $old = "";
    }
    # look for: frame src="..."
    if (/frame/i && /src *= *\"([^\"]*)\"/i && $count < $maxurls) {
      local($tmp) = get_meta( &make_url($page->base,$1), $count+1);
      if ($error eq "") {
	$metainfo .= " $tmp";
      }
      $error = "";
    }
  }
  print "done $url\n" if $debug;
  $metainfo = &conv_iso($metainfo);
  $metainfo =~ s/<[^>]*>/ /g;
  #  $metainfo =~ s/,/ /g;
  $metainfo =~ s/[\0- ]/ /g;
  $metainfo =~ s/  +/ /g;
  return $metainfo;
}

sub check {
  local($url) = @_;
  $error = "";
  $res = &get_meta ($url, 1);
  if ($error ne "") { # this is a "bad" url
    print "  OOPS $error\n";
    $date{$url} = $time + 
      24*60*60*($error_recheck + rand($random*$error_recheck));
    $error{$url}++;
    $meta{$url} = $error;
  } else { # we have found the url
    print " [$res]\n";
    $error{$url} = 0;
    my $w = $time +  24*60*60*($age_recheck + rand($random*$age_recheck));
    if ($w > $refresh) {
      $date{$url} = $w;
    } else {
      $date{$url} = $refresh;
    }
    #  print "recheck in ", ($date{$url} - $time)/24/60/60, " days\n";
    $meta{$url} = $res;
  }
}

# ------------------------------------------------------------------

sub read_metadata {
  print "reading metadata $metadata\n";
  open(DB,"<$metadata") ||
    return;
  while ($url = <DB>) {
    chop $url;
    $error= <DB>;
    $date = <DB>;
    $meta = <DB>;
    if (defined $meta{$url}) {
      chop $error;
      chop $date;
      chop $meta;
      $error{$url}= $error;
      $date{$url} = $date;
      $meta{$url} = $meta;
      print "added [$url]\n" if $debug;
      if ($checkerror && $error > 0) {
	$date{$url} = $time - 1;
	$error{$url} = 1 if $reseterror;
      }
      if ($checknometa && $meta eq "") {
	$date{$url} = $time - 1;
      }
    } else {
      print "skipped $url\n";
    } 
  }
  close(DB);
}


sub dump_metadata {
  print "dumping metadata to $metadata\n";
  open(DB,">$metadata") ||
    die ("Can't open $metadata; $!");
  foreach $url (sort keys (%meta)) {
    print DB "$url\n";
    print DB "$error{$url}\n";
    print DB "$date{$url}\n";
    print DB "$meta{$url}\n";
    print "dumped [$url]\n" if $debug;
  }
  close(DB);
}

sub dump_errordata {
  print "dumping errordata to $errordata\n";
  open(DB,">$errordata") ||
    die ("Can't open $errordata; $!");
  foreach $url (sort keys (%meta)) {
    if ($error{$url} > 0) {
      print DB "$url\n";
      print DB "$error{$url}\n";
      print DB "$meta{$url}\n\n";
      print "dumped [$url]\n" if $debug;
    }
  }
  close(DB);
  &dump_errordata_html;
}

sub dump_errordata_html {
  print "dumping errordata to $errordata.html\n";
  open(DB,">$errordata.html") ||
    die ("Can't open $errordata.html; $!");
  print DB <<EOF;
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"> 
<html>
  <head>
    <TITLE>Errors</TITLE>
  </head>
  <BODY>
    <table border>
    <tr>
      <th>Error Count</th>
      <th>URL</th>
      <th>Error Message</th>
EOF

  $count = 0;

  foreach $url (sort keys (%meta)) {
    if ($error{$url} > 0) {
      $count++;
      print DB "<tr>\n";
      print DB " <td align=\"center\">$error{$url}</td>\n";
      print DB " <td><a href=\"$url\">$url</a></td>\n";
      print DB " <td>$meta{$url}</td>\n</tr>\n";
      print "dumped [$url]\n" if $debug;
    }
  }
  print DB "\n</table>\n";
  print DB "$count URLs";
  print DB "\n</body>\n";
  close(DB);
}

# ------------------------------------------------------------------

sub check_all {
  print "\n\n";
  my $all = 0;
  foreach $url (keys (%meta)) {
    $all++;
  }
  my $i = 0;
  my $checked = 0;
  foreach $url (keys (%meta)) {
    $i ++;
    if ($time > $date{$url}) {
      sleep($sleep) if ($sleep > 0);
      print "checking [$url]  ", $checked, "/", $i, "/", $all, "\n";
      &check($url);
      $checked ++;
      if ($checked % $dumpfreq == 0) {
	&dump_metadata;
	&dump_errordata;
      }
    } else {
      if ($showrecheck) {
	printf("recheck [$url] in %.0f days\n", ($date{$url}-$time)/24/60/60);
      }
    }
    last if ($checked > $maxcheck);
  }
  print STDERR "checked ", $checked, " of ",  $all, " urls\n";
  print STDERR "meta: $countmeta";
  print STDERR " headlines: $counthead" if $checkheadline;
  print STDERR "\n";
}

# ------------------------------------------------------------------

sub test1 {
  # test only
  check("http://localhost/~hoerandl/xxx.html");
  check("http://localhost/~hoerandl/xxxx.html");
  check("http://localhost/~hoerandl");
  check("http://localhost/~hoerandl/tk");
  check("http://elina/");
  check("http://elina/start");
  check("http://elina.htlw1.ac.at/linux");
  check("http://www.linux.org/");
  check("http://www.linux.de");
  check("http://www.tu-harburg.de/~semb2204/dlhp/index.html");
  die "";
}

sub test2 {
  check("ftp://ftp.rubyriver.com/pub/jhardin/antispam/procmail-security.html");
  check("http://1stlinuxsearch.com/");
  check("http://www.linux.org/");
  check("http://www.linux.de");
  check("http://www.tu-harburg.de/~semb2204/dlhp/index.html");
  check("http://www.isc.org/dhcp.html");
  check("http://www.schulchemie-online.de/");
  check("http://linux.s-one.net.sg/resource/");
}

# ------------------------------------------------------------------

sub usage {
  die <<EOF;
check-meta.pl [options]
  create database with meta info / version $version
  --age_recheck=NUM      number of days for recheck ($age_recheck)
  --checkerror           check all urls with errorcount > 0 ($checkerror)
  --checkheadline        use headlines as meta info ($checkheadline)
  --checknometa          check all urls with meta data "" ($checknometa)
  --database=name        Database file ($database)
  --debug                show debug output ($debug)
  --dumpfreq             when to dump metadata - number of urls read ($dumpfreq) 
  --error_recheck=NUM    number of days for recheck after error ($error_recheck)
  --errordata=name       Database file with errors ($errordata)
  --help                 Show this help
  --maxcheck             max number of urls to check in one run ($maxcheck)
  --maxurls=NUM          max number of urls to fetch recursive ($maxurls)
  --metadata=name        Database file for meta data ($metadata)
  --newdb=name           output file (same as database)
  --noproxy              dont use a proxy ($noproxy)
  --random=NUM           random number of days added - factor ($random)
  --reseterror           reset all error counters - only with checkerror
  --sleep=NUM            time to sleep between url requests ($sleep)
  --whichheadline=FRM-TO which headlines to include ($whichheadline)
EOF
} 

# ------------------------------------------------------------------
#
# program starts here
#

$res = GetOptions (
		   "age_recheck=i" => \$age_recheck,
		   "checkerror!" => \$checkerror,
		   "checkheadline!" => \$checkheadline,
		   "checknometa!" => \$checknometa,
		   "database=s" => \$database,
		   "debug!" => \$debug,
		   "dumpfreq=i" => \$dumpfreq,
		   "error_recheck=i" => \$error_recheck,
		   "errordata=s" => \$errordata,
		   "help!" => \$showhelp,
		   "maxcheck=i" => \$maxcheck,
		   "maxurls=i" => \$maxurls,
		   "metadata=s" => \$metadata,
		   "newdb=s" => \$newdb,
		   "noproxy!" => \$noproxy,
		   "random=f" => \$random,
		   "reseterror!" => \$reseterror,
		   "showrecheck!" => \$showrecheck,
		   "sleep=i" => \$sleep,
		   "whichheadline=s" => \$whichheadline,
		   # internal use only
		   "test1!" => \$test1,
		   "test2!" => \$test2,
		  );
&usage if (!$res || $showhelp || $#ARGV >= 0);
$time = time;

if ($test1) { 
  &test1;
  exit;
}
if ($test2) {
  &test2;
  exit;
}

if ($newdb eq "") {
  $newdb = $database;
}

&read_database;
&read_metadata;
&dump_errordata;

&check_all;

&dump_metadata;
&dump_database;
&dump_errordata;
