#!/usr/bin/perl -w
#
#   $Id: bookmarks-converter.pl,v 1.9 1998/10/12 20:43:16 ch Exp $
#
#	bookmarks-convert - a html to bookmarks file converter
#   Copyright (C) 1998 Christian Hammers <ch@lathspell.westend.com>
#
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the Free Software
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

use strict;
use Getopt::Long;

# SOURCE file names taken from the command line; $inok is set to 1 by an
# input converter once it has processed its file.
my(@input_names, $inok);
# DEST file name; $outok is set to 1 by an output converter once it has
# written DEST.
my($output_name, $outok);
# Command line options as stored by GetOptions, read under the keys
# "i" (input type), "o" (output type) and "r" (recursive).
my(%optctl);
# Handle of the temporary file used in recursive mode: it is opened by the
# main program and written to by recursive_html2tmp.
local(*TMP);

# Write the usage summary to standard output and leave the program with
# exit status 1.  This sub never returns to its caller.
sub help {
	my @usage_lines = (
		"Usage: bookmarks-convert -i <type> -o <type> SOURCE... DEST\n",
		"  or:  bookmarks-convert -r -o <type> SOURCE DEST\n",
		"Convert bookmark file[s] SOURCE to one bookmark file DEST.\n",
		"\n",
		"  -i, --input-type\t\tinput type\n",
		"  -o, --output-type\t\toutput type\n",
		"  -r, --recursive\t\tfetch recursive beginning with SOURCE. Type is html.\n",
		"\n",
		"The known types are:\n",
		"  html\t\t\tHTML-<dl>-List. See /usr/share/debian-bookmarks/README.html !\n",
		"  ns1\t\t\tNetscape Navigator/Communicator 4.x\n",
		"  lynx\t\t\tLynx 2.7\n",
		"  tmp\t\t\tJust plain <dl> tags. For debugging purposes only.\n",
		"\n",
		"Report bugs, suggestions or comments to ch\@debian.org\n",
	);

	# The pieces already carry their own newlines, so they are printed
	# back to back.
	print @usage_lines;
	exit 1;
}

# Parse the command line into %optctl, @input_names and $output_name.
#
# GetOptions stores the options in %optctl and removes them from @ARGV.
# Of the remaining arguments the last one becomes $output_name (DEST) and
# all others become @input_names (SOURCE...).  At least one SOURCE and the
# DEST are required.
#
# On any problem a one-line message followed by the usage text is printed
# and the program exits through help().
sub parse_command_line {
	if (!GetOptions(\%optctl,"i|input-type=s","o|output-type=s","r|recursive!")) {
		print "invalid options\n"; help();
	}
	if (!defined $optctl{'o'}) {
		print "output type not specified\n"; help();
	}
	# "--no-recursive" stores a defined but false value, so the flag is
	# tested for truth: without real recursion an input type is required.
	if (!$optctl{'r'} && !defined $optctl{'i'}) {
		print "input type not specified\n"; help();
	}
	if (@ARGV == 0) {
		print "no input file(s) specified\n"; help();
	}
	# A single remaining argument is the SOURCE; treating it as DEST would
	# leave nothing to convert.
	if (@ARGV == 1) {
		print "no output file specified\n"; help();
	}
	@input_names = @ARGV;
	$output_name = pop @input_names;
}

#################################### RECURSIVE IN #############################
# Forward declaration, so that the recursive call inside the body is
# compiled with the prototype already known.
sub recursive_html2tmp($);

# Pages that are being processed right now, keyed by their absolute path.
# It is used to break link cycles (page A -> page B -> page A).
my %recursive_html2tmp_active;

# Flatten a tree of local HTML bookmark pages into the global handle TMP.
#
# Argument: path of the page to read.  If the path contains a directory
# part, the process changes into that directory while the page is read, so
# that relative links such as href="support.html" resolve; the previous
# working directory is restored before returning.
#
# For every line of the page that starts with optional whitespace and
#   <dl> or </dl>            "<dl><p>" resp. "</dl><p>" is written,
#   <dt><em>text</em>        the heading is copied,
#   <dt><a href=URL>text</a> the entry is copied if URL contains "://" or
#                            "mailto:"; any other URL is taken as a local
#                            page: a "<dt><em>text</em>" heading is written
#                            and the page is processed recursively.
# TMP must already be open for writing.  Progress messages go to standard
# output.  $inok is set to 1 on return.  Dies if the page cannot be opened
# or a directory cannot be entered.
sub recursive_html2tmp($) {
	local(*IN);
	my($page) = @_;
	my($dir, $file, $key);

	defined $page or die "recursive_html2tmp: no page given\n";

	# remember where we are, to allow relative addressing
	require Cwd;
	my $olddir = Cwd::cwd();
	defined $olddir or die "cannot determine the current directory: $!";

	# is the page in the local directory or in a subdirectory ?
	if ($page =~ /^(.*)\/(.*)$/) {
		($dir, $file) = ($1, $2);
		# "/page.html" has an empty directory part that means the root
		$dir = "/" if $dir eq "";
		chdir($dir) or die "cannot change to directory '$dir': $!";
	} else {
		$file = $page;
	}

	open(IN, '<', $file) or die "cannot open '$file': $!";

	# A page that is still being processed further up the call chain is
	# not entered a second time; that would recurse without end.
	$key = Cwd::abs_path($file);
	if (defined $key && $recursive_html2tmp_active{$key}) {
		print "skipping $file: already being processed\n";
		close(IN);
		chdir($olddir) or die "cannot return to directory '$olddir': $!";
		$inok=1;
		return;
	}
	$recursive_html2tmp_active{$key} = 1 if defined $key;

	print "-->$file<--\n";

	while (defined(my $line = <IN>)) {
		# <DL> and </DL>
		if ($line =~ /^(\s*)<(\/)?dl>/i) {
			my $slash = defined $2 ? $2 : "";
			print TMP "$1<",$slash,"dl><p>\n";
		}

		# <DT><EM>comment</EM>
		if ($line =~ /^(\s*)<dt><em>(.*?)<\/em>/i) {
			print TMP "$1<dt><em>$2<\/em>\n";
		}

		# <DT><A HREF=link>comment</A>
		if ($line =~ /^(\s*)<dt><a href=(.*)>(.*)<\/a>/i) {
			my($space, $href, $text) = ($1, $2, $3);
			$href =~ s/"//g;
			if (($href !~ /:\/\//) and ($href !~ /mailto:/i)) {
				# a local link -> follow it
				print "following...$href ($text)!\n";
				print TMP "<dt><em>$text</em>\n";
				recursive_html2tmp($href);
			} else {
				# a non-local URL.
				print TMP "$space<dt><a href=\"$href\">$text<\/a>\n";
			}
		}
	}

	close(IN);
	delete $recursive_html2tmp_active{$key} if defined $key;

	# back to where we came from
	chdir($olddir) or die "cannot return to directory '$olddir': $!";
	$inok=1;
}

#################################### IN #######################################

# Append the list structure of one HTML bookmark page to the temporary file.
#
# Argument: name of the HTML file to read.
# Every line that starts (after optional whitespace) with an opening or
# closing <dl>, <dt> or <dd> tag is appended unchanged to
# "$output_name.cvt-tmp"; all other lines are dropped.
# Sets $inok to 1 when done.  Dies with the system error text if a file
# cannot be opened or the temporary file cannot be written.
sub html2tmp($) {
	my($source) = @_;
	my $tmp_name = "$output_name.cvt-tmp";

	open(my $in, '<', $source) or die "cannot open '$source': $!";
	open(my $tmp, '>>', $tmp_name) or die "cannot open '$tmp_name': $!";

	# A lexical line variable keeps the caller's $_ untouched.
	while (defined(my $line = <$in>)) {
		if ($line =~ /^\s*<\/?d[tld]>/i) {
			print {$tmp} $line;
		}
	}

	close($in);
	# buffered write errors only show up when the handle is closed
	close($tmp) or die "cannot write '$tmp_name': $!";
	$inok=1;
}

# Append the entries of one Netscape bookmark file to the temporary file.
#
# Argument: name of the Netscape bookmark file to read.
# Written to "$output_name.cvt-tmp":
#   lines containing <dl> or </dl>     unchanged,
#   lines containing <dd>              unchanged,
#   <dt><a href=URL ...>title</a>      as "<dt><a href=URL>title</a>",
#                                      i.e. without the other attributes,
#   <dt><h3 ...>title</h3>             as "<dt>title".
# A line is tested against all four patterns independently.
# Sets $inok to 1 when done.  Dies with the system error text if a file
# cannot be opened or the temporary file cannot be written.
sub ns12tmp($) {
	my($source) = @_;
	my $tmp_name = "$output_name.cvt-tmp";

	open(my $in, '<', $source) or die "cannot open '$source': $!";
	open(my $tmp, '>>', $tmp_name) or die "cannot open '$tmp_name': $!";

	# A lexical line variable keeps the caller's $_ untouched.
	while (defined(my $line = <$in>)) {
		if ($line =~ /<\/?dl>/i) {
			print {$tmp} $line;
		}
		if ($line =~ /<dd>/i) {
			print {$tmp} $line;
		}
		# The href value ends at whitespace or at the closing ">", and
		# further attributes are optional, so <a href="x"> without any
		# other attribute keeps its complete value.
		if ($line =~ /(\s*)<dt><a href=([^\s>]+)[^>]*>([^<]*)<\/a>/i) {
			print {$tmp} $1,"<dt><a href=$2>$3</a>\n";
		}
		if ($line =~ /(\s*) <dt> \s* <h3[^>]*> \s* ([^<]*) \s* <\/h3>/ix)  {
			print {$tmp} $1,"<dt>$2\n";
		}
	}

	close($in);
	# buffered write errors only show up when the handle is closed
	close($tmp) or die "cannot write '$tmp_name': $!";
	$inok=1;
}


# Append the entries of one Lynx bookmark file to the temporary file.
#
# Argument: name of the Lynx bookmark file to read.
# Every line that starts with <LI> (in any letter case) is appended to
# "$output_name.cvt-tmp" as "<dt>" followed by the rest of that line; all
# other lines are dropped.
# Sets $inok to 1 when done.  Dies with the system error text if a file
# cannot be opened or the temporary file cannot be written.
sub lynx2tmp($) {
	my($source) = @_;
	my $tmp_name = "$output_name.cvt-tmp";

	open(my $in, '<', $source) or die "cannot open '$source': $!";
	open(my $tmp, '>>', $tmp_name) or die "cannot open '$tmp_name': $!";

	# A lexical line variable keeps the caller's $_ untouched.
	while (defined(my $line = <$in>)) {
		if ($line =~ /^<LI>(.*)/i) {
			print {$tmp} "<dt>$1\n";
		}
	}

	close($in);
	# buffered write errors only show up when the handle is closed
	close($tmp) or die "cannot write '$tmp_name': $!";
	$inok=1;
}

##################################### OUT ################################

# Write the temporary file to $output_name as a Netscape bookmark file.
#
# Reads "$output_name.cvt-tmp" and writes, after a fixed file header:
#   <dl> / </dl> lines              as "<DL><p>" / "</DL><p>",
#   <dt><a href=URL>title</a>       as a bookmark with ADD_DATE, LAST_VISIT
#                                   and LAST_MODIFIED all set to "0",
#   any other <dt> line             as a folded <H3> folder heading with
#                                   all tags removed from its text,
#   text following <dd>             as a "<DD>" line.
# The header opens one <DL> that is closed again after the last line.
# Sets $outok to 1 when done.  Dies with the system error text if a file
# cannot be opened or the output cannot be written.
sub tmp2ns1 {
	my $tmp_name = "$output_name.cvt-tmp";

	open(my $tmp, '<', $tmp_name) or die "cannot open '$tmp_name': $!";
	open(my $out, '>', $output_name) or die "cannot open '$output_name': $!";

	print {$out} <<"EOF";
<!DOCTYPE NETSCAPE-Bookmark-file-1>
<!-- This is an automatically generated file.
It will be read and overwritten.
Do Not Edit! -->
<TITLE>The Debian Bookmark Collection</TITLE>
<H1>The Debian Bookmark Collection</H1>

<DL><p>
EOF

	while (defined(my $line = <$tmp>)) {
		if ($line =~ /^(\s*)<dl>/i) {
			print {$out} $1,"<DL><p>\n";
		}
		if ($line =~ /^(\s*)<\/dl>/i) {
			print {$out} $1,"</DL><p>\n";
		}
		if ($line =~ /^(\s*)<DT>/i) {
			if ($line =~ /^(\s*) <DT> \s* <A \s* HREF \s* = \s* (.*)> (.*) <\/A>/ix) {
				# a bookmark
				print {$out} $1,"<DT><A HREF=$2 ".
				      "ADD_DATE=\"0\" LAST_VISIT=\"0\" LAST_MODIFIED=\"0\">$3</A>\n";
			} elsif ($line =~ /^(\s*) <DT> (.*)/ix) {
				# anything else becomes a folder title without markup
				my($indent, $title) = ($1, $2);
				$title =~ s,<.*?>,,g;
				print {$out} $indent,"<DT><H3 FOLDED ADD_DATE=\"0\">",$title,"</H3>\n";
			}
		}
		if ($line =~ /<DD>(.*)/i) {
			print {$out} "<DD>$1\n";
		}
	}
	print {$out} "</DL><p>\n";

	close($tmp);
	# buffered write errors only show up when the handle is closed
	close($out) or die "cannot write '$output_name': $!";
	$outok=1;
}

# Write the temporary file to $output_name as a complete HTML page.
#
# The content of "$output_name.cvt-tmp" is copied unchanged between a
# DOCTYPE/<HTML>/<BODY> header and the matching closing tags.
# Sets $outok to 1 when done.  Dies with the system error text if a file
# cannot be opened or the output cannot be written.
sub tmp2html {
	my $tmp_name = "$output_name.cvt-tmp";

	open(my $tmp, '<', $tmp_name) or die "cannot open '$tmp_name': $!";
	open(my $out, '>', $output_name) or die "cannot open '$output_name': $!";

	print {$out} "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">\n";
	print {$out} "<HTML>\n<BODY>\n\n";

	while (defined(my $line = <$tmp>)) {
		print {$out} $line;
	}

	print {$out} "\n</BODY>\n</HTML>\n";

	close($tmp);
	# buffered write errors only show up when the handle is closed
	close($out) or die "cannot write '$output_name': $!";
	$outok=1;
}

# Write the temporary file to $output_name as a Lynx bookmark list.
#
# After a "<p>" / "<ol>" header every <dt> line of "$output_name.cvt-tmp"
# is written as "<LI>" followed by the rest of that line; all other lines
# are dropped.  The tag is matched in any letter case, because the input
# converters copy <DT> lines exactly as they were spelled in the source.
# Sets $outok to 1 when done.  Dies with the system error text if a file
# cannot be opened or the output cannot be written.
sub tmp2lynx {
	my $tmp_name = "$output_name.cvt-tmp";

	open(my $tmp, '<', $tmp_name) or die "cannot open '$tmp_name': $!";
	open(my $out, '>', $output_name) or die "cannot open '$output_name': $!";

	print {$out} "<p>\n<ol>\n";

	while (defined(my $line = <$tmp>)) {
		if ($line =~ /^\s*<dt>(.*)/i) {
			print {$out} "<LI>$1\n";
		}
	}

	close($tmp);
	# buffered write errors only show up when the handle is closed
	close($out) or die "cannot write '$output_name': $!";
	$outok=1;
}

# Pass-through converter for the "tmp" type, used for input and for output.
#
# The direction is taken from $inok:
#   $inok false - input side: the file named by the first argument is
#                 appended unchanged to "$output_name.cvt-tmp" and $inok
#                 is set to 1;
#   $inok true  - output side: "$output_name.cvt-tmp" is renamed to
#                 $output_name and $outok is set to 1.
# Everything is done inside Perl, so file names are never seen by a shell.
# Dies with the system error text if a file cannot be opened, written or
# renamed.
sub tmp2tmp {
	my $tmp_name = "$output_name.cvt-tmp";

	if (not $inok) {
		my $source = shift;
		defined $source or die "tmp2tmp: no input file given\n";

		open(my $in, '<', $source) or die "cannot open '$source': $!";
		open(my $tmp, '>>', $tmp_name) or die "cannot open '$tmp_name': $!";
		while (defined(my $line = <$in>)) {
			print {$tmp} $line;
		}
		close($in);
		# buffered write errors only show up when the handle is closed
		close($tmp) or die "cannot write '$tmp_name': $!";
		$inok=1;
	} else {
		rename($tmp_name, $output_name)
			or die "cannot rename '$tmp_name' to '$output_name': $!";
		$outok=1;
	}
}
################################################ MAIN #######
# Main program: parse the command line, collect all SOURCEs in the
# temporary file "DEST.cvt-tmp", convert that file to DEST and remove it.
help() if ($#ARGV == 0);
parse_command_line();

# Work with an absolute DEST name, because the recursive converter changes
# the working directory.  A name that is already absolute is kept as it is.
if ($output_name !~ m{^/}) {
	require Cwd;
	my $cwd = Cwd::cwd();
	defined $cwd or die "cannot determine the current directory: $!";
	$output_name = $cwd . "/" . $output_name;
}
my $tmp_name = "$output_name.cvt-tmp";

# to prevent tmp-run hacks: never touch files that already exist.  The
# tests are done by Perl itself, so the names are not interpreted by a shell.
die "file '$output_name' is already existing" if (-e $output_name);
die "file '$tmp_name' is already existing" if (-e $tmp_name);

if ($optctl{'r'}) {
	# the recursive way: fetch all files referenced at SOURCE
	open(TMP, '>>', $tmp_name) or die "cannot open '$tmp_name': $!";
	recursive_html2tmp($input_names[0]);
	close(TMP) or die "cannot write '$tmp_name': $!";
	if (! $inok) {
		print "There is an unexpected error occured.\n";
	}
} else {
	# the normal way: join all SOURCEs
	my $in_type = defined $optctl{'i'} ? $optctl{'i'} : "";
	# A named loop variable is used because the converters read their
	# files through $_, which would otherwise alias the list element.
	foreach my $name (@input_names) {
		$inok=0;
		html2tmp($name)	if ($in_type eq "html");
		ns12tmp($name) 	if ($in_type eq "ns1");
		lynx2tmp($name) 	if ($in_type eq "lynx");
		tmp2tmp($name) 	if ($in_type eq "tmp");
		if (! $inok) {
			print "This input type is not yet implemented !\n";
			exit 1;
		}
	}
}

# convert the tmp file in the given format to DEST
$outok=0;
tmp2html() 	if ($optctl{'o'} eq "html");
tmp2ns1() 	if ($optctl{'o'} eq "ns1");
tmp2lynx()	if ($optctl{'o'} eq "lynx");
tmp2tmp()	if ($optctl{'o'} eq "tmp");
if (! $outok) {
	print "This output type is not yet implemented !\n";
	# do not leave the temporary file behind
	unlink $tmp_name;
	exit 1;
}
# The temporary file is gone already if it was renamed to DEST; unlink
# simply removes nothing in that case.
unlink $tmp_name;

# EOT
