#!/usr/bin/perl -w
# $Id: extract-docs,v 1.22 1998/08/12 01:54:33 gjb Exp $ -*- perl -*-
# extract-docs
# By Greg J. Badros -- 7-July-1998
#
# extract-docs pulls out the SCWM_PROC declarations and comments
# from a SCWM .c source file and processes them to create
# documentation
#
# Usage: extract-docs <filenames-to-extract-from>
#
# e.g.,
#
# extract-docs *.c
#
# Purpose:
# Extract documentation from comments in C source files
# and generate a plaintext listing of the procedures, and
# DocBook SGML output to create parts of the SCWM manual.
#
# Note that this script does lots of important error checking, and
# produces error and warning messages that look like grep, so emacs'
# compile-mode (and probably grep-mode) can be used to find problems
# with the documentation using M-x compile (C-x c) extract-docs *.c >
# /dev/null then M-x next-error (C-x `)
#
# Usage:
#  from ~lgjb/scwm
# ./utilities/dev/extract-docs -o doc/scwm scwm/*.c > doc/scwm-procedures.txt
# or from Emacs, for warnings only
# ./utilities/dev/extract-docs scwm/*.c > /dev/null
#
# BUGS:
# This should probably be written in guile-scheme instead of perl
# (perhaps the guile folks will do this if they want to use a similar
#  extraction system for guile-scheme)
#

require 5.004;  # uses "for my $var"
use strict;
use constant TRUE => (1==1);
use constant FALSE => (1==0);
use File::Basename;

my $getopts_option_letters = 'hqQDo:s';
use vars qw($opt_h $opt_q $opt_Q $opt_D $opt_o $opt_s);

sub script_usage ( ) {
  print "@_\nUsage: $0 [-$getopts_option_letters]
-q       Be reasonably quiet-- do not warn about spacing or purpose strings
-Q       Be completely QUIET-- no warnings (still prints errors)
-D       Debugging output on
-o file  Send sgml output to file -- no sgml output unless this is given
-s       Run ispell on the comments and reports warnings for its responses
";
  exit 0;
}


my $pkg_name = basename($ENV{PWD});
my $fDebug = FALSE;
my $fQuiet = FALSE;
my $fReallyQuiet = FALSE;


use Getopt::Std;
getopts($getopts_option_letters);

script_usage() if ($opt_h);

$fDebug = TRUE if $opt_D;
$fQuiet = TRUE if $opt_q || $opt_Q;
$fReallyQuiet = TRUE if $opt_Q;

# maps from a procedure name to a hash
# containing "usage", "purpose", "comment",  "markup", "file", "line"
my %procedure = ();

# maps from a filename to a list reference containing the names of 
# primitives defined in that file
my %file_funcs = ();

# Maps from concepts/hooks/vars to a hash containing
# "comment", "markup", "file", "line"
my %concepts = ();
my %hooks = ();
my %vars = ();


use FileHandle;
use IPC::Open2;

# dictionary of scwm-specific words, in the same directory as this script
(my $dictionary = $0)=~s#[^/]*$#dictionary#;

my $pid = open2( \*ISPELL_RESPONSE,
		 \*ISPELL, "ispell -a -p $dictionary") or die "Could not open \'ispell -a\' pipe: $!";
ISPELL->autoflush();

my $header = "";
while (<>) {
  close(ARGV) if (eof);
  if (m/^SCWM_PROC/) {
    $header = $_;
    my $filename = $ARGV;
    my $line = $.;
    while (($_ = <>) !~ m/^\s*\{/) {
      $header .= $_;
    }
    $header .= $_;
    ProcessHeader($filename, $line, $header);
    $header = "";
  } elsif (m%/\*\*\s*(\w[^:]*):\s*(.*?)\s*$%) {
    my $type = $1;
    my $description = $2;
    if ($type eq "" || $description eq "") {
      print STDERR "$ARGV:$.:**** Improper /**-style comment: got type = \`$type\', description = \`$description\'.\n";
      next;
    }
    if (uc($type) eq "HOOK") {
      my $filename = $ARGV;
      my $line = $.;
      my $body = ReadRestOfComment("");
      ProcessHookComment($filename, $line, $description,$body);
    } elsif (uc($type) eq "CONCEPT") {
      my $filename = $ARGV;
      my $line = $.;
      my $body = ReadRestOfComment("");
      ProcessConceptComment($filename, $line, $description,$body);
    } elsif (uc($type) eq "VAR") {
      my $filename = $ARGV;
      my $line = $.;
      my $body = ReadRestOfComment("");
      ProcessVarComment($filename, $line, $description, $body);
    } else {
      print STDERR "$ARGV:$.:**** Unrecognized type for /**-style comment = \`$type\'\n";
      next;
    }
  }
}

my $sgml_name = ""; # "$pkg_name.sgml";
if ($opt_o) {
  $sgml_name = $opt_o;
  $sgml_name .= ".sgml" if ($sgml_name !~ /\..+ml$/);

  open (MARKUP_OUT,">$sgml_name") or die "Could not write to $sgml_name: $!";
  chop (my $date = `date +"%d %B %Y"`);
  chop (my $year = `date +"%Y"`);
  
  print MARKUP_OUT <<END_HEADER
<!DOCTYPE Book PUBLIC "-//Davenport//DTD DocBook V3.0//EN">
<book>
  <bookinfo>
    <title>
      <productname>SCWM Reference Manual</productname>
    </title>
    <authorgroup>
      <author>
  	<firstname>Maciej</firstname>
  	<surname>Stachowiak</surname>
  	<affiliation>
  	  <shortaffil>MIT</shortaffil>
  	  <jobtitle>M.S. Degree Recipient</jobtitle>
  	  <orgname>Massachusetts Institute of Technology</orgname>
  	  <orgdiv>Department of Computer Science</orgdiv>
  	  <address>
  	    <city>Cambridge</city>
  	    <state>Massachusetts</state>
  	    <postcode>12345</postcode>
  	    <country>U.S.A.</country>
  	    <email>mstachow\@mit.edu</email>
  	  </address>
  	</affiliation>
      </author>
      <author>
  	<firstname>Greg</firstname>
  	<surname>Badros</surname>
  	<affiliation>
  	  <shortaffil>UWashington</shortaffil>
  	  <jobtitle>Graduate Research Assistant</jobtitle>
  	  <orgname>University of Washington</orgname>
  	  <orgdiv>Department of Computer Science and Engineering</orgdiv>
  	  <address>
  	    <city>Seattle</city>
  	    <state>Washington</state>
  	    <postcode>98195</postcode>
  	    <country>U.S.A.</country>
  	    <email>gjb\@cs.washington.edu</email>
  	  </address>
  	</affiliation>
      </author>
    </authorgroup>
    <releaseinfo>Release pre-0.8</releaseinfo>
    <pubdate>$date</pubdate>
    <copyright>
      <year>1997&ndash;$year</year>
      <holder>Maciej Stachowiak and Greg J. Badros</holder>
    </copyright>
  </bookinfo>
  <chapter>
    <title>Primitives in Alphabetical Order</title>
END_HEADER
  ;
}


# This outputs the scwm-procedures.txt file to stdout
foreach my $proc (sort { lc($a) cmp lc($b) } keys %procedure) {
  my $usage = $procedure{$proc}{usage};
  my $comment = $procedure{$proc}{comment};
  my $file = $procedure{$proc}{file};
  my $line = $procedure{$proc}{line};
  my $markup = $procedure{$proc}{markup};
  print <<EOC
$usage
$comment
[From $file:$line]


EOC
  ;
  if ($opt_o) {
    print MARKUP_OUT $markup, "\n";
  }
}

# End the procedures chapter
if ($opt_o) {
  print MARKUP_OUT "  </chapter>\n";
}


# Now output primitives by defined-in file
if ($opt_o) {
  print MARKUP_OUT <<START_PROC_BY_FILE
  <chapter>
    <title>Primitives By Defined-in File</title>
START_PROC_BY_FILE
  ;
  
  foreach my $file (sort { lc($a) cmp lc($b) } keys %file_funcs) {
    my @prims = @{$file_funcs{$file}};
    if (scalar(@prims) > 0) {
      print MARKUP_OUT "    <sect1><title>$file</title> <itemizedlist>\n";
      foreach my $proc (sort { $procedure{$a}{line} <=> $procedure{$b}{line} } @prims ) {
	my $markup = $procedure{$proc}{markup};
	my $target = $procedure{$proc}{sgml_id};
	my $primname = $procedure{$proc}{primname};
	my $purpose = $procedure{$proc}{purpose};
	my $markup_purpose = $procedure{$proc}{markup_purpose};
	print MARKUP_OUT "      <listitem><para><link linkend=\"$target\"><function>$proc</function></link> &mdash; $markup_purpose</para></listitem>\n";
      }
      print MARKUP_OUT "    </itemizedlist> </sect1>\n";
    }
  }
  print MARKUP_OUT "  </chapter>\n";
}


# Now output hooks chapter
if ($opt_o) {
  print MARKUP_OUT <<START_HOOKS_CHAPTER
  <chapter>
    <title>Hooks</title>
START_HOOKS_CHAPTER
  ;
  foreach my $hook (sort { lc($a) cmp lc($b) } keys %hooks ) {
    my $markup = $hooks{$hook}{markup};
    print MARKUP_OUT "    <sect1 id=\"$hook\"><title>$hook</title><para>\n$markup
    </para></sect1>\n";
  }
  print MARKUP_OUT "  </chapter>\n";
}


# Now output vars chapter
if ($opt_o) {
  print MARKUP_OUT <<START_VARS_CHAPTER
  <chapter>
    <title>User variables</title>
START_VARS_CHAPTER
  ;
  foreach my $var (sort { lc($a) cmp lc($b) } keys %vars ) {
    my $markup = $vars{$var}{markup};
    print MARKUP_OUT "    <sect1 id=\"$var\"><title>$var</title><para>\n$markup
    </para></sect1>\n";
  }
  print MARKUP_OUT "  </chapter>\n";
}


# Now output concepts chapter
if ($opt_o) {
  print MARKUP_OUT <<START_CONCEPTS_CHAPTER
  <chapter>
    <title>Concepts</title>
START_CONCEPTS_CHAPTER
   ;
  foreach my $concept (sort { lc($a) cmp lc($b) } keys %concepts ) {
    my $markup = $concepts{$concept}{markup};
    my $concept_id = $concept;
    $concept_id =~ tr/ _/--/;
    print MARKUP_OUT "    <sect1 id=\"$concept_id\"><title>$concept</title><para>\n$markup
    </para></sect1>\n";
  }

  print MARKUP_OUT "  </chapter>\n";
}




# Now output sgml trailer
if ($opt_o) {
  print MARKUP_OUT <<END_TRAILER
</book>
<!-- Keep this comment at the end of the file
Local variables:
mode: sgml
fill-column: 10000
sgml-omittag:nil
sgml-shorttag:t
End:
-->
END_TRAILER
  ;
}


sub ProcessHeader( $ ) {
  my ($filename, $line, $header) = @_;
  my ($cprimname, $primname, $req, $opt, $var, $argslist) =
    $header =~ m%^SCWM_PROC\s*\(\s*([^, \t]*),\s*\"([^\"]*)\"\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*\(([^\)]*)\)%;
  if (!defined($cprimname)) {
    print STDERR "$filename:$.:****ERROR:could not parse argument list\n";
    return FALSE;
  }
  my ($comment) = $header =~ m%/\*\*\s*(.*)\s*\*/%s;
  my ($fname_define) = $header =~ m%^\s*\#\s*define\s+FUNC_NAME\s+(.*?)\s*$%m;
  
  my $clean_argslist = $argslist;
  my $cremovals = ($clean_argslist =~ s/\bSCM\b//g);
  $clean_argslist =~ s/\n/ /g;
  $clean_argslist =~ s/[ \t]+//g;
  $clean_argslist =~ s/_[pP]\b/?/g;
  $clean_argslist =~ s/_[xX]\b/!/g;
  $clean_argslist =~ s/_/-/g;
  my @args = split(/,/, $clean_argslist);

  # now convert the c function name into the expected (preferred) primitive name:
  my $expected_primname = $cprimname;
  $expected_primname =~ s/_[pP]\b/?/g;
  $expected_primname =~ s/_[xX]\b/!/g;
  $expected_primname =~ s/_/-/g;
  # alternative possibility (ignoring chance of multiple to's in string)
  my $expected_primname2 = $expected_primname;
  $expected_primname2 =~ s/-to-/->/g;

  
  # now create a hash of the names for testing words in the comment
  # whether they are referring to formal parameters
  my %argnames = map { uc($_) => 1} @args;

  if ($primname ne $expected_primname && $primname ne $expected_primname2) {
    if (!$fReallyQuiet) {
      print STDERR "$filename:$line:****$cprimname: expected scheme primitive named \`$expected_primname\'\n";
    }
  }

  if ($cremovals != scalar(@args) ) {
    print STDERR "$filename:$line:****$cprimname: types inconsistency (all args should be type SCM)\n";
  }

  if (($req + $opt + $var) != scalar(@args)  ) {
    print STDERR "$filename:$line:****$cprimname: argument inconsistency -- check #s of arguments\n";
  }

  if ($var != 0 && $var != 1) {
    print STDERR "$filename:$line:****$cprimname: number of variable arguments == $var -- why?\n";
  }

  if (!defined($comment) || $comment eq "") {
    if (!$fReallyQuiet) {
      print STDERR "$filename:$line:****$cprimname: comment missing\n";
    }
    $comment = "";
  }

  if (!defined($fname_define)) {
    print STDERR "$filename:$line:****$cprimname: \`#define FUNC_NAME s_$cprimname\' is missing\n";
  } elsif ($fname_define ne "s_".$cprimname) {
    print STDERR "$filename:$line:****$cprimname: \`#define FUNC_NAME s_$cprimname\' does not match function name \`$fname_define\'\n";
  }

  my @required_args = @args[0..($req-1)];
  my @optional_args = @args[$req..($req + $opt - 1)];
  my @var_args = @args[($req+$opt)..($req+$opt+$var-1)];

  my $arg_listing = "";
  if ($#args >= 0) {
    $arg_listing .= "@required_args";
    if ($#optional_args >= 0) {
      $arg_listing .= " #&optional @optional_args";
    }
    if ($#var_args >= 0) {
      $arg_listing .= " . @var_args";
    }
  }

  my $usage = sprintf "($primname%s$arg_listing)", ($arg_listing ne ""? " ":"");

  my %upcase_words = ();

  # check to make sure all all-uppercase words in the comment
  # refer to formals
  foreach my $word (split /[^-+%?\$!\w_\"]+/, $comment) {
    if ($word =~ /^[A-Z][-%A-Z0-9_+?!]+$/) {
      if (!defined($argnames{$word})) {
	if (!$fReallyQuiet) {
	  print STDERR "$filename:$line:****$cprimname: all-uppercase word \`$word\' does not match an argument\n";
	}
      }
    }
  }

  # check to make sure all formals are referred to in the comment
  foreach my $formal (keys %argnames) {
    if ($comment !~ /\Q$formal\E/) {
      if (!$fReallyQuiet) {
	print STDERR "$filename:$line:****$cprimname: formal $formal not mentioned in comment\n";
      }
    }
  }

  my ($purpose) = $comment =~ m%(.*?[\.;\n])%;
  if (!defined $purpose) {
    # did not match, so must have been a one-liner w/o a newline
    $purpose = $comment;
  }
  chomp ($purpose); # in case it matched the newline
  if (!defined($purpose) || $purpose !~ /\.\s*$/) {
    if (!$fQuiet) {
      print STDERR "$filename:$line:****$cprimname: first line of comment should be a purpose sentence\n";
    }
  }

  # Clean up spacing in $comment -- use \n instead of $ since
  # the latter matches before the new line
  $comment =~ s/^\s*\n//mg;
  # delete whitespace-only lines in old-comment, so we don't get a warning
  # on them -- they're useful to avoid Emacs's reindent-paragraph
  # from causing the synopsis sentence to have extra words tacked onto the end

  my $old_comment = $comment;

  $comment =~ s/^\s+//mg;
  if ($comment ne $old_comment) {
    if (!$fQuiet) {
      print STDERR "$filename:$line:****$cprimname: leading spaces (indentation) is being omitted\n";
    }
  }

  # Clean up trailing space, but don't warn about it
  $comment =~ s/\s+$//mg;

  IspellText($filename,$line,$comment) if $opt_s;

  my $sgml_id = $cprimname;
  $sgml_id =~ s/_/-/g;

  # Now want to do the markup of $comment, and set $markup_comment
  # FIXGJB: fold into testing, above
  my $markup_comment = MarkupComment($comment);

  # Mark formals within comment with <parameter> tag
  # must sort by length so longer formals get replaced first
  # note that it is essential to convert to lowercase as we
  # go, otherwise shorter substitutions will be made inside
  # an already-substitued <param> </param> pair
  foreach my $formal (sort { length($b) <=> length($a) } keys %argnames) {
    $markup_comment =~ s%(\Q$formal\E)%<parameter>\L$1\E</parameter>%g;
  }

  # Just do simple markup of the usage
  my $markup_usage = $usage;
  $markup_usage =~ s%(\s+)&(\s+)%$1&amp;$2%g;
  $markup_usage =~ s%(\s+)<(\s+)%$1&lt;$2%g;
  $markup_usage =~ s%(\s+)>(\s+)%$1&gt;$2%g;

  my $markup_purpose = MarkupComment($purpose);

  # FIXGJB: use some sgml perl library!

  # Use <refentry>, <refname>, <refpurpose>, <synopsis>
  
  # Filename url links rely on environment variable SCWMDIR being
  # set to the base of the scwm distribution
  # i.e. $SCWMDIR/scwm/scwm.c should contain main()
  my $markup = 
"<refentry id=\"$sgml_id\">
  <refnamediv>
    <refname>$primname</refname>
    <refpurpose>$markup_purpose</refpurpose>
  </refnamediv>
  <refsynopsisdiv>
    <synopsis>$markup_usage</synopsis>
  </refsynopsisdiv>
  <refsect1>
  <title>Description</title>
  <para>
  $markup_comment
  </para>
  </refsect1>
  <refsect1>
  <title>Implementation Notes</title>
  <para> Defined in <ulink url=\"file://@{[$ENV{'SCWMDIR'}]}/$filename\"><filename>$filename</filename></ulink>
  at line $line </para> </refsect1>
</refentry>
";


  if ($fDebug) {
    print STDERR <<EOC
scheme-primitive-name:	$primname
C-primitive-name:	$cprimname
arg kinds:		$req, $opt, $var
args:			@{[join(",",@args)]}
arg listing:		$arg_listing
fname define:           $fname_define
EOC
;
  }
  $procedure{$primname} = { usage => $usage,
			    comment => $comment,
			    purpose => $purpose,
			    sgml_id => $sgml_id,
			    markup_purpose => $markup_purpose,
			    markup_usage => $markup_usage,
			    markup => $markup,
			    file => $filename,
			    line => $line,
			  };
  push @{$file_funcs{$filename}}, $primname;

  return TRUE;
}

# remember, no underscores in sgml ids
sub ScmIdToSgmlId ( $ ) {
  my ($id) = @_;
  $id =~ s/\?$/-p/;
  $id =~ s/!$/-x/;
  $id =~ s/->/-to-/g;
  $id =~ s/_/-/;
  return $id;
}

sub ReadRestOfComment ( $ ) {
  my ($comment) = @_;
  while (($_ = <>) !~ m%\*/%) {
    $comment .= $_;
  }
  $comment .= $_;
  return $comment;
}

sub ProcessConceptComment ( $$$$ ) {
  my ($filename,$line,$description,$comment) = @_;
  $comment =~ s%\*/\s*$%%m;
  if ($fDebug) {
    print STDERR "Concept \`$description\' with body = \n$comment\n";
  }

  IspellText($filename,$line,$comment) if $opt_s;

  my $markup = MarkupComment($comment);

  $concepts{$description} = { comment => $comment,
			      markup => $markup,
			      file => $filename,
			      line => $line,
			    };
}

sub ProcessHookComment ( $$$$ ) {
  my ($filename,$line,$description,$comment) = @_;
  $comment =~ s%\*/\s*$%%m;
  if ($fDebug) {
    print STDERR "Hook \`$description\' with body = \n$comment\n";
  }

  IspellText($filename,$line,$comment) if $opt_s;

  my $markup = MarkupComment($comment);
  $hooks{$description} = { comment => $comment,
			   markup => $markup,
			   file => $filename,
			   line => $line,
			 };
}

sub ProcessVarComment ( $$$$ ) {
  my ($filename,$line,$description,$comment) = @_;
  $comment =~ s%\*/\s*$%%m;
  if ($fDebug) {
    print STDERR "Var \`$description\' with body = \n$comment\n";
  }

  IspellText($filename,$line,$comment) if $opt_s;

  my $markup = MarkupComment($comment);
  $vars{$description} = { comment => $comment,
			   markup => $markup,
			   file => $filename,
			   line => $line,
			 };
}


# FIXGJB
sub MarkupComment( $ ) {
  my ($body) = @_;

  # convert & into &amp; space-delimited <, > into &lt; and &gt;
  $body =~ s%&%&amp;%g;
  $body =~ s%(\s+)<(\s+)%$1&lt;$2%g;
  $body =~ s%(\s+)>(\s+)%$1&gt;$2%g;

  # Mark #t and #f within comment with <literal> tag
  $body =~ s%(\#[tf])%<literal>$1</literal>%g;

  # Replace `procedure' with <function>procedure</function>
  $body =~ s%\`([A-Za-z][-A-Za-z0-9_?!+]*?)\'%
    "<link linkend=\"" . ScmIdToSgmlId($1) . "\"><function>$1</function></link>"%eg;

  return $body;
}

sub IspellText( $$$ ) {
  my ($filename,$line,$text,$response) = @_;
  foreach my $word (split /[\d\W]+/, $text) {
    # ispell is picky about lots of stuff, so ignore them
    next if $word =~ /^[-\#]/;
    next if $word !~ /^\w\w+/;
    next if $word eq uc($word);
    print STDERR "ispell trying $word -> " if $fDebug;
    my $junk = <ISPELL_RESPONSE>; # read the blank
    print ISPELL $word, "\n";
    chomp (my $response = <ISPELL_RESPONSE>);
    print STDERR "response = \`$response\'\n" if $fDebug;
    if ($response eq "") {
      print STDERR "$filename:$line:****ISPELL is out of sync (last word \`$word\') -- aborting its use!\n";
      $opt_s = FALSE;
      last;
    }
    if ($response !~ m/^[+\*]/) {
      print STDERR "$filename:$line:****ispell reported possible misspelling: $word -> $response\n";
      print STDERR "Should I add `$word' to my list of known correct words? ";
      $response=<STDIN>;
      if ($response=~/^y(es)?$/i) {
	print ISPELL "*",lc($word),"\n";
      }
    }
  }
}
