#!/usr/local/bin/perl -w
#
# Copyright (c) 1998,1999  David Hiebeler
# For licensing information, see the "printLicense" function
# down around line 58.
#
# File: cedictcheckformat, version 1.1
#   By: David Hiebeler
#       Center for Applied Math
#       Cornell University
#       Ithaca, NY 14853
#       hiebeler@cam.cornell.edu
#       http://www.cam.cornell.edu/hiebeler/home.html
#
#       Version 1.1: June 1999
#       Version 1.0: July 1998
#
# This is a Perl script for checking a cedict-format vocabulary file.
# For now, it just reads the vocabulary list from stdin.
#
# Basically, it checks to see if lines are of the form
# "chinese-characters [pinyin stuff] /English definitions/maybe more defs/",
# as follows:
#
# 1) It catches lines that are missing trailing slashes, the square brackets
#    around the pinyin, etc.
# 2) It checks to see whether every pinyin word consists of lower-case letters
#    and ends with a digit from 1-5.
# 3) If the "-num" flag is specified, it also allows a string of digits at
#    the beginning of the line, indicating difficulty levels for practicing
#    the characters.  However, note that this feature will be discontinued
#    in a future release (unless someone says they really want it).  I now
#    embed the skill-level information that I use inside the English definition
#    field.
# 4) It checks to see whether the number of Chinese characters and number
#    of pinyin words match, if you use the "-checklen" argument.
#
# Sample usage: "cedictcheckformat < cedict.gb"
# Run "cedictcheckformat -help" for usage info.
#
# Hopefully, future versions of this script will be better at handling
# punctuation such as parentheses in the Chinese and pinyin fields.
#
# Wish-list:
# * allow square brackets inside the English field
# * allow commas and slashes inside the pinyin field
#
# History:
#   10 Jun 1999: Added code to check whether the number of Chinese characters
#                and number of pinyin words are equal.
#   08 Sep 1998: bug fix to allow a colon after a "u" in the pinyin field
#                (e.g. so pinyin words such as "nu:3" will be recognized,
#		 which is an alternate form of writing "nuu3").
#   29 Jul 1998: original version, 1.0



# Write the version banner, copyright notice, and GPL summary to standard
# output.
sub printLicense {
    # The heredoc is single-quoted, so nothing is interpolated and the "@"
    # in the e-mail address can be written literally.
    my $license_text = <<'END_OF_LICENSE';
cedictcheckformat version 1.1   June 10, 1999
Copyright (C) 1998,1999  David Hiebeler
                         Center for Applied Math
                         Cornell University
                         Ithaca, NY 14853
                         hiebeler@cam.cornell.edu
                         http://www.cam.cornell.edu/hiebeler/home.html

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

END_OF_LICENSE
    print $license_text;
}


#
# Read the next meaningful line of input and return it without its
# trailing newline.  Lines that are empty, contain only whitespace, or
# whose first non-blank character is "#" are skipped; on any other line,
# everything from the first "#" onward is removed before it is returned.
#
# Arguments: none, to read via <> ; or one filehandle to read from.
#            More than one argument is a fatal error.
# Returns:   the cleaned-up line, or undef at end of input.
# Side effect: the global $getlinelinenum is incremented for every line
#            consumed, including skipped ones, so that callers can report
#            the number of the line just returned.
#
$getlinelinenum = 0;
sub getline {
    die "getline must be called with at most one argument (a filehandle)"
        if @_ > 1;
    my $from_argv = (@_ == 0);
    my $fh = $_[0];

    while (1) {
        # A lexical is used rather than $_ so the caller's $_ is untouched.
        my $line = $from_argv ? <> : <$fh>;
        return undef unless defined($line);

        $getlinelinenum++;
        next if $line =~ /^\s*#/;
        next if $line =~ /^\s*$/;
        $line =~ s/#.*$//;
        # chomp, not chop: chop removes the last character whatever it is,
        # which truncated a final line that had no terminating newline.
        chomp($line);
        return $line;
    }
}


#
# Write the command-line synopsis to standard output and terminate the
# program with exit status 2.  This sub does not return.
#
sub printusage
{
    # One interpolating heredoc; $0 expands to the name the script was
    # invoked under.
    print <<"END_OF_USAGE";
Usage: $0 [-strictpy | -nostrictpy] [-strictfmt | -nostrictfmt]
          [-filter | -nofilter] [-checklen | -nochecklen] [-license]
 -strictpy: don't allow commas, periods, and parentheses in the pinyin
 -strictfmt: be more picky about spaces between Chinese and pinyin, etc.
 -filter: run as a filter, output good lines (otherwise, bad lines are
          printed, along with their line numbers)
 -checklen: check the lengths of the Chinese and pinyin fields (i.e. check to
          see that the number of Chinese characters and number of pinyin words
          are equal)
 -license: Print licensing information


By default, "strictfmt" is enabled.
END_OF_USAGE
    exit(2);
}


#
# Check the validity of a pinyin phrase.
#
# Argument: the pinyin text to check (the caller's string is not modified).
# Returns:  1 if the pinyin is bad, 0 if it is good.
#
# Unless the global $strictpy is non-zero, runs of periods, commas, and
# parentheses are treated as word separators.  The phrase is then split on
# whitespace, and every word must consist of one or more lower-case
# letters, any of which may be followed by "u:" (so "nu:3" is accepted),
# and must end with a single digit from 1 to 5.  A phrase containing no
# words at all is bad.
#
sub checkpinyin
{
    # Work on a copy: the elements of @_ alias the caller's variables, so
    # substituting into $_[0] would rewrite the caller's string (or die
    # when the argument is read-only).
    my ($phrase) = @_;
    if ($strictpy == 0) {
        $phrase =~ s/[.,()]+/ /g;  # punctuation becomes a word separator
    }

    my @syllables = split(" ", $phrase);
    return 1 unless @syllables;

    foreach my $syllable (@syllables) {
        # "+" requires at least one letter before the tone digit, so a
        # bare digit such as "3" is not accepted as a pinyin word.
        return 1 unless $syllable =~ m/^(?:[a-z](?:u:)?)+[12345]$/;
    }
    return 0;
}
    

##############
# Main program
##############
#
# Parse the command-line flags, then read entries with getline() and
# validate each one.  Without -filter, every bad line is printed once,
# prefixed with its line number.  With -filter, only the good lines are
# printed, unchanged.
#
$donum = 0;      # -num: each line starts with a string of digits
$strictpy = 0;   # -strictpy: read by checkpinyin()
$strictfmt = 1;  # -strictfmt: selects the Chinese-field pattern below
$filter = 0;     # -filter: print good lines instead of bad ones
$checklen = 0;   # -checklen: compare Chinese and pinyin field lengths

# Test with defined() so that an argument such as "0" or "" is examined
# (and rejected as unknown) instead of silently ending option parsing.
while (defined($thisarg = shift(@ARGV))) {
    if ($thisarg eq "-num") { $donum = 1; }
    elsif ($thisarg eq "-nonum") {$donum = 0;}
    elsif ($thisarg eq "-strictpy") {$strictpy = 1;}
    elsif ($thisarg eq "-nostrictpy") {$strictpy = 0;}
    elsif ($thisarg eq "-strictfmt") {$strictfmt = 1;}
    elsif ($thisarg eq "-nostrictfmt") {$strictfmt = 0;}
    elsif ($thisarg eq "-filter") {$filter = 1;}
    elsif ($thisarg eq "-nofilter") {$filter = 0;}
    elsif ($thisarg eq "-checklen") {$checklen = 1;}
    elsif ($thisarg eq "-nochecklen") {$checklen = 0;}
    elsif ($thisarg eq "-license") { printLicense(); exit(0); }
    else { printusage(); }
}

# NOTE(review): $chnStr is used only by the -num pattern; lines without a
# numeric prefix are always matched strictly, so -nostrictfmt has no effect
# on them.  Confirm whether that is intended.
if ($strictfmt) {
    $chnStr = "[^\\s]+\\s+";
}
else {
    $chnStr = ".*";
}

# defined() keeps a line consisting of just "0" from ending the loop early.
while (defined($line = getline())) {
    # A line counts as bad until it matches the expected layout and passes
    # every enabled check.
    $bad = 1;

    if ($donum) {
        # The Chinese field is still captured so that the pinyin stays in
        # $2.  NOTE(review): -checklen is not applied in -num mode.
        if ($line =~ m@^\s*[0-9]+\s+($chnStr)\[(.+)\]\s*/.*/\s*$@) {
            $pinyin = $2;
            $bad = checkpinyin($pinyin) ? 1 : 0;
        }
    }
    elsif ($line =~ m@^\s*([^\s]+)\s+\[(.+)\]\s*/.*/\s*$@) {
        $chinese = $1;
        $pinyin = $2;
        # Count the pinyin words before calling checkpinyin(), as the
        # count is taken from the pinyin field exactly as written.
        my @pywords = split(" ", $pinyin);
        my $pylen = scalar(@pywords);
        my $chnlen = length($chinese);

        $bad = checkpinyin($pinyin) ? 1 : 0;
        # Each Chinese character is expected to contribute 2 to length();
        # presumably the input is a double-byte encoding such as GB --
        # TODO confirm before using with other encodings.
        if ($checklen && $chnlen != $pylen*2) {
            $bad = 1;
        }
    }

    # All checks run before anything is printed, so -filter really drops
    # bad lines and a line failing several checks is reported only once.
    if ($filter) {
        print "$line\n" unless $bad;
    }
    elsif ($bad) {
        print "$getlinelinenum: $line\n";
    }
}
