#!/usr/local/bin/perl -w
#
# Copyright (c) 1998  David Hiebeler
# For licensing information, see the "printLicense" function
# down around line 70.
#
#
# File: cedictsort, version 1.1
#   By: David Hiebeler
#       Center for Applied Math
#       Cornell University
#       Ithaca, NY 14853
#       hiebeler@cam.cornell.edu
#       http://www.cam.cornell.edu/hiebeler/home.html
#
#       Version 1.1: December 1998
#       Version 1.0: July 1998
#
# This is a Perl script for sorting a CEDICT-format file (see
# "http://www.mindspring.com/~paul_denisowski/cedict.html" for
# information about CEDICT), based on pinyin romanization of the
# pronunciation.  It tries to sort in the standard order that you
# see in most Chinese (hardcopy) dictionaries.
#
# Usage: cedictsort file1 [file2 file3 ...]
# You can sort stdin, by using "-" as a file name, e.g.
#     cat myfile.gb | cedictsort -
#
# Use can use the "-uu2u:" command-line argument to turn pinyin entries
# like "nuu3" into "nu:3", and the "-u:2uu" argument to do the opposite,
# i.e. turn "nu:3" into "nuu3".  (This feature is available because
# both forms have appeared in various versions of CEDICT).
#
# This script should work correctly on both GB and BIG5 files.
#
# Note that this script will exit if it encounters any lines not
# in cedict format, with the following exception: it will ignore (and
# discard) any blank lines, and discard any comments which begin
# with '#' (whether the comment is the only thing on a line, or at the
# end of a line).  You should use the "cedictcheckformat" script
# to catch any lines in your vocabulary file which are not in strict
# CEDICT format, before trying to sort.
#
# History:
#   10 Dec 1998: added code to turn "uu" into "u:" or vice-versa in the
#      pinyin field if the user requests it, to handle the fact that both
#      forms were present in cedict for some time, or may be present in
#      various Chinese documents you encounter.
#   31 July 1998: added code to print out "levels" info for each entry,
#      if present (the code previously could read files with levels info,
#      but discarded the information when outputting sorted results)
#   29 July 1998: original version, 1.0
#        


# The two Chinese characters "bu4" and "yi1" will be sorted by their
# "standalone" pinyin pronunciation, even though they actually change their
# tones depending on the character following them.
$bu4gb = "";
$yi1gb = "һ";
$bu4big5 = "";
$yi1big5 = "@";


# Define a couple of constants
$uu2uc = 1;
$uc2uu = 2;


sub printLicense {
    print <<"END_OF_LICENSE";
cedictsort version 1.1   June 10, 1999
Copyright (C) 1998,1999  David Hiebeler
                         Center for Applied Math
                         Cornell University
                         Ithaca, NY 14853
                         hiebeler\@cam.cornell.edu
                         http://www.cam.cornell.edu/hiebeler/home.html

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

END_OF_LICENSE
}


#
# Print a usage message and exit.
#
sub printusage {
    print "Usage: $0 [-uu2u: | -u:2uu] file1 [file2 file3 ...]\n";
    print "You can sort standard input, by using '-' as a file name, e.g.\n";
    print "    cat myfile.gb | cedictsort -\n";
    print " -uu2u: Turn pinyin entries like `nuu3' into `nu:3' (default = don't)\n";
    print " -u:2uu : Turn pinyin entries like `nu:3' into `nuu3' (default = don't)\n";
    exit 2;
}


#
# Read a line from a file or stdin, removing comments which begin with "#",
# and ignoring empty lines (or lines which only have a comment)
#
sub getline {
    if ($#_ == -1) {
	while (<>) {
	    next if /^\s*#/;
	    next if /^\s*$/;
	    s/#.*$//;
	    chop;
	    return $_;
	}
	return undef;
    }
    elsif ($#_ == 0) {
	$fh = $_[0];
    }
    else {
	die "getlinefp must be called with a single argument or no arguments";
    }
    while (<$fh>) {
	next if /^\s*#/;
	next if /^\s*$/;
	s/#.*$//;
	chop;
	return $_;
    }
    return undef;
}


#
# read in a vocabulary file
#
sub readvocabfile {
    my $levels;
    my $chinese;
    my $english;
    my $pinyin;
    my $chineseLength;
    my $i = 0;
    my $tmpChineseChar;
    my $tmpChinese;

    open(INFILE, $_[0]) or die "Couldn't open infile '$_[0]'";
    while ($line=getline("INFILE")) {
	# handle case where line has skill level(s) at beginning
	if ($line =~ m@^\s*([0-9]+)\s*(.+)\s*\[(.+)\]\s*(/.*/)\s*$@) {
	    ($levels,$chinese,$pinyin,$english) = ($1,$2,$3,$4);
	    $chinese =~ s/\s+$//;  # truncate trailing spaces on chinese
	    $levels .= " ";
	}
	# line doesn't have skill level numbers at beginning
	elsif ($line =~ m@^\s*(.+)\s*\[(.+)\]\s*(/.*/)\s*$@) {
	    ($chinese,$pinyin,$english) = ($1,$2,$3);
	    $chinese =~ s/\s+$//;  # truncate trailing spaces on chinese
	    $levels = "";
	}
	else {
	    $line =~ s/[\n\r]//;
	    print "Invalid line: `$line'\n";
	    die "Invalid line encountered";
	}

	# Convert "uu" into "u:" or vice-versa in pinyin field,
	# if the user requested it.
	if ($uConvert == $uu2uc) {
	    $pinyin =~ s/uu/u:/;
	}
	elsif ($uConvert == $uc2uu) {
	    $pinyin =~ s/u:/uu/;
	}

	# we associate a unique number (kept track of by the variable
	# $chineseCharsHashCounter) with every Chinese character which appears
	# in an entry, to use for sorting later
	$tmpChinese = $chinese;
	$tmpChinese =~ s/[\.,\(\)\w]+/ /g; # strip out periods, commas, parens, letters, and numbers
	$chineseLength = length($tmpChinese) / 2;
	for ($i=0; $i < $chineseLength; $i++) {
	    $tmpChineseChar = substr($tmpChinese, $i*2, 2);
	    if (! defined($chineseChars{$tmpChineseChar})) {
		$chineseChars{"$tmpChineseChar"} = ++$chineseCharsHashCounter;
	    }
	}

	# now put everything into the main array of hashes
	$wordList[$vocabIndex]->{"levels"} = "$levels";
	$wordList[$vocabIndex]->{"chinese"} = $chinese;
	$wordList[$vocabIndex]->{"english"} = $english;
	$wordList[$vocabIndex]->{"pinyin"} = $pinyin;
	# used for extreme tie-breaking when sorting
	$wordList[$vocabIndex]->{"index"} = $vocabIndex;
	$vocabIndex++;
    }
    close INFILE;
}


#
# Here is the heart of this Perl script.  This is the function which
# is used (via the Perl "sort" routine) to sort everything.
#
sub byRealPinyinAndChinese {

    # if we just sort by the pinyin (or rather the adjusted "realpinyin"
    # which handles the special characters "bu4" and "yi1"), it won't properly
    # group characters with the same pronunciation (i.e. it may group entries
    # with different characters with the same sound, and split up entries
    # with the same character).  So we have to go through the realpinyin a
    # word (or rather, a Chinese character) at a time, and if the sounds are
    # the same, check the actual Chinese characters.  If the Chinese
    # characters are the same too, go to the next pinyin sound (and character,
    # if necessary), until a difference is found or we fall off the end of
    # one of the strings.  If we fall off the end of a string, the shorter
    # string is considered to come "first".

    # first some local variables to use
    my ($aLength, $bLength, $minLength, $aChinese, $bChinese);
    my ($aChineseChar,$bChineseChar, $aCharInd, $bCharInd, @aPinyin, @bPinyin);

    $aChinese = $a->{chinese};
    $bChinese = $b->{chinese};
    $aChinese =~ s/[\.,\(\)\w]+/ /g; # strip out periods, commas, parens, letters, and numbers
    $bChinese =~ s/[\.,\(\)\w]+/ /g; # strip out periods, commas, parens, letters, and numbers
    $aLength = length($aChinese) / 2;
    $bLength = length($bChinese) / 2;
    $minLength = ($aLength < $bLength) ? $aLength : $bLength;
    # split pinyin into separate words so we can go through one at a time
    @aPinyin = split(' ', $a->{realpinyin});
    @bPinyin = split(' ', $b->{realpinyin});

    for ($i=0; $i < $minLength; $i++) {
	# if for some reason one of the pinyin fields is shorter than we
	# expect, just let that one "win" (come first).
	if (!defined($aPinyin[$i])) { return -1; }
	if (!defined($bPinyin[$i])) { return 1; }
	if ($aPinyin[$i] lt $bPinyin[$i]) { return -1; }
	elsif ($bPinyin[$i] lt $aPinyin[$i]) { return 1; }
	else {   # shoot, the pinyin is the same, have to look at characters
	    $aChineseChar = substr($aChinese, $i*2, 2);
	    $aCharInd = $chineseChars{$aChineseChar};
	    $bChineseChar = substr($bChinese, $i*2, 2);
	    $bCharInd = $chineseChars{$bChineseChar};
	    if ($aCharInd < $bCharInd) { return -1; }
	    elsif ($bCharInd < $aCharInd) { return 1; }
	    # if we get here, then the chinese characters were the same
	    # too, so we just continue the loop and try the next char
	}
    }
    # if we get here, the strings were the same, up to the length of the
    # shorter one.  The shorter one will be considered "first".
    if ($aLength < $bLength) { return -1; }
    elsif ($bLength < $aLength) { return 1; }
    else { # strings are totally the same; use their indices as tie-breakers
	if ($a->{"index"} < $b->{"index"}) { return -1; }
	elsif ($b->{"index"} < $a->{"index"}) { return 1; }
	else { die "That's funny, I really shouldn't be able to die this way while sorting"; }
    }

# This was the old simple way, which I hoped would work, but didn't, because
# shorter strings would win over longer ones, so all the short strings
# beginning with different "bu4" characters would come before all the longer
# strings beginning with the different "bu4" characters, etc.
#
#    $a->{"realpinyin"} cmp $b->{"realpinyin"}
#    or
#	$a->{"chinese"} cmp $b->{"chinese"}
#    or
#	$a cmp $b;
}


#
# This routine takes the pinyin of a given vocabulary entry, and modifies
# it (or rather, a copy of it) slightly.  If it sees the Chinese character
# "bu4" (meaning "not/no"), then it makes sure the character is labelled
# as "bu4" in pinyin, rather than "bu2" as it is pronounced when it comes
# before another 4th-tone character.  Similarly, it makes sure "yi1" (one) 
# is always labelled as "yi1" in pinyin, rather than "yi2" or "yi4" as it
# can also be pronounced.  This is so entries containing these characters
# will be sorted by the Chinese characters with their original pronunciation,
# as real Chinese dictionaries do (or at least the ones I've seen).
sub buildRealPinyin {
    # local vars
    my ($tmpChinese, $chineseLength);

    foreach $word (@wordList) {
	$tmpChinese = $word->{"chinese"};
	$tmpChinese =~ s/[\.,\(\)\w]+/ /g; # strip out periods, commas, parens, letters, and numbers
	$chineseLength = length($tmpChinese) / 2;
	$word->{"realpinyin"} = $word->{"pinyin"};
	$word->{"realpinyin"} =~ s/[\.,\(\)]+/ /g;  # strip out periods, commas, and parentheses
	# separate the pinyin into words
	@pinyinWords = split(" ", $word->{"realpinyin"});

	# now go through one pinyin word at a time, and adjust if necessary
	for ($i=0; $i < $chineseLength; $i++) {
	    $tmpStr = substr($tmpChinese, $i*2, 2);
	    if (($tmpStr eq $bu4gb) || ($tmpStr eq $bu4big5)) {
		$pinyinWords[$i] = "bu4";
	    }
	    elsif (($tmpStr eq $yi1gb) || ($tmpStr eq $yi1big5)) {
		$pinyinWords[$i] = "yi1";
	    }
	}
	# put the words back together
	$word->{"realpinyin"} = join(' ', @pinyinWords);
    }
}


#
# Print out the vocabulary list
#
sub printVocab {
    foreach $word (@wordList) {
	print "$word->{levels}",
	"$word->{chinese} [$word->{pinyin}] $word->{english}\n";
    }
}


#
# I just used this routine for debugging when writing the script...
#
sub printHash {
    foreach $key (keys %chineseChars) {
	print "$key -> $chineseChars{$key}\n";
    }
}

##############
# Main program
##############

# Gee, the main program is pretty simple after all the above stuff.

print "hi there\n";
$uConvert = 0;
while ($thisarg = shift()) {
    if ($thisarg eq "-uu2u:") {
	$uConvert = $uu2uc;
    }
    elsif ($thisarg eq "-u:2uu") {
	$uConvert = $uc2uu;
    }
    elsif ($thisarg eq "-license") { printLicense(); exit(0); }
    else {  # put back this arg, it is supposed to be a filename
	unshift(@ARGV, $thisarg);
	last;
    }
}

$chineseCharsHashCounter = 0; # used to give every Chinese char a unique number
%chineseChars = (); # hash which will hold the unique numbers for Chinese chars

if ($#{ARGV} < 0) { printusage; }
$vocabIndex = 0;

# read in the file(s)
foreach $fileName ( @ARGV ) {
    readvocabfile($fileName);
}
buildRealPinyin();

# the next 4 lines were only here for debugging...
#printVocab();
#print "================\n";
#printHash();
#print "================\n";

@wordList = sort byRealPinyinAndChinese @wordList;
printVocab();
