: Use /bin/sh # # $Id: munchlist.X,v 1.38 1992/01/04 22:08:18 geoff Exp $ # # Copyright 1987, 1988, 1989, by Geoff Kuenning, Manhattan Beach, CA # Permission for non-profit use is hereby granted. # All other rights reserved. # See "version.h" for a more complete copyright notice. # # $Log: munchlist.X,v $ # Revision 1.38 1992/01/04 22:08:18 geoff # Don't use the non-portable "-x" test; use "-r" as a substitute. Fix a # misplaced switch in a sort command. # # Revision 1.37 91/09/11 23:22:53 geoff # Add the "-u" flag to the sort of CROSSEXPANDED. Add a PAIRUP step to # the calculation of MINIMALAFFIXES to protect against affix files that # generate so many options that they break limited versions of awk. # # Revision 1.36 91/07/27 20:48:33 geoff # Make the default language tables be configurable from the Makefile, # the way they should have been all along. # # Revision 1.35 91/07/04 00:04:22 geoff # Add support for the -T switch (passed to icombine). # # Revision 1.34 91/07/03 18:21:04 geoff # Don't use the ":-" notation in defining TMPDIR, since some # braindamaged Bourne shells don't handle it. # # Revision 1.33 91/06/23 22:14:51 geoff # Fix a typo that was a syntax error # # Revision 1.32 91/01/27 00:43:36 geoff # Add the "-u" switch to the final sort to make absolutely sure that there # are no duplicates in the output file. # # Revision 1.31 90/09/05 02:35:38 geoff # Before ln'ing things, rm -f them to handle BSD systems in which ln correctly # refuses to make links to existing files. # # Revision 1.30 89/12/27 22:30:48 geoff # Be sure to specify -W0 when crunching the word list. # # Revision 1.29 89/04/27 23:33:13 geoff # Add support for the selectable flag marker character. # # Revision 1.28 89/02/18 00:52:32 geoff # Add another icombine step to make sure that unnecessary capitalizations # are dropped. # # Revision 1.27 88/12/26 02:31:35 geoff # Update the copyright notice. 
# # Revision 1.26 88/11/25 19:53:09 geoff # Add the -c (convert old language table) option. # # Revision 1.25 88/11/16 02:20:09 geoff # Make the default "wchars" argument be "-wA" so that ispell won't get # a null argument. Make sure the fake hash file ends in ".hash". # # Revision 1.24 88/04/11 01:37:47 geoff # Accept "-" to indicate standard input, and "--" to indicate the end of # the options. # # Revision 1.23 88/04/03 23:11:49 geoff # Fix the -w option so it doesn't require quoting any more, and so it # doesn't unintentionally add a blank to the -w list. # # Revision 1.22 88/03/30 00:14:11 geoff # Replace two sorts with uniqs for speed. # # Revision 1.21 88/03/29 00:29:43 geoff # Return to using icombine rather than buildhash to combine suffixes. # Also, be sure to include the suffixes in the sort -u steps so we don't get # stuck with one suffix per root. # # Revision 1.20 88/03/27 01:03:22 geoff # Make the large awk script be a separate file, so it doesn't core-dump # some shells. Run buildhash with the -s switch. Build and use # various hash files only if the word lists are of nonzero size. Save # the statistics files for MINIMALAFFIXES in debug mode. # # Revision 1.19 88/03/12 02:45:00 geoff # Add a comment about the uselessness of the -w option. Fix error exits # on buildhash failures to actually exit. Sort MINIMALAFFIXES correctly # (folded) so the following buildhash will run properly. Replace the final # comm step with an ispell -l step. # # # Revision 1.18 88/02/28 23:17:44 geoff # Fix an English error in a comment. # # Revision 1.17 88/02/20 23:13:33 geoff # Many major changes to support the new dictionary structure, capitalization # handling, and ispell/buildhash options. # # Revision 1.16 87/09/24 23:47:00 geoff # Allow icombine to be gotten from the current directory, so that the # script can be run before installation is complete. 
# # Revision 1.15 87/09/14 22:38:47 geoff # Add copyright comments # # Revision 1.14 87/07/28 22:50:23 geoff # Remove the -e switch, and add -D. Completely redo the cross-product # handling. # # Revision 1.13 87/07/20 23:23:11 geoff # Major rewrites and improvements to make it independent of the # contents of the language tables. # # Revision 1.12 87/04/24 20:32:43 geoff # Sort the input to icombine properly. # # Revision 1.11 87/04/21 23:29:07 geoff # Swap the order of the final sort and combine, so that combine is sure # to get stuff in the right order. This unfortunately increases temp file # requirements. # # Revision 1.10 87/04/19 22:53:22 geoff # Add capitalization handling. Mostly this consists of folding the sorts # and running more expand scripts. # # Revision 1.9 87/03/28 23:17:58 geoff # Get rid of a now-obsolete tr # # Revision 1.8 87/03/28 19:22:01 geoff # Fix the problem that prevented recognition of root words that were # already in the main dictionary. Also make sure every ispell is # passed the stuff from the -w option. # # Revision 1.7 87/03/27 17:21:36 geoff # Replace all the awks and one of the sorts with icombines. # # Revision 1.6 87/03/26 19:09:45 geoff # Remove DEFDICT; it's obsolete # # Revision 1.5 87/03/26 00:30:46 geoff # Integrate Rich Salz's changes/improvements # # Revision 1.4 87/03/24 22:54:42 geoff # Handle zero arguments correctly # # Revision 1.3 87/03/13 22:37:55 geoff # Add the -d option and code to strip out words covered by the main dictionary. # Also make sure that /dev/null is always used for main and personal # dictionaries so ispell doesn't get confused or start slow. # # Revision 1.2 87/03/08 20:31:16 geoff # Major changes to make faster and to make it work right (or at least better). # # Revision 1.1 87/03/01 02:20:15 geoff # Initial revision # # # Given a list of words for ispell, generate a reduced list # in which all possible affixes have been collapsed. The reduced # list will match the same list as the original. 
#
#	Usage:
#
#	munchlist [-l lang] [-c lang] [-T suff] [-s hashfile] [-D] [-w chars] [file] ...
#
#	Options:
#
#	-l lang	Specifies the language table to be used.  The default
#		is "$LIBDIR/english.aff".
#	-c lang	Specifies "conversion" language table.  If this option is
#		given, the input file(s) will be assumed to be described by
#		this table, rather than the table given in the -l option.
#		This may be used to convert between incompatible language
#		tables.  (When in doubt, use this option -- it doesn't
#		hurt, and it may save you from creating a dictionary that has
#		illegal words in it).  The default is no conversion.
#	-T suff	Specifies that the source word lists are in the format
#		of a "suff"-suffixed file, rather than in the
#		canonical form.  For example, "-T tex" specifies that
#		string characters in the word lists are in TeX format.
#		The string character conversions are taken from the language
#		table specified by the "-l" switch.
#	-s hashfile
#		Remove any words that are already covered by the
#		dictionary in 'hashfile'.  The words will be removed
#		only if all affixes are covered.  This option should not be
#		specified when the main dictionary is being munched.
#		'Hashfile' must have been created with the language
#		table given in the -l option, but this is not checked.
#	-D	Leave temporary files for debugging purposes.
#	-w chars
#		Passed on to ispell (specify chars that are part of a word).
#		Unfortunately, special characters must be quoted twice
#		rather than once when invoking this script.  Also, since
#		buildhash doesn't accept this option, the final ispell -l
#		step ignores it, making it somewhat less than useful.
#
#	The given input files are merged, then processed by 'ispell -c'
#	to generate possible affix lists;  these are then combined
#	and reduced.  The final result is written to standard output.
#
#	For portability to older systems, I have avoided getopt.
#
#	Geoff Kuenning
#	2/28/87
#

#
# Installation defaults and scratch-file prefix.  TMPDIR is honored if
# it is set in the environment.  SORTTMP is handed (unquoted) to every
# sort below; it is empty by default.
#
LIBDIR=/usr/lib
TDIR=${TMPDIR-/usr/tmp}
TMP=${TDIR}/munch$$
SORTTMP=

#
# Prefer an icombine in the current directory, so that the script can
# be run before installation is complete.
#
if [ -r ./icombine ]
then
    COMBINE=./icombine
else
    COMBINE=${LIBDIR}/icombine
fi

#
# Option defaults.
#
debug=no
dictopt=
langtabs=${LIBDIR}/english.aff
convtabs=
strip=no
icflags=
# The following value of "wchars" is necessary to prevent ispell from
# receiving a null argument if -w is not specified.  As long as "A" is
# a member of the existing character set, ispell will ignore the argument.
wchars=-wA

#
# Hand-rolled option loop (getopt is avoided for portability).  Options
# that take a value consume it with the extra "shift" inside their case
# arm; the "shift" after "esac" consumes the option itself.  On exit
# from the loop, "$@" holds the input file names, if any.
#
while [ $# != 0 ]
do
    case "$1" in
	-l)
	    # Language table: as given, or relative to LIBDIR.
	    if [ -r "$2" ]
	    then
		langtabs="$2"
	    elif [ -r "${LIBDIR}/$2" ]
	    then
		langtabs="${LIBDIR}/$2"
	    else
		echo "Can't open language table '$2'" 1>&2
		exit 1
	    fi
	    shift
	    ;;
	-c)
	    # Conversion table: same lookup rule as -l.  (The closing
	    # "]" of the elif test was missing, which made the LIBDIR
	    # lookup always fail.)
	    if [ -r "$2" ]
	    then
		convtabs="$2"
	    elif [ -r "${LIBDIR}/$2" ]
	    then
		convtabs="${LIBDIR}/$2"
	    else
		echo "Can't open conversion language table '$2'" 1>&2
		exit 1
	    fi
	    shift
	    ;;
	-s)
	    # Strip words already covered by the given hash file.
	    dictopt="-d $2"
	    strip=yes
	    shift
	    ;;
	-D)
	    debug=yes
	    ;;
	-T)
	    # Passed through to icombine.
	    icflags="-T $2"
	    shift
	    ;;
	-w)
	    wchars="-w$2"
	    shift
	    ;;
	--)
	    # Explicit end of options.
	    shift
	    break
	    ;;
	-)
	    # Standard input; left in "$@" for cat.
	    break
	    ;;
	-*)
	    echo 'Usage: munchlist [-l lang] [-c lang] [-T suff] [-s hashfile] [-D] [-w chars] [file] ...' \
	      1>&2
	    exit 2
	    ;;
	*)
	    break
	    ;;
    esac
    shift
done

# Remove all scratch files on hangup, interrupt, broken pipe, or terminate.
trap "/bin/rm -f ${TMP}*; exit 1" 1 2 13 15
#
# Names of temporary files.  This is just to make the code a little easier
# to read.
#
# Every scratch file is ${TMP} plus a one-letter suffix, so that a
# single "rm -f ${TMP}*" removes them all.
#
EXPANDEDINPUT=${TMP}a
STRIPPEDINPUT=${TMP}b
CRUNCHEDINPUT=${TMP}c
PRODUCTLIST=${TMP}d
EXPANDEDPAIRS=${TMP}e
LEGALFLAGLIST=${TMP}f
JOINEDPAIRS=${TMP}g
MINIMALAFFIXES=${TMP}h
CROSSROOTS=${TMP}i
CROSSEXPANDED=${TMP}j
CROSSPAIRS=${TMP}k
CROSSILLEGAL=${TMP}l
ILLEGALCOMBOS=${TMP}m
FAKEDICT=${TMP}n
# Ispell insists that hash files have a ".hash" suffix
FAKEHASH=${TMP}o.hash
AWKSCRIPT=${TMP}p

#
# In debug mode, give every scratch file a second, readable name in
# TDIR via a hard link.  The files are created empty first so there is
# something to link to, and stale links from an earlier run are removed
# because some ln implementations refuse to overwrite an existing file.
#
if [ "$debug" = yes ]; then
    touch $EXPANDEDINPUT $STRIPPEDINPUT $CRUNCHEDINPUT $PRODUCTLIST \
      $EXPANDEDPAIRS $LEGALFLAGLIST $JOINEDPAIRS $MINIMALAFFIXES \
      $CROSSROOTS $CROSSEXPANDED $CROSSPAIRS $CROSSILLEGAL $ILLEGALCOMBOS \
      $FAKEDICT $FAKEHASH $AWKSCRIPT

    rm -f ${TDIR}/EXPANDEDINPUT ${TDIR}/STRIPPEDINPUT ${TDIR}/CRUNCHEDINPUT \
      ${TDIR}/PRODUCTLIST ${TDIR}/EXPANDEDPAIRS ${TDIR}/LEGALFLAGLIST \
      ${TDIR}/JOINEDPAIRS ${TDIR}/MINIMALAFFIXES ${TDIR}/CROSSROOTS \
      ${TDIR}/CROSSEXPANDED ${TDIR}/CROSSPAIRS ${TDIR}/CROSSILLEGAL \
      ${TDIR}/ILLEGALCOMBOS ${TDIR}/FAKEDICT ${TDIR}/FAKEHASH.hash \
      ${TDIR}/AWKSCRIPT

    ln $EXPANDEDINPUT ${TDIR}/EXPANDEDINPUT
    ln $STRIPPEDINPUT ${TDIR}/STRIPPEDINPUT
    ln $CRUNCHEDINPUT ${TDIR}/CRUNCHEDINPUT
    ln $PRODUCTLIST ${TDIR}/PRODUCTLIST
    ln $EXPANDEDPAIRS ${TDIR}/EXPANDEDPAIRS
    ln $LEGALFLAGLIST ${TDIR}/LEGALFLAGLIST
    ln $JOINEDPAIRS ${TDIR}/JOINEDPAIRS
    ln $MINIMALAFFIXES ${TDIR}/MINIMALAFFIXES
    ln $CROSSROOTS ${TDIR}/CROSSROOTS
    ln $CROSSEXPANDED ${TDIR}/CROSSEXPANDED
    ln $CROSSPAIRS ${TDIR}/CROSSPAIRS
    ln $CROSSILLEGAL ${TDIR}/CROSSILLEGAL
    ln $ILLEGALCOMBOS ${TDIR}/ILLEGALCOMBOS
    ln $FAKEDICT ${TDIR}/FAKEDICT
    ln $FAKEHASH ${TDIR}/FAKEHASH.hash
    ln $AWKSCRIPT ${TDIR}/AWKSCRIPT
fi
#
# Create a dummy dictionary to hold a compiled copy of the language
# table.  Initially, it holds the conversion table, if it exists.
#
# With no -c option, the "conversion" table is simply the language table.
#
case "X$convtabs" in
    X)
	convtabs="$langtabs"
	;;
esac
echo 'QQQQQQQQ' > $FAKEDICT
buildhash -s $FAKEDICT $convtabs $FAKEHASH \
  || { echo "Couldn't create fake hash file" 1>&2; /bin/rm -f ${TMP}*; exit 1; }
#
# Collect all the input and expand all the affix options (ispell -e),
# and preserve (sorted) for later joining in EXPANDEDINPUT.  The icombine
# step is to make sure that unneeded capitalizations (e.g., Farmer and farmer)
# are weeded out.  The first sort must be folded for icombine;  the second
# must be unfolded for join.
#
# The tr turns every blank into a newline so that each word lands on a
# line of its own.  The newline is written as the octal escape '\012'
# rather than as a literal line break inside quotes, so that it cannot
# be lost if the file's whitespace is reflowed.
#
# The two branches differ only in where ispell reads from: standard
# input when no files were named, otherwise the concatenated files.
#
# NOTE(review): the "+pos -pos" sort keys used throughout this script
# are the historical form; confirm that the target system's sort still
# accepts them.
#
if [ $# -eq 0 ]
then
    ispell "$wchars" -e -d $FAKEHASH -p /dev/null | tr " " '\012'
else
    cat "$@" | ispell "$wchars" -e -d $FAKEHASH -p /dev/null | tr " " '\012'
fi \
  | sort $SORTTMP -u +0f -1 +0 \
  | $COMBINE $icflags $langtabs \
  | sort $SORTTMP -u > $EXPANDEDINPUT
#
# If a conversion table existed, recreate the fake hash file with the
# "real" language table.
#
case "$convtabs" in
    $langtabs)
	;;
    *)
	buildhash -s $FAKEDICT $langtabs $FAKEHASH \
	  || { echo "Couldn't create fake hash file" 1>&2; \
	       /bin/rm -f ${TMP}*; exit 1; }
	;;
esac
/bin/rm -f ${FAKEDICT}*
#
# If the -s (strip) option was specified, remove all
# expanded words that are covered by the dictionary.  This produces
# the final list of expanded words that this dictionary must cover.
# Leave the list in STRIPPEDINPUT.
#
if [ "X$strip" = "Xno" ]
then
    # Nothing to strip:  STRIPPEDINPUT is just another name for
    # EXPANDEDINPUT.  Replacing the file breaks the debug link made
    # earlier, so re-create that link too.
    rm -f $STRIPPEDINPUT
    ln $EXPANDEDINPUT $STRIPPEDINPUT
    if [ "$debug" = yes ]
    then
	rm -f ${TDIR}/STRIPPEDINPUT
	ln $STRIPPEDINPUT ${TDIR}/STRIPPEDINPUT
    fi
else
    # $dictopt is deliberately unquoted:  it holds "-d hashfile" and
    # must split into two arguments.
    ispell "$wchars" -l $dictopt -p /dev/null < $EXPANDEDINPUT \
      > $STRIPPEDINPUT
fi
#
# Figure out what the flag-marking character is.  If ispell reports it
# with a leading backslash, keep only the character after the backslash.
#
flagmarker=`ispell -D -d $FAKEHASH \
  | sed -n '/^flagmarker/s/flagmarker //p'`
case "$flagmarker" in
    \\*)
	flagmarker=`expr "$flagmarker" : '.\(.\)'`
	;;
esac
#
# Munch the input to generate roots and affixes (ispell -c).  We are
# only interested in words that have at least one affix (egrep $flagmarker);
# the next step will pick up the rest.
# Some of the roots produced by the munch step (ispell -c) are illegal.  We
# use join to restrict the output to those root words that are found
# in the original dictionary.
#
# The tr turns blanks into newlines (spelled '\012' so that the newline
# cannot be lost if the file's whitespace is reflowed), giving one
# root-plus-flags entry per line for egrep, sort, and join.
#
ispell "$wchars" -c -W0 -d $FAKEHASH -p /dev/null < $STRIPPEDINPUT \
  | tr " " '\012' \
  | egrep "$flagmarker" | sort $SORTTMP -u "-t$flagmarker" +0 -1 +1 \
  | join "-t$flagmarker" - $EXPANDEDINPUT > $CRUNCHEDINPUT
#
# We now have a list of legal roots, and of affixes that apply to the
# root words.  However, it is possible for some affix flags to generate more
# than one output word.  For example, with the flag table entry
#
#	flag R:	. > ER
#		. > ERS
#
# the input "BOTHER" will generate an entry "BOTH/R" in CRUNCHEDINPUT.  But
# this will accept "BOTHER" and "BOTHERS" in the dictionary, which is
# wrong (in this case, though it's good English).
#
# To cure this problem, we first have to know which flags generate which
# expansions.  We use ispell -ee to expand the flags (the second e causes
# the root and flag to be included in the output), and run the result of
# that through sed to get pairs suitable for joining.  In the example
# above, we would get
#
#	BOTH/R BOTHER
#	BOTH/R BOTHERS
#
# We save this in EXPANDEDPAIRS for the next step.
#
# First, a small sed script that we'll use a lot.  It takes "a b c d" and
# produces "a b", "a c", and "a d".  Lines without blanks are ignored.
#
# How it works:  the substitution rewrites "a b rest" as "a b", a
# newline, and "a rest";  P prints the first line, and D deletes it and
# restarts the script on what is left.  The backslash-newline inside the
# replacement is what inserts the line break, so the continuation line
# must start in column 1 -- any indentation there would become part of
# the output.  The script must be run with "sed -n".
#
PAIRUP='/ /{
    s;^\([^ ]*\) \([^ ]*\)\(.*\);\1 \2\
\1\3;
    P
    D
}'
ispell "$wchars" -ee -d $FAKEHASH -p /dev/null < $CRUNCHEDINPUT \
  | sed -n "$PAIRUP" \
  | sort $SORTTMP +1 > $EXPANDEDPAIRS
#
# Now we want to extract the lines in EXPANDEDPAIRS in which the second field
# is *not* listed in the original dictionary EXPANDEDINPUT;  these illegal
# lines contain the flags we cannot include without accepting illegal words.
# It is somewhat easier to extract those which actually are listed (with
# join), and then use comm to strip these from EXPANDEDPAIRS to get the
# illegal expansions, together with the flags that generate them (we must
# re-sort EXPANDEDPAIRS before running comm).  Sed
# gets rid of the expansion and uniq gets rid of duplicates.  Comm then
# selects the remainder of the list from CRUNCHEDINPUT and puts it in
# LEGALFLAGLIST.  The final step is to use a sort and icombine to put
# the list into a one-entry-per-root format.
#
# BTW, I thought of using cut for the sed step (on systems that have it),
# but it turns out that sed is faster!
#
# Each scratch file is removed by the pipeline stage that reads it, as
# soon as that stage has finished with it.
#
join -j1 2 -o 1.1 1.2 $EXPANDEDPAIRS $EXPANDEDINPUT \
  | sort $SORTTMP -u > $JOINEDPAIRS
sort $SORTTMP -o $EXPANDEDPAIRS $EXPANDEDPAIRS
comm -13 $JOINEDPAIRS $EXPANDEDPAIRS \
  | { sed -e 's; .*$;;'; /bin/rm -f $JOINEDPAIRS $EXPANDEDPAIRS; } \
  | uniq \
  | { comm -13 - $CRUNCHEDINPUT; /bin/rm -f $CRUNCHEDINPUT; } \
  | sort $SORTTMP -u "-t$flagmarker" +0f -1 +0 \
  | $COMBINE $langtabs > $LEGALFLAGLIST
#
# LEGALFLAGLIST now contains root/flag combinations that, when expanded,
# produce only words from EXPANDEDPAIRS.  However, there is still a
# problem if the language tables have any cross-product flags.  A legal
# root may appear in LEGALFLAGLIST with two flags that participate
# in cross-products.  When such a dictionary entry is expanded,
# the cross-products will generate some extra words that may not
# be in EXPANDEDPAIRS.  We need to remove these from LEGALFLAGLIST.
#
# The first step is to collect the names of the flags that participate
# in cross-products.  Ispell will dump the language tables for us, and
# sed is a pretty handy way to strip out extra information.  We use
# uniq -c and a numerical sort to put the flags in approximate order of how
# "productive" they are (in terms of how likely they are to generate a lot
# of output words).  The least-productive flags are given last and will
# be removed first.
#
# Build PRODUCTLIST:  one line per cross-product flag, in the form
# "count p X" for a prefix flag X or "count s X" for a suffix flag X.
# The sed script drops everything from the colon on, discards the
# flagmarker line, and keeps only the starred "flag *X" lines, tagging
# them by the section (prefixes or suffixes) they appear in.
#
# NOTE(review): the amount of leading white space on ispell's "flag"
# lines could not be confirmed, so the patterns accept any number of
# leading blanks.
#
ispell -D -d $FAKEHASH \
  | sed -n '1,$s/:.*$//
    /^flagmarker/d
    /^prefixes/,/^suffixes/s/^ *flag \*/p /p
    /^suffixes/,$s/^ *flag \*/s /p' \
  | sort $SORTTMP \
  | uniq -c \
  | sort $SORTTMP +0rn -1 +2 > $PRODUCTLIST
if [ `egrep ' p ' $PRODUCTLIST | wc -l` -gt 0 \
  -a  `egrep ' s ' $PRODUCTLIST | wc -l` -gt 0 ]
then
    #
    # The language tables allow cross products.  See if LEGALFLAGLIST has
    # any roots with multiple cross-product flags.  Put them in CROSSROOTS.
    #
    # preflags and sufflags are each a single string of flag characters,
    # in PRODUCTLIST order.  The tr must delete the newlines between
    # sed's output lines:  the strings are used inside egrep bracket
    # expressions and as sed replacement text, neither of which can
    # contain a line break.
    #
    preflags=`sed -n 's/^[ 0-9]*p //p' $PRODUCTLIST | tr -d '\012'`
    sufflags=`sed -n 's/^[ 0-9]*s //p' $PRODUCTLIST | tr -d '\012'`
    egrep "$flagmarker.*[$preflags].*[$sufflags]|$flagmarker.*[$sufflags].*[$preflags]" \
      $LEGALFLAGLIST \
      > $CROSSROOTS
    #
    # We will need an awk script;  it's so big that it core-dumps my shell
    # under certain conditions.  The rationale behind the script is commented
    # where the script is used.  Note that you may want to change this
    # script for languages other than English.
    #
    # The script text below is a template:  sed replaces the four
    # upper-case placeholder words with the real values while copying
    # it to AWKSCRIPT.  The delimiter for the last substitution is
    # chosen so that it cannot collide with the flag marker itself.
    #
    case "$flagmarker" in
	/)
	    sedchar=:
	    ;;
	*)
	    sedchar=/
	    ;;
    esac
    sed -e "s/PREFLAGS/$preflags/" -e "s/SUFFLAGS/$sufflags/" \
      -e "s;ILLEGALCOMBOS;$ILLEGALCOMBOS;" \
      -e "s${sedchar}FLAGMARKER${sedchar}$flagmarker${sedchar}" \
      > $AWKSCRIPT << 'ENDOFAWKSCRIPT'
BEGIN {
    preflags = "PREFLAGS"
    sufflags = "SUFFLAGS"
    illegalcombos = "ILLEGALCOMBOS"
    flagmarker = "FLAGMARKER"
    pflaglen = length (preflags)
    for (i = 1;  i <= pflaglen;  i++)
	pflags[i] = substr (preflags, i, 1);
    sflaglen = length (sufflags)
    for (i = 1;  i <= sflaglen;  i++)
	sflags[i] = substr (sufflags, i, 1);
    }
    {
    len = length ($2)
    pnew2 = ""
    snew2 = ""
    pbad = ""
    sbad = ""
    sufs = 0
    pres = 0
    for (i = 1;  i <= len;  i++)
	{
	curflag = substr ($2, i, 1)
	for (j = 1;  j <= pflaglen;  j++)
	    {
	    if (pflags[j] == curflag)
		{
		pres++
		pnew2 = substr ($2, 1, i - 1) substr ($2, i + 1)
		pbad = curflag
		}
	    }
	for (j = 1;  j <= sflaglen;  j++)
	    {
	    if (sflags[j] == curflag)
		{
		sufs++
		snew2 = substr ($2, 1, i - 1) substr ($2, i + 1)
		sbad = curflag
		}
	    }
	}
    if (pres == 1)
	{
	print $1 flagmarker pnew2
	print $1 flagmarker pbad >> illegalcombos
	}
    else if (sufs == 1)
	{
	print $1 flagmarker snew2
	print $1 flagmarker sbad >> illegalcombos
	}
    else if (pres > 0)
	{
	print $1 flagmarker pnew2
	print $1 flagmarker pbad >> illegalcombos
	}
    else
	{
	print $1 flagmarker snew2
	print $1 flagmarker sbad >> illegalcombos
	}
    }
ENDOFAWKSCRIPT
    : > $ILLEGALCOMBOS
    dbnum=0
    while [ -s $CROSSROOTS ]
    do
	#
	# CROSSROOTS contains the roots whose cross-product expansions
	# might be illegal.  We now need to locate the actual illegal ones.
	# We do this in much the same way we created LEGALFLAGLIST from
	# CRUNCHEDINPUT.  First we make CROSSEXPANDED, which is analogous
	# to EXPANDEDPAIRS.
	#
	ispell "$wchars" -ee -d $FAKEHASH -p /dev/null < $CROSSROOTS \
	  | sed -n "$PAIRUP" \
	  | sort $SORTTMP +1 > $CROSSEXPANDED
	#
	# Now we join CROSSEXPANDED against EXPANDEDINPUT to produce
	# CROSSPAIRS, and then comm that against CROSSEXPANDED to
	# get CROSSILLEGAL, the list of illegal cross-product flag
	# combinations.
	#
	join -j1 2 -o 1.1 1.2 $CROSSEXPANDED $EXPANDEDINPUT \
	  | sort $SORTTMP -u > $CROSSPAIRS
	sort $SORTTMP -u -o $CROSSEXPANDED $CROSSEXPANDED
	comm -13 $CROSSPAIRS $CROSSEXPANDED \
	  | sed -e 's; .*$;;' \
	  | uniq > $CROSSILLEGAL
	if [ "$debug" = yes ]
	then
	    # Keep a numbered snapshot of this pass in TDIR.  Every link
	    # target is removed first, because some ln implementations
	    # refuse to overwrite an existing file.  (CROSSPAIRS.$dbnum
	    # was missing from the rm list, which named CROSSEXP.$dbnum
	    # twice instead.)
	    mv $CROSSROOTS $TDIR/CROSSROOTS.$dbnum
	    rm -f $TDIR/CROSSEXP.$dbnum $TDIR/CROSSPAIRS.$dbnum \
	      $TDIR/CROSSILLEGAL.$dbnum
	    ln $CROSSPAIRS $TDIR/CROSSPAIRS.$dbnum
	    ln $CROSSILLEGAL $TDIR/CROSSILLEGAL.$dbnum
	fi
	#
	# Now it is time to try to clear up the illegalities.  For
	# each word in the illegal list, remove one of the cross-product
	# flags.  The flag chosen is selected in an attempt to cure the
	# problem quickly, as follows:  (1) if there is only one suffix
	# flag or only one prefix flag, we remove that.  (2) If there is
	# a prefix flag, we remove the "least desirable" (according to
	# the order of preflags). (This may be pro-English prejudice,
	# and you might want to change this if your language is prefix-heavy).
	# (3) Otherwise we remove the least-desirable suffix flag
	#
	# The output of the awk script becomes the new CROSSROOTS.  In
	# addition, we add the rejected flags to ILLEGALCOMBOS (this is done
	# inside the awk script) so they can be removed from LEGALFLAGLIST
	# later.
	#
	awk "-F$flagmarker" -f $AWKSCRIPT $CROSSILLEGAL > $CROSSROOTS
	if [ "$debug" = yes ]
	then
	    # Remove the working names so the next pass writes fresh
	    # files rather than writing through the snapshot links.
	    /bin/rm -f $CROSSEXPANDED $CROSSPAIRS $CROSSILLEGAL
	    dbnum=`expr $dbnum + 1`
	fi
    done
    /bin/rm -f $CROSSEXPANDED $CROSSPAIRS $CROSSILLEGAL $AWKSCRIPT
    #
    # Now we have, in ILLEGALCOMBOS, a list of root/flag combinations
    # that must be removed from LEGALFLAGLIST to get the final list
    # of truly legal flags.  ILLEGALCOMBOS has one flag per line, so
    # by turning LEGALFLAGLIST into this form (sed), it's an
    # easy task for comm.  We have to recombine flags again after the
    # extraction, to get all flags for a given root on the same line so that
    # cross-products will come out right.
    #
    # The continuation line inside the sed replacement must start in
    # column 1;  any indentation there would end up in the output.
    #
    # NOTE(review): this sed script hard-codes "/" as the flag marker
    # instead of using $flagmarker -- confirm whether that is intended.
    #
    if [ -s $ILLEGALCOMBOS ]
    then
	sort $SORTTMP -u -o $ILLEGALCOMBOS $ILLEGALCOMBOS
	sort $SORTTMP $LEGALFLAGLIST \
	  | sed '/\/../{
	      s;^\(.*\)/\(.\)\(.*\);\1/\2\
\1/\3;
	      P
	      D
	    }' \
	  | comm -23 - $ILLEGALCOMBOS \
	  | sort $SORTTMP -u "-t$flagmarker" +0f -1 +0 \
	  | $COMBINE $langtabs > $CROSSROOTS
	mv $CROSSROOTS $LEGALFLAGLIST
	if [ "$debug" = yes ]
	then
	    rm -f ${TDIR}/LEGALFLAGLIST1
	    ln $LEGALFLAGLIST ${TDIR}/LEGALFLAGLIST1
	fi
    fi
fi
/bin/rm -f $PRODUCTLIST $CROSSROOTS $ILLEGALCOMBOS $EXPANDEDINPUT
#
# We now have (in LEGALFLAGLIST) a list of roots and flags which will
# accept words taken from EXPANDEDINPUT and no others (though some of
# EXPANDEDINPUT is not covered by this list).  However, many of the
# expanded words can be generated in more than one way.  For example,
# "bather" can be generated from "bath/R" and "bathe/R".  This wastes
# unnecessary space in the raw dictionary and, in some cases, in the
# hash file as well.  The solution is to list the various ways of
# getting a given word and choose exactly one.
# All other things being
# equal, when a word can be generated in more than one way we want to
# choose the way with the shortest root and the most
# flags.  The awk script takes care of this by providing us with a field
# to sort on.
#
# The ispell/awk combination is similar to the ispell/sed pipe
# used to generate EXPANDEDPAIRS, except that the awk adds an extra
# field giving the sort order.  The first sort gets things in order
# so the first root listed is the one we want, and the second sort (-um)
# then selects that first root.  Sed strips the expansion from the root,
# and a final sort -u generates MINIMALAFFIXES, the final list of affixes
# that (more or less) minimally covers what it can from EXPANDEDINPUT.
#
# Incidentally, the sed in the pipe below is necessary only because
# some versions of awk can't handle large numbers of fields (e.g.,
# over 100).  Otherwise we could just loop over the fields.
#
# The awk program is single-quoted;  the quotes are closed and reopened
# around "$flagmarker" so that the shell splices the marker character
# into the program text.  Each output line is
# "root/flags expansion root-length flag-count".
#
ispell "$wchars" -ee -d $FAKEHASH -p /dev/null < $LEGALFLAGLIST \
  | sed -n "$PAIRUP" \
  | awk '
    {
    markerpos = index ($1, "'"$flagmarker"'")
    flagcount = length (substr ($1, markerpos)) - 1
    rootlen = markerpos - 1
    print $1, $2, rootlen, flagcount
    }' \
  | sort $SORTTMP +1 -2 +2n -3 +3rn -4 +0 -1 \
  | sort $SORTTMP -um +1 -2 \
  | sed -e 's; .*$;;' \
  | sort $SORTTMP -u "-t$flagmarker" +0f -1 +0 > $MINIMALAFFIXES
/bin/rm -f $LEGALFLAGLIST
#
# Now we're almost done.  MINIMALAFFIXES covers some (with luck, most)
# of the words in STRIPPEDINPUT.  Now we must create a list of the remaining
# words (those omitted by MINIMALAFFIXES) and add it to MINIMALAFFIXES.
# The best way to do this is to actually build a partial dictionary from
# MINIMALAFFIXES in FAKEHASH, and then use ispell -l to list the words that
# are not covered by this dictionary.  This must then be combined with the
# reduced version of MINIMALAFFIXES and sorted to produce the final result.
#
# Final step:  write the munched list to standard output, then remove
# every scratch file.
#
if [ -s $MINIMALAFFIXES ]; then
    # Build a hash file from the roots found so far.  On failure,
    # report it, clean up, and exit with status 1.
    buildhash -s $MINIMALAFFIXES $langtabs $FAKEHASH > /dev/null \
      || { echo "Couldn't create intermediate hash file" 1>&2; \
	   /bin/rm -f ${TMP}*; exit 1; }
    if [ "$debug" = yes ]; then
	# Keep the .cnt and .stat files that sit next to MINIMALAFFIXES.
	rm -f ${TDIR}/MINAFFIXES.cnt ${TDIR}/MINAFFIXES.stat
	ln $MINIMALAFFIXES.cnt ${TDIR}/MINAFFIXES.cnt
	ln $MINIMALAFFIXES.stat ${TDIR}/MINAFFIXES.stat
    fi
    # The words that the new hash file does not cover (ispell -l),
    # followed by the combined root/flag entries, sorted together.
    { ispell "$wchars" -l -d $FAKEHASH -p /dev/null < $STRIPPEDINPUT; \
      $COMBINE $langtabs < $MINIMALAFFIXES; } \
      | sort $SORTTMP "-t$flagmarker" -u +0f -1 +0
else
    # MINIMALAFFIXES is empty;  just produce a sorted version of STRIPPEDINPUT
    sort $SORTTMP "-t$flagmarker" -u +0f -1 +0 $STRIPPEDINPUT
fi
/bin/rm -f ${TMP}*