#
# Generates the language model for a 5k general business vocabulary
#
# uses:
#       wsj5k.txt      - transcript of wsj
#       5k_words.vocab - hand prepared vocabulary list
#       prep.awk       - gawk script that normalizes the tags in the transcript
#
# generates:
#       wsj5k.lm - arpa format of the language model
#       wsj5k.DMP - CMU binary format of the language model
#
#       
# requires:
#       CMU language model toolkit:
#               http://www.speech.cs.cmu.edu/SLM_info.html
#       lm3g2dmp - utility to generate DMP format models:
#           http://cmusphinx.sourceforge.net/webpage/html/download.php#utilities#
# unix commands:
#       gawk mkdir mv rmdir rm tr
#
# All commands should be in your path
#

# Text normalisation:
#   wsj5k.txt    -> wsj5k.lc.tmp    (lower-cased transcript)
#   wsj5k.lc.tmp -> wsj5k.prep.tmp  (tags normalized by prep.awk)
# Later steps read wsj5k.prep.tmp, so stop here on failure instead of
# carrying on with an empty or partial file.

# Convert transcript to all lower case.
# The bracketed ranges are kept exactly as written: a POSIX tr simply maps
# '[' and ']' to themselves, so they do not change the result.
tr "[A-Z]" "[a-z]" < wsj5k.txt > wsj5k.lc.tmp || {
  echo "error: could not lower-case wsj5k.txt into wsj5k.lc.tmp" >&2
  exit 1
}

# normalize the tags
gawk -f prep.awk < wsj5k.lc.tmp > wsj5k.prep.tmp || {
  echo "error: gawk -f prep.awk failed on wsj5k.lc.tmp" >&2
  exit 1
}

#
# Word-frequency pass over the prepared transcript.
#
#   wsj5k.prep.tmp  -> text2wfreq            -> wsj5k.wfreq.tmp
#   wsj5k.wfreq.tmp -> wfreq2vocab -top 5000 -> wsj5k.vocab.tmp
#
# The generated vocabulary is not read by the later steps shown in this
# script, which use the hand-prepared 5k_words.vocab. The original note says
# it "should be a subset wsj5k.vocab" -- presumably the hand-prepared list is
# meant; confirm.
#

prepared_text=wsj5k.prep.tmp
word_freqs=wsj5k.wfreq.tmp
generated_vocab=wsj5k.vocab.tmp

# Generate the word frequencies
text2wfreq <"$prepared_text" >"$word_freqs"

# Generate the vocabulary from the word frequencies
wfreq2vocab -top 5000 <"$word_freqs" >"$generated_vocab"

#
# Generate the idngram
#
# The hand-prepared word list 5k_words.vocab is used as the vocabulary.
# An earlier variant, kept for reference only ($file is not defined in this
# script), took a generated vocabulary instead:
#text2idngram -vocab $file.tmp.vocab < $file  > $file.tmp.idngram
#
# Stop on failure: idngram2lm below would otherwise be run on an empty or
# partial wsj5k.idngram.tmp.
text2idngram -vocab 5k_words.vocab < wsj5k.prep.tmp > wsj5k.idngram.tmp || {
  echo "error: text2idngram failed; wsj5k.idngram.tmp is not usable" >&2
  exit 1
}


#
# generates the language model (written in arpa format to wsj5k.lm)
#
# Alternative invocations, kept for reference only ($file is not defined in
# this script):
#idngram2lm -vocab_type 0 -idngram $file.tmp.idngram -vocab $file.tmp.vocab -arpa $file.arpa
#idngram2lm  -idngram $file.tmp.idngram -vocab $file.tmp.vocab -arpa $file.arpa
#idngram2lm  -witten_bell -idngram $file.tmp.idngram -vocab $file.tmp.vocab -arpa $file.arpa
#idngram2lm  -two_byte_alphas -idngram $file.tmp.idngram -vocab $file.tmp.vocab -arpa $file.arpa
#
# Stop on failure so that an incomplete wsj5k.lm is never passed on to the
# DMP conversion.
idngram2lm -vocab_type 0 -idngram wsj5k.idngram.tmp -vocab 5k_words.vocab -arpa wsj5k.lm || {
  echo "error: idngram2lm failed; wsj5k.lm is not usable" >&2
  exit 1
}


#
# generate the DMP version of the language model
#
# lm3g2dmp is given the scratch directory dmp as its output location; the
# result is expected at dmp/wsj5k.lm.DMP and is moved to wsj5k.DMP.
#

dmp_status=0

# -p: do not fail when dmp is left over from an earlier, interrupted run.
if mkdir -p dmp; then
  lm3g2dmp wsj5k.lm dmp

  # The move is the success check: it fails if the converter did not leave
  # the expected file behind.
  if ! mv dmp/wsj5k.lm.DMP wsj5k.DMP; then
    echo "error: could not move dmp/wsj5k.lm.DMP to wsj5k.DMP (did lm3g2dmp fail?)" >&2
    dmp_status=1
  fi

  # Only removes dmp when it is empty, so unrelated content is never deleted.
  rmdir dmp
else
  echo "error: cannot create scratch directory dmp" >&2
  dmp_status=1
fi

#
# cleanup
#
# Remove only the intermediate files this script creates, rather than every
# *.tmp file that happens to be in the working directory. -f keeps this quiet
# when an earlier step did not get far enough to create one of them.
#

rm -f -- wsj5k.lc.tmp wsj5k.prep.tmp wsj5k.wfreq.tmp wsj5k.vocab.tmp wsj5k.idngram.tmp

# Report a failed DMP conversion to the caller through the exit status.
[ "$dmp_status" -eq 0 ] || exit "$dmp_status"

