#!/usr/bin/mawk -We
# *********************************************************************
#  Written by and copyright Carlo Strozzi <carlos@linux.it>.
#
#  soundex: classifies input values into categories, using Knuth's
#  soundex codes, useful for building a table secondary index.
#  Copyright (C) 2001-2003 Carlo Strozzi <carlos@linux.it>
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
# 
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
# 
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# *********************************************************************
#  $Id: soundex,v 1.2 2003/03/17 10:10:46 carlo Exp $

# Initialization: set the field separators, read the local NoSQL settings
# from the environment and parse the command line.
BEGIN {
  NULL = ""
  FS = OFS = "\t"
  id = "soundex"			# Default name of the Soundex column.

  # Local settings, each falling back to a built-in default when the
  # environment variable is unset or empty.
  nosql_install = ENVIRON["NOSQL_INSTALL"]
  if (nosql_install == NULL) nosql_install = "/usr/local/nosql"

  stdout = ENVIRON["NOSQL_STDOUT"]
  if (stdout == NULL) stdout = "/dev/stdout"

  stderr = ENVIRON["NOSQL_STDERR"]
  if (stderr == NULL) stderr = "/dev/stderr"

  # Walk the argument list up to the first empty (or missing) entry.
  # '-s', '-i' and '-o' take the following argument as their value; any
  # argument that is not an option names a column whose value becomes
  # part of the Soundex key.
  for (i = 1; ARGV[i] != NULL; i++) {
    arg = ARGV[i]

    if (arg == "-h" || arg == "--help") {
       # Show the help text, minus its comment lines, and quit.
       system("grep -v '^#' " nosql_install "/help/soundex.txt")
       rc = 1
       exit(rc)
    }

    if (arg == "-l" || arg == "--last") pick_last = 1
    else if (arg == "-N" || arg == "--no-header") no_hdr = 1
    else if (arg == "-s" || arg == "--scolumn") id = ARGV[++i]
    else if (arg == "-i" || arg == "--input") i_file = ARGV[++i]
    else if (arg == "-o" || arg == "--output") o_file = ARGV[++i]
    else target_cols[arg] = arg
  }

  # Hide the command line from awk, leaving only the input file (if one
  # was given with '-i') to be read in place of standard input.
  ARGC = 1
  if (i_file != NULL) { ARGV[1] = i_file; ARGC = 2 }

  if (o_file == NULL) o_file = stdout
}

#
# Main loop
#

# Header line: record which columns may feed the Soundex key and emit the
# header with the name of the Soundex column prepended.
NR == 1 {

  gsub(/\001/, "")			# Remove SOH markers

  # Load the column position arrays: P[name] is the position selected for
  # a column name, N[position] is the name found at a selected position.
  # The table body looks columns up through N[], so a duplicated column
  # name (it may happen after a join) must be left in N[] exactly once:
  # at its first occurrence by default, at its last one with '-l'.
  for (p = 1; p <= NF; p++) {
    if (P[$p] == NULL) { P[$p] = p; N[p] = $p }
    else if (pick_last) {
      delete N[P[$p]]			# Forget the earlier occurrence.
      P[$p] = p; N[p] = $p
    }
  }

  $0 = id OFS $0			# Insert the Soundex column.
  if (!no_hdr) {
     # Each column name is prefixed with an SOH marker. The marker of the
     # first column is written together with the rest of the line, so
     # that the whole header ends up in the selected output file.
     gsub(/\t/,"\t\001")
     print "\001" $0 > o_file
  }
  next
}

# Table body: prefix every record with the Soundex code of its key. The
# key is the concatenation, in column order, of the values found in the
# columns named on the command line.
{
  k = NULL
  for (i = 1; i <= NF; i++)
      if (target_cols[N[i]] != NULL) k = k $i

  # FS and OFS are both a single tab, so the fields joined by OFS are
  # the input record itself: it is passed through as it was read.
  print soundex(k) OFS $0 > o_file
}

#
# Function section.
#

# soundex(string): returns the four-character Soundex code of 'string',
# or the empty string if 'string' contains no letters.
#
# The value is upper-cased and everything but the letters A-Z is removed.
# The code is made of the first letter followed by the digits of the
# letters after it:
#
#   1: B F P V    2: C G J K Q S X Z    3: D T    4: L    5: M N    6: R
#
# Vowels, H, W and Y have no digit. A letter whose digit equals that of
# the letter just before it (the first letter included) is skipped, also
# when only H or W stand between the two. The result is padded with
# zeros, or cut, to four characters.
#
# The parameters after 'string' are local variables.
function soundex(string,	field,a,i,j,code,prev) {

   field = toupper(string)
   gsub(/[^A-Z]/, "", field)			# Remove non-letters.
   if (field == "") return ""

   string = substr(field, 1, 1)
   prev = soundex_digit(string)
   i = length(field)

   # Encode the letters after the first one, the last one included; stop
   # as soon as the four characters of the code are there.
   for (j = 2; j <= i && length(string) < 4; j++) {
       a = substr(field, j, 1)
       if (a == "H" || a == "W") continue	# Do not separate equal codes.
       code = soundex_digit(a)
       if (code != "" && code != prev) string = string code
       prev = code				# A vowel resets it to "".
   }
   return substr(string "000", 1, 4)
}

# soundex_digit(letter): returns the Soundex digit of an upper-case
# letter, or the empty string if the letter has none.
function soundex_digit(letter) {
   if (letter ~ /[BFPV]/) return "1"
   if (letter ~ /[CGJKQSXZ]/) return "2"
   if (letter ~ /[DT]/) return "3"
   if (letter == "L") return "4"
   if (letter ~ /[MN]/) return "5"
   if (letter == "R") return "6"
   return ""
}

#
# End of program.
#
