#!/usr/bin/mawk -We
# *********************************************************************
# jointable: joins two NoSQL tables on a common field.
#
# Copyright (c) 1998,1999,2000,2001,2002,2003 Carlo Strozzi
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# *********************************************************************
#  $Id: jointable,v 1.4 2003/05/23 20:31:53 carlo Exp $

# Initialisation: read the environment, parse the command line and load
# the column names of the table that is _not_ on stdin.
#
# Globals set here and used by the later rules:
#   stderr     file name that diagnostics are printed to
#   jcol       join column name given with -j/--column (empty if absent)
#   debug      non-zero when -x/--debug was given
#   join_args  options passed straight through to join(1)
#   tbl[1..2]  the two table arguments ("-" stands for stdin)
#   P1, P2     column name -> position, for table_1 / table_2
#   N1, N2     position -> column name, for table_1 / table_2
#   rc         exit status, picked up again by the END rule
BEGIN {
  NULL = ""; FS = OFS = "\t"

  # Get local settings.
  nosql_install = ENVIRON["NOSQL_INSTALL"]
  stdout = ENVIRON["NOSQL_STDOUT"]
  stderr = ENVIRON["NOSQL_STDERR"]

  # Set default values if necessary.
  if (nosql_install == NULL) nosql_install = "/usr/local/nosql"
  if (stdout == NULL) stdout = "/dev/stdout"
  if (stderr == NULL) stderr = "/dev/stderr"

  # Separate join(1) options from table names.  Walk the arguments up to
  # ARGC so that an empty argument does not cut the scan short.

  for (i = 1; i < ARGC; i++) {
    arg = ARGV[i]

    # Turn long options into their short form.
    if (arg == "--all") arg = "-a"
    else if (arg == "--column") arg = "-j"
    else if (arg == "--debug") arg = "-x"
    else if (arg == "--help") arg = "-h"
    else if (arg == "--numeric") arg = "-n"
    else if (arg == "--ignore-case") arg = "-i"

    # Now process each option in turn.
    if (arg == "-j") jcol = ARGV[++i]       # Option value is the next word.
    else if (arg == "-x") debug = 1
    else if (arg == "-h") {
       system("grep -v '^#' " nosql_install "/help/jointable.txt")
       rc = 0
       exit(rc)
    }
    else if (arg == "-n" || arg == "-i") join_args = join_args " " arg
    else if (arg !~ /^-/ || arg == "-") tbl[++j] = arg

    # Any other word starting with "-" is silently ignored.
    # NOTE(review): this includes -a/--all, which is recognised but has no
    # effect.  An earlier, unreachable mapping of "--all" to "-a 1" hints
    # that it was meant to reach join(1) as "-a 1" -- confirm before
    # wiring it up.
  }

  ARGC = 1                                  # Read table data from stdin only.

  if (j != 2) {
     print "usage: jointable [options] table_1 table_2" > stderr
     rc = 1
     exit(rc)
  }

  # Only one table may be on stdin.
  if (tbl[1] == "-" && tbl[2] == "-") {
     print "jointable: only one table can be on STDIN" > stderr
     rc = 1
     exit(rc)
  }

  # Get column names of table that is _not_ on stdin.
  if (tbl[1] != "-") {
    # Fail early if the header cannot be read, rather than handing
    # join(1) a malformed field list later on.
    if ((getline < tbl[1]) <= 0) {
       print "jointable: cannot read header of table " tbl[1] > stderr
       rc = 1
       exit(rc)
    }
    gsub(/\001/,"")                         # Remove SOH markers.

    # Get column names and positions.
    # Make sure we pick the first occurrence of duplicated column
    # names (it may happen after a join).

    while (++p <= NF) {
      if (P1[$p] == NULL) P1[$p] = p
      N1[p] = $p
    }
  }
  else {
    if ((getline < tbl[2]) <= 0) {
       print "jointable: cannot read header of table " tbl[2] > stderr
       rc = 1
       exit(rc)
    }
    gsub(/\001/,"")                         # Remove SOH markers.

    # Get column names and positions.
    # Make sure we pick the first occurrence of duplicated column
    # names (it may happen after a join).

    while (++p <= NF) {
      if (P2[$p] == NULL) P2[$p] = p
      N2[p] = $p
    }
  }
}

#
# Main loop
#

# Quote a string so that the shell treats it as one literal word: wrap it
# in single quotes and spell every embedded single quote as '\''.
function shell_quote(s,    n, parts, k, out) {
  n = split(s, parts, "'")
  out = "'" parts[1]
  for (k = 2; k <= n; k++) out = out "'\\''" parts[k]
  return out "'"
}

# First line of stdin: the header of the table that _is_ on stdin.
# Records its column names, resolves the join column in both tables,
# builds the join(1) command line and pipes the header into it.
NR == 1 {
  gsub(/\001/,"")                           # Remove SOH markers
  out_hdr = $0
  p = 0

  # Get column names and positions.
  # Make sure we pick the first occurrence of duplicated column
  # names (it may happen after a join).
  # Note that the header is taken to be table_2's whenever table_1 is
  # not "-"; this rule does not check that table_2 really is "-".

  if (tbl[1] == "-") {
    while (++p <= NF) {
      if (P1[$p] == NULL) P1[$p] = p
      N1[p] = $p
    }
  }
  else {
    while (++p <= NF) {
      if (P2[$p] == NULL) P2[$p] = p
      N2[p] = $p
    }
  }

  # Set default join column (always from table_1) if not specified.
  # In that case the first column of each table is used.

  if (jcol == NULL) {
     jcol = N1[1]
     j1 = j2 = 1
  }
  else {
     j1 = P1[jcol]
     j2 = P2[jcol]
  }

  # Complain if the requested column does not exist in either table.

  if (j1 == NULL) {
     print "jointable: column '" jcol "' not found in table " tbl[1] > stderr
     rc = 1
     exit(rc)
  }

  if (j2 == NULL) {
     print "jointable: column '" jcol "' not found in table " tbl[2] > stderr
     rc = 1
     exit(rc)
  }

  # Build output field list for join(1): the join column first, then the
  # remaining columns of table_1, then the remaining columns of table_2.
  # Columns named "." are left out.
  field_list = " 1." P1[N1[j1]]

  c = 0
  while (N1[++c] != NULL) {
    if (N1[c] != "." && c != j1)
      field_list = field_list ",1." P1[N1[c]]
  }

  c = 0
  while (N2[++c] != NULL) {
    if (N2[c] != "." && c != j2)
      field_list = field_list ",2." P2[N2[c]]
  }

  # Table names are shell-quoted so that names containing blanks or
  # shell metacharacters reach join(1) intact.
  join_args = join_args " -o " field_list
  join_args = join_args " -1 " j1 " -2 " j2
  join_args = join_args " " shell_quote(tbl[1]) " " shell_quote(tbl[2])

  # Using "exec" saves another 2-5 msecs.
  join_cmd = "LC_ALL=POSIX exec join -t \"\t\" " join_args

  if (debug) print join_cmd > stderr

  # Print header, putting the SOH marker back in front of every name.
  gsub(/\t/,"\t\001", out_hdr)
  cmd_is_open = 1
  print "\001" out_hdr |join_cmd
}

# Every line after the header is a data row of the stdin table:
# hand it to join(1) as it is.
NR > 1 {
  print $0 | join_cmd
}

END {
  # A non-zero rc means an earlier rule already failed; keep that status.
  # Otherwise, if the pipe to join(1) was opened, close it and use the
  # value returned by close() as the exit status.
  if (!rc && cmd_is_open) rc = close(join_cmd)
  exit(rc)
}

# End of program
