#!/usr/bin/mawk -We
# *********************************************************************
# nblparser: experimental NoSQL Brokering Language (NBL) interpreter.
#
# Copyright (c) 2003 Carlo Strozzi
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# *********************************************************************
# $Id: nblparser,v 1.2 2003/10/09 10:53:30 carlo Exp $

BEGIN {
  # Schema files are tab-separated, so FS/OFS are set to TAB while the
  # schema is parsed. FS is put back to the awk default at the end of
  # this block, before any NBL statement is read.
  NULL = ""; FS = OFS = "\t"; tmptable_cmd = "tmptable"

  # Get local settings.
  nosql_install = ENVIRON["NOSQL_INSTALL"]
  stdout = ENVIRON["NOSQL_STDOUT"]
  stderr = ENVIRON["NOSQL_STDERR"]

  # Set default values if necessary.
  if (nosql_install == NULL) nosql_install = "/usr/local/nosql"
  if (stdout == NULL) stdout = "/dev/stdout"
  if (stderr == NULL) stderr = "/dev/stderr"

  # Process command-line options and arguments. Each option is accepted
  # in both its short and long form; any other argument is taken as the
  # schema file name (if several are given, the last one wins).

  while (ARGV[++i] != NULL) {
    if (ARGV[i] == "-i" || ARGV[i] == "--input") i_file = ARGV[++i]
    else if (ARGV[i] == "-o" || ARGV[i] == "--output") o_file = ARGV[++i]
    else if (ARGV[i] == "-d" || ARGV[i] == "--delete") {
       tmptable_cmd = "tmptable --delete " ARGV[++i]
    }
    else if (ARGV[i] == "-U" || ARGV[i] == "--unsafe") unsafe = 1
    else if (ARGV[i] == "-n" || ARGV[i] == "--no-hide") nohide = 1
    else if (ARGV[i] == "-h" || ARGV[i] == "--help") {
       system("grep -v '^#' " nosql_install "/help/nblparser.txt")
       exit(rc=0)
    }
    else s_file = ARGV[i]
  }

  # Hide the options from awk's own argument processing: the only file
  # awk is allowed to read as main input is the one given with --input
  # (standard input otherwise).
  ARGC = 1					# Fix argv[]

  if (o_file == NULL) o_file = stdout
  if (i_file != NULL) { ARGV[1] = i_file; ARGC = 2 }

  if (s_file == "") {
     print "usage: nblparser [options] schema" > stderr
     exit(rc=1)
  }

  # Load schema into memory. The first line is the table header.
  if ((getline < s_file) <= 0) {
     print "nblparser: could not read schema file '"s_file"'" > stderr
     exit(rc=1)
  }

  # Remove SOH markers.
  gsub(/\001/, "")

  # Load the column position array: p[column name] = field number.
  i=0; while (++i <= NF) p[$i] = i

  # check for required fields.
  if (!p["Table"] || !p["Path"] || !p["Ext"] || !p["Key"]) {
     print "nblparser: missing field(s) in schema file '"s_file"'" > stderr
     exit(rc=1)
  }

  # Read the schema body, one row per array element starting at 1, and
  # collect the contents of the (optional) "Hide" column of every row.
  i=0
  while ((getline schema[++i] < s_file) > 0) {
    split(schema[i],a,"\t")
    if (a[p["Hide"]] != "") hide = hide " " a[p["Hide"]]
  }
  close(s_file);

  # The getline that hit end-of-file had already incremented i and
  # created an empty slot, so the number of rows actually read is i-1.
  delete schema[i]
  schema[0] = i - 1			# save array length

  FS = " "				# set default FS

  print "#!/bin/sh\nset -e" > o_file
}

# Read input NBL statements.

# skip comments and empty lines.
/^[ \t]*(#|$)/ { next }

# Dispatch on the NBL verb found in the first word of the statement.
# Verbs prefixed with "@" select the alternate behaviour of the same
# handler. Statements with an unknown verb are skipped. Add new NBL
# verbs as needed.
{
  if      ($1 == "set")		handle_set()
  else if ($1 == "read")	handle_read()
  else if ($1 == "expression")	handle_expr($1)
  else if ($1 == "select")	handle_expr($1)
  else if ($1 == "compute")	handle_expr($1)
  else if ($1 == "columns")	handle_column()
  else if ($1 == "join")	handle_join()
  else if ($1 == "@join")	handle_join("outer")
  else if ($1 == "order-by")	handle_orderby()
  else if ($1 == "@order-by")	handle_orderby("revert")
  else if ($1 == "remember-as")	handle_remember()
  else if ($1 == "totals")	handle_totals()
  else if ($1 == "@totals")	handle_totals("currency")
  else if ($1 == "format")	handle_format()
  else if ($1 == "@format")	handle_format("list")
  else if ($1 == "system")	handle_system()
  else if ($1 == "end")		exit(0)
  next
}

# ending stuff.
END {
  # An error was already reported: just propagate the exit status.
  if (rc) exit(rc)

  # debug
  #if (schema_row[1] != "") {
  #   printf("#") > o_file
  #   i=0
  #   while (schema_row[++i] != "") printf(" %s", schema_row[i]) > o_file
  #   printf("\n") > o_file
  #}

  # Unless hiding was disabled (or already done by a 'format'
  # statement), drop the hidden columns at the end of the last pipeline.
  if (hide != "" && !nohide)
     sh_cmd[sh_cmd[0]] = sh_cmd[sh_cmd[0]] " |\nnotcolumn " hide

  # Write out the generated commands, in order, up to the first
  # empty slot.
  for (i = 1; sh_cmd[i] != ""; i++) print sh_cmd[i] > o_file
}

# Make sure a table is currently open, i.e. that the "Table" field of
# schema_row is set. Otherwise report the problem on behalf of the NBL
# verb named by 'handler' and terminate with exit status 1.
function check_cmd(handler) {

  if (schema_row[p["Table"]] != "") return

  print "nblparser: NBL '" handler "': no active table, use 'read' first" > stderr
  exit(rc=1)
}

# NBL handlers

# NBL 'set name [value]': replace every "$[name]" token found in the
# in-memory schema rows with 'value' (or with nothing if no value is
# given). 'i' and 'target' are local variables.
function handle_set(			i, target) {

  if (NF != 2 && NF != 3) {
     print "nblparser: NBL usage: set name [value]" > stderr
     exit(rc=1)
  }

  if ($2 !~ /^[A-Za-z_][A-Za-z0-9_]*$/) {
     print "nblparser: NBL 'set': bad name in assignment" > stderr
     exit(rc=1)
  }

  # The value part must be acceptable as a directory or file name,
  # with no other path components.
  if ($3 != "" && ($3 ~ /^\.\.$/ || $3 !~ /^[-_.,=:+A-Za-z0-9]*$/)) {
     print "nblparser: NBL 'set': bad value in assignment" > stderr
     exit(rc=1)
  }

  # replace value in schema "Path" field.
  #
  # The token is matched through a dynamic regular expression, so the
  # "$", "[" and "]" characters must reach the regex engine preceded by
  # a backslash. The backslashes are doubled in the string literal
  # because the meaning of "\$" or "\[" inside an awk string is not
  # defined by POSIX and some implementations drop the backslash.
  # The value was validated above, so it contains neither "&" nor "\"
  # and is safe as a gsub() replacement.

  target = "\\$\\[" $2 "\\]"
  while (++i <= schema[0]) gsub(target,$3,schema[i])
}


# NBL 'read table [[@]key]': open a table and start a new pipeline.
#
# 'table' is either a name looked up in the schema or a "$variable"
# (typically set by a previous 'remember-as'), which is passed to the
# shell as a double-quoted file name. The optional key selects the rows
# to read; a leading "@" requests a partial (prefix) match.
#
# On success a new entry is appended to sh_cmd[] and schema_row[] holds
# the schema row of the active table. All arguments are local variables.
function handle_read(		i,a,sorted,k,pbc,file,partial,cmd,b) {

  if (schema_row[p["Table"]] != "") {
     print "nblparser: NBL 'read': cannot open multiple tables" > stderr
     exit(rc=1)
  }

  if (NF < 2 || NF > 3) {
     print "nblparser: NBL usage: read table [key]" > stderr
     exit(rc=1)
  }

  delete schema_row

  # a partial key string must begin with a "@" sign.
  partial = sub(/^@/,"",$3)

  if ($2 !~ /^\$?[A-Za-z_][A-Za-z0-9_]*$/) {
     print "nblparser: NBL 'read': bad table or variable name" > stderr
     exit(rc=1)
  }

  # check whether variable name.

  if ($2 ~ /^\$/) {

     schema_row[p["Table"]] = $2	# needed by other handlers

     # we assume that "set -e" is used in the output sh(1) script,
     # so that if test(1) fails the script terminates.

     file = "\"" $2 "\""
     cmd = "test -f " file "; "
  }

  # the key ends up inside shell and awk quoting, so only plain
  # alphanumeric keys are accepted.

  if ($3 != "") {
     if ($3 !~ /^[A-Za-z0-9]+$/) {
	print "nblparser: NBL 'read': bad key specified" > stderr
	exit(rc=1)
     }
  }

  # lookup target table in schema, unless "$table"

  if (file == "") {

    while (++i <= schema[0]) {
       split(schema[i],a,"\t")
       if (a[p["Table"]] == $2) {
	  split(schema[i],schema_row,"\t")
	  break				# bail-out if table found
       }
    }

    if (schema_row[p["Table"]] == "") {
       print "nblparser: NBL 'read': unknown table '" $2 "'" > stderr
       exit(rc=1)
    }

    if (schema_row[p["Path"]] ~ /\$\[/) {
       print "nblparser: NBL 'read': variable tokens in path name, use 'set' first" > stderr
       exit(rc=1)
    }

    file = schema_row[p["Path"]]

    # tables that are not sorted on the leftmost field should have
    # the schema Key field that begins with one or more hyphens,
    # followed by an optional slash.

    sorted = !sub(/^-+\/?/,"",schema_row[p["Key"]])

    # A non-empty Key means that the table is split over several files
    # below the Path directory. Key is a "/"-separated list of
    # "start,length" pairs; each pair extracts a substring of the
    # requested key which becomes one component of the file path.

    if (schema_row[p["Key"]] != "") {
       if (schema_row[p["Path"]] !~ /\/[ \t]*$/) {
	  print "nblparser: NBL 'read': invalid schema Path/Key pair" > stderr
	  exit(rc=1)
       }

       if (schema_row[p["Key"]] !~ /^[1-9,\/]+$/) {
	  print "nblparser: NBL 'read': invalid schema Key field" > stderr
	  exit(rc=1)
       }

       split(schema_row[p["Key"]],a,"/")

       i=0
       while (a[++i] != "") {
         if (split(a[i],b,",") != 2) {
	    print "nblparser: NBL 'read': bad schema Key field" > stderr
	    exit(rc=1)
         }
         k = substr($3,b[1],b[2])
         while (length(k) < b[2]) k = k "?"	# pad with wildcards
         pbc = pbc "/" k
       }

       sub (/^\//,"",pbc)			# strip leading slash
       file = schema_row[p["Path"]] pbc
    }
    file = file schema_row[p["Ext"]]
  }

  # A "?" in the file name means that the key was missing or too short
  # to identify a single file: let the shell expand the wildcards, glue
  # the files together with union and filter the rows afterwards.

  if (file ~ /\?/) {
     if ($3 != "") {
	if (partial) sh_cmd[++sh_cmd[0]] = "union " file " |\n" \
		"awktable -H '$1 ~ /^" $3 "/'"
	else sh_cmd[++sh_cmd[0]] = "union " file " |\n" \
		"awktable -H '$1 == \"" $3 "\"'"
     }
     else sh_cmd[++sh_cmd[0]] = "union " file
  }
  else if ($3 != "") {
     # single file with a key: keysearch is used only on tables that
     # are sorted on the leftmost field, awktable otherwise.
     if (partial) {
	if (sorted) sh_cmd[++sh_cmd[0]] = cmd "keysearch -p '" $3 "' " file
	else sh_cmd[++sh_cmd[0]] = cmd "awktable -H '$1 ~ /^" $3 "/' < " file
     }
     else if (sorted) sh_cmd[++sh_cmd[0]] = cmd "keysearch '" $3 "' " file
     else sh_cmd[++sh_cmd[0]] = cmd "awktable -H '$1 == \"" $3 "\"' < " file
  }
  else sh_cmd[++sh_cmd[0]] = cmd "cat " file
}


# NBL 'expression', 'select' and 'compute': append an AWK-based filter
# to the current pipeline. 'what' is the NBL verb; the rest of the
# statement is the AWK code handed over to the filter:
#
#   select      ->  row "statements"
#   compute     ->  compute "statements"
#   expression  ->  awktable -H "statements{print}"
#
# 'a', 'i', 'j' and 'regexp' are local variables.
function handle_expr(what,		a,i,j,regexp) {

  check_cmd(what)

  if (NF < 2) {
     print "nblparser: NBL usage: " what " statements" > stderr
     exit(rc=1)
  }

  # Strip the verb from the statement. Leading blanks are allowed for,
  # because the dispatching rules match on $1 and therefore accept
  # indented statements too.

  regexp = "^[ \t]*" what "[ \t]+"
  sub(regexp,"")

  if (what == "select") what = "row"

  # Forbid AWK's dangerous instructions, if necessary. The following
  # is a rough check, which may be stricter than necessary in some cases,
  # but better to be safe than sorry.

  if (!unsafe) {

     # break the statement into runs of lower-case letters and reject
     # it if any of them is print, printf, getline or system.

     i = split($0,a,/[^a-z]+/)

     while (++j <= i) {
       if (a[j] ~ /^(printf?|getline|system)$/) {
	  print "nblparser: unsafe AWK instruction specified" > stderr
	  exit(rc=1)
       }
     }
  }

  # escape sh(1) special characters in statements (backslash, dollar,
  # backquote and double quote), as the statements are written between
  # double quotes in the output script.

  gsub(/\\/, "\\\\\\")
  gsub(/\$/, "\\$")
  gsub(/`/, "\\`")
  gsub(/"/,"\\\"")

  if (what == "expression") {
     what = "awktable -H"
     $0 = $0 "{print}"
  }
  sh_cmd[sh_cmd[0]] = sh_cmd[sh_cmd[0]] " |\n" what " \"" $0 "\""
}


# NBL 'columns col [col ...]': append a "column" filter, which receives
# the listed column names, to the current pipeline. 'i' and 'a' are
# local variables.
function handle_column(			i,a) {

  check_cmd("columns")

  if (NF < 2) {
     print "nblparser: NBL usage: columns col [col ...]" > stderr
     exit(rc=1)
  }

  # Strip the verb. Leading blanks are allowed for, because the
  # dispatching rules match on $1 and therefore accept indented
  # statements; without this the verb itself would be passed on as a
  # column name.
  sub(/^[ \t]*columns[ \t]+/,"")

  # every remaining word must look like a column name.
  split($0,a)
  while (a[++i] != "") {
     if (a[i] !~ /^[A-Za-z_][A-Za-z0-9_]*$/) {
	print "nblparser: NBL 'columns': bad column names(s) specified" > stderr
	exit(rc=1)
     }
  }

  sh_cmd[sh_cmd[0]] = sh_cmd[sh_cmd[0]] " |\ncolumn " $0
}


# NBL '[@]join table1 table2 [[@]column [[@]key]]': append a
# "jointable" step to the current pipeline.
#
# Exactly one of the two tables must be "-", meaning the output of the
# pipeline built so far; the other one is either a name looked up in
# the schema or a "$variable". With 'outer' set (the "@join" verb)
# jointable is called with --all. 'column' is the join column; a
# leading "@" forces the non-stdin table to be sorted on it. 'key' is
# used to work out the file name of tables that are split over several
# files. All arguments but 'outer' are local variables.
function handle_join(outer,		i,a,b,k,sorted,partial,pbc,\
					join_row,tbl,cmd,pk,sort,file) {

  if (outer != "") check_cmd("@join")
  else check_cmd("join")

  if (NF < 3 || NF > 5) {
     print "nblparser: NBL usage: [@]join table1 table2 [[@]column key]" > stderr
     exit(rc=1)
  }

  # set default arguments

  if ($3 == "") $3 = "-"
  if ($4 == "") $4 = "-"

  # either one or the other table, but not both, must be on stdin.

  if (($2 == "-" && $3 == "-") || ($2 != "-" && $3 != "-")) {
     print "nblparser: NBL 'join': one (and only one) of the input tables must be on stdin" > stderr
     exit(rc=1)
  }

  if (($2 != "-" && $2 !~ /^@?\$?[A-Za-z_][A-Za-z0-9_]*$/) || \
      ($3 != "-" && $3 !~ /^@?\$?[A-Za-z_][A-Za-z0-9_]*$/)) {
     print "nblparser: NBL 'join': bad table or variable name" > stderr
     exit(rc=1)
  }

  cmd = "jointable"
  if (outer != "") cmd = cmd " --all"

  # tbl is the table that is *not* on stdin.

  if ($2 != "-") tbl = $2
  else tbl = $3

  # a partial key string must begin with a "@" sign.
  partial = sub(/^@/,"",$5)

  # check whether variable name: a "$variable" is handed over to the
  # shell as it is, between double quotes, and is not looked up in
  # the schema.

  if (tbl ~ /^\$/) file = "\"" tbl "\""

  # lookup target table in schema, unless "$table"

  if (file == "") {

    while (++i <= schema[0]) {
       split(schema[i],a,"\t")
       if (a[p["Table"]] == tbl) {
	  split(schema[i],join_row,"\t")
	  break					# bail-out if table found
       }
    }

    if (join_row[p["Table"]] == "") {
       print "nblparser: NBL 'join': unknown table '" tbl "'" > stderr
       exit(rc=1)
    }

    if (join_row[p["Path"]] ~ /\$\[/) {
       print "nblparser: NBL 'join': variable tokens in file name, use 'set' first" > stderr
       exit(rc=1)
    }

    file = join_row[p["Path"]]

    # tables that are not sorted on the leftmost field should have
    # the schema Key field that begins with one or more hyphens,
    # followed by an optional slash (same syntax as in 'read').

    sorted = !sub(/^-+\/?/,"",join_row[p["Key"]])

    # A non-empty Key means that the table is split over several files
    # below the Path directory. Key is a "/"-separated list of
    # "start,length" pairs; each pair extracts a substring of the
    # requested key which becomes one component of the file path.

    if (join_row[p["Key"]] != "") {
       if (join_row[p["Path"]] !~ /\/[ \t]*$/) {
	  print "nblparser: NBL 'join': invalid schema Path/Key pair" > stderr
	  exit(rc=1)
       }

       # the separator accepted here must be the one used by split()
       # below, or multi-component keys could never be valid.

       if (join_row[p["Key"]] !~ /^[1-9,\/]+$/) {
	  print "nblparser: NBL 'join': invalid schema Key field" > stderr
	  exit(rc=1)
       }

       split(join_row[p["Key"]],a,"/")

       i=0
       while (a[++i] != "") {
	 if (split(a[i],b,",") != 2) {
	    print "nblparser: NBL 'join': bad schema Key field" > stderr
	    exit(rc=1)
	 }
	 k = substr($5,b[1],b[2])
	 while (length(k) < b[2]) k = k "?"	# pad with wildcards
	 pbc = pbc "/" k
       }

       sub (/^\//,"",pbc)			# strip leading slash
       file = join_row[p["Path"]] pbc
    }
    file = file join_row[p["Ext"]]
  }

  # join column from both tables, if specified.

  if ($4 != "-") {
     # same naming rule as in 'columns', apart from the leading
     # underscore: single-character names are valid too.
     if ($4 !~ /^@?[A-Za-z][A-Za-z0-9_]*$/) {
	print "nblparser: NBL 'join': bad column name specified" > stderr
	exit(rc=1)
     }

     # an "@" sign at the beginning of the join column requests that the
     # one of the two tables that is *not* on stdin be sorted on the
     # specified field, even if that field is the table PK. By default,
     # if the named field is the table PK it is assumed to be already
     # sorted in the table.

     if (sub(/^@/,"",$4)) sort = 1
     cmd = cmd " --column " $4
  }

  if (file ~ /\?/) {
     # several files: glue them together (and sort, if requested) into
     # a temporary table at run time.
     tbl = "$(union " file
     if (sort) {
	tbl = tbl " | sorttable"
	if ($4 != "-") tbl = tbl " " $4
     }
     tbl = tbl " | " tmptable_cmd ")"
  }
  else {
     # The table PK is taken to be the first column named in the table
     # header, which is read here, at parse time. If the file cannot
     # be read pk is left empty.
     getline pk < file
     close(file)

     sub(/^\001/,"",pk); sub(/\t.*/,"",pk)

     # force sorting if the specified field isn't the table PK.
     if ($4 != "-" && $4 != pk) sort = 1

     if (sort) {
	tbl = "$(sorttable --input " file
	if ($4 != "-") tbl = tbl " " $4
	tbl = tbl " | " tmptable_cmd ")"
     }
     else tbl = file
  }

  # keep the tables in the order they were given in the statement.

  if ($2 != "-") {
     cmd = cmd " " tbl " -"
  }
  else cmd = cmd " - " tbl

  sh_cmd[sh_cmd[0]] = sh_cmd[sh_cmd[0]] " |\n" cmd
}


# NBL '[@]order-by [col[:flag] ...]': append a "sorttable" step to the
# current pipeline. With 'revert' set (the "@order-by" verb) sorttable
# is called with -r. Each column name may carry a one-letter flag, out
# of "Mbdfinr", after a colon; the column list is passed on to
# sorttable as it is. 'i' and 'cmd' are local variables.
function handle_orderby(revert,		i,cmd) {

  if (revert != "") check_cmd("@order-by")
  else check_cmd("order-by")

  cmd = "sorttable"
  if (revert != "") cmd = cmd " -r"

  if ($2 == "-") NF=1			# allow "-" to mean "all columns"

  for (i=2; i<=NF; i++) {
      # the name part may be a single character, consistently with
      # what 'columns' and 'totals' accept.
      if ($i !~ /^[A-Za-z][A-Za-z0-9_]*(:[Mbdfinr])?$/) {
	 print "nblparser: NBL 'order-by': bad column name specified" > stderr
	 exit(rc=1)
      }
      cmd = cmd " " $i
  }
  sh_cmd[sh_cmd[0]] = sh_cmd[sh_cmd[0]] " |\n" cmd
}


# NBL 'remember-as name': wrap the current pipeline, followed by the
# temporary-table command, into a "name=$( ... )" shell assignment, and
# close the active table.
function handle_remember() {

  check_cmd("remember-as")

  if (NF != 2) {
     print "nblparser: NBL usage: remember-as name" > stderr
     exit(rc=1)
  }

  # Only lower-case variable names are accepted, so that the usual
  # (upper-case) shell environment variables cannot be overwritten.

  if (!($2 ~ /^[a-z][a-z0-9_]*$/)) {
     print "nblparser: NBL 'remember-as': bad name specified" > stderr
     exit(rc=1)
  }

  sh_cmd[sh_cmd[0]] = $2 "=$(\n" \
	sh_cmd[sh_cmd[0]] " |\n" \
	tmptable_cmd "\n)"

  # from now on a new 'read' is necessary.
  delete schema_row
}


# NBL '[@]format': end the current pipeline with a formatting filter,
# "tabletolist --justify" if 'list' is set (the "@format" verb) and
# "justify" otherwise, then close the active table.
function handle_format(list) {

  # Hidden columns must be removed before the table is formatted; once
  # done here, END{} is told not to do it again.
  if (hide != "" && !nohide) {
     sh_cmd[sh_cmd[0]] = sh_cmd[sh_cmd[0]] " |\nnotcolumn " hide
     nohide = 1
  }

  if (list == "") {
     check_cmd("format")
     sh_cmd[sh_cmd[0]] = sh_cmd[sh_cmd[0]] " |\njustify"
  }
  else {
     check_cmd("@format")
     sh_cmd[sh_cmd[0]] = sh_cmd[sh_cmd[0]] " |\ntabletolist --justify"
  }

  # from now on a new 'read' is necessary.
  delete schema_row
}


# NBL '[@]totals [col ...]': append a "total -r" step, followed by the
# listed column names, to the current pipeline. With 'currency' set
# (the "@totals" verb) the step is "total -c -r". 'i', 'a' and 'cmd'
# are local variables.
function handle_totals(currency,	i,a,cmd) {

  if (currency != "") {
     check_cmd("@totals")
     cmd = "total -c -r"
  }
  else {
     check_cmd("totals")
     cmd = "total -r"
  }

  # Strip the verb. Leading blanks are allowed for, because the
  # dispatching rules match on $1 and therefore accept indented
  # statements; without this the verb itself would be taken as a
  # column name (or rejected, in the "@totals" case).
  sub(/^[ \t]*@?totals[ \t]*/,"")

  # every remaining word must look like a column name.
  split($0,a)
  while (a[++i] != "") {
     if (a[i] !~ /^[A-Za-z_][A-Za-z0-9_]*$/) {
	print "nblparser: NBL 'totals': bad column names(s) specified" > stderr
	exit(rc=1)
     }
  }

  # the pipeline is extended only once the statement has been validated.
  sh_cmd[sh_cmd[0]] = sh_cmd[sh_cmd[0]] " |\n" cmd " " $0
}


# NBL 'system commands': copy arbitrary shell commands to the output
# script. Only allowed if nblparser was started with --unsafe.
#
# If a table is active the commands are piped to from the current
# pipeline; otherwise they are written on a line of their own after
# the most recent sh_cmd[] entry.
function handle_system() {

  if (!unsafe) {
     print "nblparser: NBL 'system': command not allowed" > stderr
     exit(rc=1)
  }

  if (NF < 2) {
     print "nblparser: NBL usage: system commands" > stderr
     exit(rc=1)
  }

  # Strip the verb. Leading blanks are allowed for, because the
  # dispatching rules match on $1 and therefore accept indented
  # statements.
  sub(/^[ \t]*system[ \t]*/,"")

  # Nothing has been generated yet (no 'read' so far): open the first
  # sh_cmd[] entry. Appending to entry sh_cmd[0] while sh_cmd[0] is
  # still unset would store the commands under an empty subscript,
  # which END{} never writes out.
  if (!sh_cmd[0]) {
     sh_cmd[++sh_cmd[0]] = $0
     return
  }

  if (schema_row[p["Table"]] != "")
	sh_cmd[sh_cmd[0]] = sh_cmd[sh_cmd[0]] " |"
  sh_cmd[sh_cmd[0]] = sh_cmd[sh_cmd[0]] "\n" $0
}

# End of program
