/* File "patterns.c":
 * Malaga pattern matching. */

/* This file is part of Malaga, a system for Left Associative Grammars.
 * Copyright (C) 1995-1998 Bjoern Beutel
 *
 * Bjoern Beutel
 * Universitaet Erlangen-Nuernberg
 * Abteilung fuer Computerlinguistik
 * Bismarckstrasse 12
 * D-91054 Erlangen
 * e-mail: malaga@linguistik.uni-erlangen.de 
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include "basic.h"

#undef GLOBAL
#define GLOBAL

#include "patterns.h"

/* constants ================================================================*/

#define PATTERN_MAX 500        /* capacity of <pattern_buffer> in chars;
                                * exceeding it yields "pattern too complex" */
#define PATTERN_STACK_MAX 50   /* maximum number of pending backtrack entries
                                * on the stack in "match_pattern" */
#define PATTERN_STRING_MAX 500 /* "match_pattern" rejects strings whose length
                                * is greater than or equal to this value */

/* These are the instructions for matching a pattern.
 *
 * A pattern is a 0-terminated sequence of CHARs, defined as follows:
 * C[] is the code vector.
 * PC means pattern counter (pointer to char following a PAT_ instruction).
 * S[] is the string to be examined.
 * I is the index into the string.
 * CS[] is the code stack and IS[] is the string index stack.
 * SP is the stack pointer (same for CS[] and IS[]).
 * VS[] and VE[] are vectors to store the index of start and end of vars. */
enum
{
  PAT_ACCEPT,          /* if S[I] == EOS then return TRUE; else fail;
                        * (this code is 0, so it also terminates the
                        * pattern string) */
  PAT_JUMP,            /* PC += (byte_t) C[PC]; */      
  PAT_JUMP_NOW,        /* SP++; CS[SP] = PC+1; IS[SP] = I; */
                       /* PC += (byte_t) C[PC]; */
  PAT_JUMP_LATER,      /* SP++; CS[SP] = PC + (byte_t) C[PC]; */
                       /* IS[SP] = I; PC++; */
  PAT_MATCH_ANY,       /* if S[I] != EOS then I++; else fail; */
  PAT_MATCH_CLASS,     /* C[PC] is the number of chars in the class.
                        * if (S[I] != EOS
                        *     and lower-cased S[I] in {C[PC+1],..,C[PC+C[PC]]})
                        *   {I++; PC += C[PC]+1;}
                        * else
                        *   fail; */ 
  PAT_MATCH_NOT_CLASS, /* C[PC] is the number of chars in the class.
                        * if (S[I] == EOS
                        *     or lower-cased S[I] in {C[PC+1],..,C[PC+C[PC]]})
                        *   fail;
                        * else
                        *   {I++; PC += C[PC]+1;} */
  PAT_START_VAR_0,     /* VS[0] = I; */
  PAT_START_VAR_1,     /* VS[1] = I; */
  PAT_START_VAR_2,     /* VS[2] = I; */
  PAT_START_VAR_3,     /* VS[3] = I; */
  PAT_START_VAR_4,     /* VS[4] = I; */
  PAT_END_VAR_0,       /* VE[0] = I; */
  PAT_END_VAR_1,       /* VE[1] = I; */
  PAT_END_VAR_2,       /* VE[2] = I; */
  PAT_END_VAR_3,       /* VE[3] = I; */
  PAT_END_VAR_4        /* VE[4] = I; */
  /* All other codes must match the lower-cased S[I] literally;
   * on a match I++, else fail. */
};


/* Scratch area in which "compile_pattern" and its helpers assemble the
 * pattern code. "compile_pattern" resets <pattern_length> to 0 on each call
 * and returns a copy of the buffer contents. */
LOCAL char pattern_buffer[PATTERN_MAX]; /* buffer for compiled pattern */
LOCAL short_t pattern_length; /* number of chars used in <pattern_buffer> */


/* functions ================================================================*/

LOCAL char pattern_char (string_t *string_ptr)
/* Read one literal character of a pattern source text at *<string_ptr>.
 * Accepted are
 * - a backslash followed by any char that is not below ' ' (the char behind
 *   the backslash is taken), and
 * - any single char that is not below ' ' and is not one of the special
 *   chars of the pattern syntax.
 * The lower-cased character is returned and *<string_ptr> is moved behind
 * the text that has been read.
 * Anything else is reported via "error"; in that case 0 is the result and
 * *<string_ptr> is left where it was. */
{
  string_t pos = *string_ptr;
  char code;
  int consumed; /* number of source chars that make up the character */

  if (pos[0] == '\\' && ORD (pos[1]) >= ORD (' '))
  {
    /* Escape sequence: the char behind the backslash stands for itself. */
    code = TO_LOWER (pos[1]);
    consumed = 2;
  }
  else if (ORD (pos[0]) < ORD (' ')
           || strchr ("*\"?+-.^\\[]|.", pos[0]) != NULL)
  {
    /* Control char, end of string, or an unescaped special char. */
    error ("invalid char \"%c\" in pattern", pos[0]);
    code = 0;
    consumed = 0;
  }
  else
  {
    /* Ordinary char. */
    code = TO_LOWER (pos[0]);
    consumed = 1;
  }

  *string_ptr = pos + consumed;
  return code;
}

/*---------------------------------------------------------------------------*/

LOCAL void add_to_pattern (char c)
/* Append <c> to the pattern code that is currently being assembled in
 * <pattern_buffer>. A full buffer is reported via "error". */
{
  if (! (pattern_length < PATTERN_MAX))
    error ("pattern too complex");

  pattern_buffer[pattern_length] = c;
  pattern_length++;
}

/*---------------------------------------------------------------------------*/

LOCAL char distance (short_t from, short_t to)
/* Compute the offset that leads from index <from> to index <to> and return
 * it as a char, so it can be stored in the pattern code.
 * An offset outside of -128..127 is reported via "error". */
{
  short_t offset = to - from;

  if (offset < -128 || offset > 127)
    error ("pattern too complex");

  return (char) offset;
}

/*---------------------------------------------------------------------------*/

LOCAL void insert_in_pattern (short_t n, short_t index)
/* Open a gap of <n> chars at <pattern_buffer>[<index>]: every char from
 * <index> up to the end of the pattern code is shifted <n> positions upward
 * and <pattern_length> grows by <n>. The gap itself is not initialised.
 * If the result would not fit in the buffer, this is reported via "error". */
{
  short_t pos;

  if (pattern_length + n > PATTERN_MAX)
    error ("pattern too complex");

  /* Copy from the highest index downward, so that no char is overwritten
   * before it has been moved. */
  for (pos = pattern_length - 1; pos >= index; pos--)
    pattern_buffer[pos + n] = pattern_buffer[pos];

  pattern_length += n;
}

/*---------------------------------------------------------------------------*/

LOCAL void compile_char_class (string_t *string_ptr)
/* Compile a character class. (to be called from "compile_atom")
 * On entry, *<string_ptr> points behind the opening "["; on return it points
 * behind the closing "]".
 * The code that is appended to the pattern consists of
 * - PAT_MATCH_NOT_CLASS if the class starts with "^", else PAT_MATCH_CLASS,
 * - one char holding the number of class members, and
 * - the class members themselves. A range "x-y" is expanded to all chars
 *   from x to y.
 * Invalid chars, inverted ranges and classes that are too long are reported
 * via "error". */
{
  string_t s = *string_ptr;
  short_t char_class = pattern_length; /* beginning of the char class */
  
  if (*s == '^') 
  {
    s++;
    add_to_pattern (PAT_MATCH_NOT_CLASS);
  } 
  else
    add_to_pattern (PAT_MATCH_CLASS);
  
  add_to_pattern (0); /* class length - will be fixed later */
  do /* Read chars and ranges. */
  {
    u_byte_t c, ce;
    
    c = ORD (pattern_char (&s));
    if (*s == '-') 
    {
      short_t code;

      s++;
      ce = ORD (pattern_char (&s));
      if (c > ce)
        error ("invalid range \"%c-%c\" in pattern", (char) c, (char) ce);
      
      /* Count in a type that is wider than <c>. Counting in <c> itself
       * would never terminate for a range that ends at the highest char
       * code, because incrementing an unsigned value at its maximum wraps
       * around to 0 (assuming u_byte_t is an 8-bit unsigned type, as its
       * name suggests - TODO confirm in "basic.h"). */
      for (code = c; code <= ce; code++)
        add_to_pattern ((char) code);
    }
    else 
      add_to_pattern ((char) c);
    
  } while (*s != ']');
  
  s++; /* Skip the closing "]". */
  
  /* Fix class length: the number of chars that follow the length char. */
  pattern_buffer[char_class+1] = distance (char_class+2, pattern_length);

  *string_ptr = s;
}

/*---------------------------------------------------------------------------*/

LOCAL void compile_atom (string_t *string_ptr)
/* Compile one atom together with an optional postfix operator.
 * (to be called from "compile_pattern")
 * An atom is a character class "[...]", the wildcard "." or a single
 * character. The postfix operators are "?", "*" and "+".
 * *<string_ptr> is moved behind the text that has been compiled. */
{
  string_t cursor = *string_ptr;
  short_t atom_start = pattern_length; /* where the atom's code begins */
  char postfix;

  /* Emit the code for the atom itself. */
  switch (*cursor)
  {
  case '[':
    cursor++;
    compile_char_class (&cursor);
    break;

  case '.':
    cursor++;
    add_to_pattern (PAT_MATCH_ANY);
    break;

  default:
    {
      char literal = pattern_char (&cursor);

      add_to_pattern (literal);
    }
    break;
  }

  /* Look for a postfix operator and consume it. */
  postfix = *cursor;
  if (postfix == '?' || postfix == '*' || postfix == '+')
    cursor++;

  /* "*" and "+" may repeat the atom: behind the atom, emit a
   * PAT_JUMP_LATER that leads back to the beginning of the atom. */
  if (postfix == '*' || postfix == '+')
  {
    add_to_pattern (PAT_JUMP_LATER);
    add_to_pattern (distance (pattern_length, atom_start));
  }

  /* "?" and "*" may skip the atom: in front of the atom, insert a
   * PAT_JUMP_NOW that leads behind all code emitted for this atom.
   * The insertion shifts the atom (and the backward jump of "*") by two
   * chars and enlarges <pattern_length> accordingly. */
  if (postfix == '?' || postfix == '*')
  {
    insert_in_pattern (2, atom_start);
    pattern_buffer[atom_start] = PAT_JUMP_NOW;
    pattern_buffer[atom_start + 1] = distance (atom_start + 1,
                                               pattern_length);
  }

  *string_ptr = cursor;
}

/*---------------------------------------------------------------------------*/

GLOBAL string_t compile_pattern (string_t string)
/* Translate the pattern source text <string> into pattern code that can be
 * passed to "match_pattern". Alternatives are separated by "|"; everything
 * else is handed to "compile_atom".
 * The result is a newly allocated copy of the code; the caller must release
 * it with "free" after usage. */
{
  string_t cursor;
  short_t alt_start = 0;   /* index where the current alternative begins */
  bool_t seen_bar = FALSE; /* TRUE as soon as a "|" has been compiled */

  pattern_length = 0;
  for (cursor = string; *cursor != EOS; )
  {
    if (*cursor != '|')
    {
      compile_atom (&cursor);
      continue;
    }

    cursor++; /* Skip the "|". */

    /* Close the current alternative with a jump that is meant to lead
     * behind all alternatives. Its offset is filled in later. */
    add_to_pattern (PAT_JUMP);
    add_to_pattern (0);

    /* Put a PAT_JUMP_LATER in front of the current alternative that leads
     * to the beginning of the next alternative. */
    insert_in_pattern (2, alt_start);
    pattern_buffer[alt_start] = PAT_JUMP_LATER;
    pattern_buffer[alt_start + 1] = distance (alt_start + 1, pattern_length);

    /* The closing jump of the previous alternative now leads to the
     * closing jump that has just been appended. */
    if (seen_bar)
      pattern_buffer[alt_start - 1] = distance (alt_start - 1,
                                                pattern_length - 2);

    alt_start = pattern_length;
    seen_bar = TRUE;
  }

  /* The closing jump in front of the last alternative leads to the
   * PAT_ACCEPT that is appended below. */
  if (seen_bar)
    pattern_buffer[alt_start - 1] = distance (alt_start - 1, pattern_length);

  add_to_pattern (PAT_ACCEPT);
  return new_string (pattern_buffer);
}

/*---------------------------------------------------------------------------*/

GLOBAL string_t mark_var_pattern (string_t pattern, byte_t pattern_var_no)
/* Build a copy of <pattern> that is enclosed in the start and end marks of
 * pattern variable number <pattern_var_no>, so that the string matching
 * <pattern> will be stored in <pattern_var[pattern_var_no]>.
 * <pattern> is freed.
 * The result pattern must be freed with "free" after use. */
{
  string_t marked;
  size_t i;

  /* Room for the start mark, the old code, the end mark and PAT_ACCEPT. */
  marked = new_mem (strlen (pattern) + 3);

  marked[0] = PAT_START_VAR_0 + pattern_var_no;
  for (i = 0; pattern[i] != PAT_ACCEPT; i++)
    marked[i + 1] = pattern[i];
  marked[i + 1] = PAT_END_VAR_0 + pattern_var_no;
  marked[i + 2] = PAT_ACCEPT;

  free (pattern);
  return marked;
}

/*---------------------------------------------------------------------------*/

GLOBAL string_t join_alternatives (string_t pattern1, string_t pattern2)
/* Build a pattern that matches whatever <pattern1> or <pattern2> matches.
 * The layout of the result is:
 *   PAT_JUMP_LATER, offset to the code of <pattern2>,
 *   code of <pattern1>,
 *   PAT_JUMP, offset to the final PAT_ACCEPT,
 *   code of <pattern2>,
 *   PAT_ACCEPT.
 * <pattern1> and <pattern2> are freed.
 * The result pattern must be freed with "free" after use. */
{
  string_t joined, out, in;
  size_t length1 = strlen (pattern1);
  size_t length2 = strlen (pattern2);

  /* Two jump instructions of two chars each, plus the final PAT_ACCEPT. */
  joined = new_mem (length1 + length2 + 5);
  out = joined;

  *out++ = PAT_JUMP_LATER;
  *out++ = distance (0, length1 + 3);
  in = pattern1;
  while (*in != PAT_ACCEPT)
    *out++ = *in++;

  *out++ = PAT_JUMP;
  *out++ = distance (0, length2 + 1);
  in = pattern2;
  while (*in != PAT_ACCEPT)
    *out++ = *in++;

  *out++ = PAT_ACCEPT;

  free (pattern1);
  free (pattern2);
  return joined;
}

/*---------------------------------------------------------------------------*/

GLOBAL string_t join_concatenation (string_t pattern1, string_t pattern2)
/* Build the pattern that consists of <pattern1> followed by <pattern2>.
 * Both argument patterns are freed.
 * The result pattern must be freed with "free" after use. */
{
  string_t joined = concat_strings (pattern1, pattern2, NULL);

  free (pattern1);
  free (pattern2);

  return joined;
}

/*---------------------------------------------------------------------------*/

GLOBAL bool_t match_pattern (string_t string, string_t pattern)
/* Test whether <string> matches <pattern> (a string of chars compiled with
 * "compile_pattern") and set substring indices in <pattern_var>.
 * Return TRUE if the pattern code reaches PAT_ACCEPT at the end of <string>,
 * FALSE if all paths through the pattern have failed.
 * The substrings in <pattern_var> point into a static copy of <string>;
 * they remain valid till "match_pattern" is called again.
 * A string of PATTERN_STRING_MAX or more chars and a pattern that needs more
 * than PATTERN_STACK_MAX backup entries are reported via "error". */
{
  struct {string_t string, pattern;} stack[PATTERN_STACK_MAX]; 
  /* backup stack: each entry is a position in the string and a position in
   * the pattern code from where matching is resumed after a mismatch. */

  short_t sp; /* number of entries on <stack> */
  bool_t found_mismatch;
  static char s[PATTERN_STRING_MAX+1];

  sp = 0;
  found_mismatch = FALSE;

  /* Create a copy of the string as a buffer
   * (for variables defined as part of the pattern). */
  /* NOTE(review): the copy below is only safe if "error" does not return
   * - confirm. Strings of exactly PATTERN_STRING_MAX chars are rejected
   * although the buffer has room for them. */
  if (strlen (string) >= PATTERN_STRING_MAX)
    error ("string is too long for pattern matching");
  strcpy (s, string);
  string = s;
  
  while (! found_mismatch) 
  {
    /* Fetch the next code; <pattern> now points to its operand, if any. */
    char code = *pattern++;
    
    switch (code) 
    {
    case PAT_ACCEPT:
      /* The pattern is exhausted; succeed only if the string is, too. */
      if (*string == EOS)
	return TRUE;
      else
	found_mismatch = TRUE;
      break;

    case PAT_JUMP:
      /* Unconditional jump, relative to the position of the offset char. */
      pattern = pattern + (byte_t) *pattern;
      break;
      
    case PAT_JUMP_NOW:
      /* Take the jump now; remember the code behind the offset char as the
       * path to try if the jump leads to a mismatch. */
      if (sp == PATTERN_STACK_MAX)
	error ("match pattern is too complex");
      
      stack[sp].string = string;
      stack[sp].pattern = pattern + 1;
      sp++;
      pattern = pattern + (byte_t) *pattern;
      break;
      
    case PAT_JUMP_LATER:
      /* Continue behind the offset char; remember the jump target as the
       * path to try if continuing leads to a mismatch. */
      if (sp == PATTERN_STACK_MAX)
	error ("match pattern is too complex");
      
      stack[sp].string = string;
      stack[sp].pattern = pattern + (byte_t) *pattern;
      sp++;
      pattern++;
      break;
      
    case PAT_MATCH_ANY:
      /* Any char matches, but the end of the string does not. */
      if (*string == EOS)
	found_mismatch = TRUE;
      else
	string++;
      break;
      
    case PAT_MATCH_CLASS:
      if (*string == EOS)
	found_mismatch = TRUE;
      else 
      {
	string_t index;
	
	/* <*pattern> is the class length; the class members follow it.
	 * Move <pattern> behind the class and search the members for the
	 * lower-cased current char. */
	index = pattern + 1;
	pattern += (byte_t) *pattern + 1;
	while (index < pattern && TO_LOWER (*string) != *index)
	  index++;
	
	/* The search ran off the end: the char is not in the class. */
	if (index >= pattern)
	  found_mismatch = TRUE;
	else
	  string++;
      }
      break;
      
    case PAT_MATCH_NOT_CLASS:
      if (*string == EOS)
	found_mismatch = TRUE;
      else
      {
	string_t index;
	
	/* Same search as for PAT_MATCH_CLASS, with the opposite verdict. */
	index = pattern + 1;
	pattern += (byte_t) *pattern + 1;
	while (index < pattern && TO_LOWER (*string) != *index)
	  index++;
	
	/* The search stopped early: the char is in the class. */
	if (index < pattern)
	  found_mismatch = TRUE;
	else
	  string++;
      }
      break;
      
    case PAT_START_VAR_0:
    case PAT_START_VAR_1:
    case PAT_START_VAR_2:
    case PAT_START_VAR_3:
    case PAT_START_VAR_4:
      /* Record the current position in the copy <s> as start of the
       * variable. The value is not reset if this path fails later. */
      pattern_var[code - PAT_START_VAR_0].start = string;
      break;
      
    case PAT_END_VAR_0:
    case PAT_END_VAR_1:
    case PAT_END_VAR_2:
    case PAT_END_VAR_3:
    case PAT_END_VAR_4:
      /* Record the current position in the copy <s> as end of the
       * variable. */
      pattern_var[code - PAT_END_VAR_0].end = string;
      break;
      
    default:
      /* Any other code is a literal char; it is compared with the
       * lower-cased current char of the string. At the end of the string
       * this is a mismatch, since a literal code is never 0. */
      if (code != TO_LOWER (*string))
	found_mismatch = TRUE;
      else
	string++;
    }
    
    /* If this path was not successful and there is another path, try it. */
    if (found_mismatch && sp > 0) 
    {
      sp--;
      string = stack[sp].string;
      pattern = stack[sp].pattern;
      found_mismatch = FALSE;
    }
  }
  
  /* A mismatch occurred and no alternative path is left. */
  return FALSE;
}
