/*
 *	sortdic.c - sort a dictionary
 *
 *	It does the same as "sort -s +0 -1 | uniq | gawk -e '{printf("%s\0",$$0)}'".
 *
 *	by A.Kitauchi <akira-k@is.aist-nara.ac.jp>, Oct. 1996
 *
 */

#include <stdio.h>
#include <string.h>
#include "chadic.h"

/* Size of the line buffer used with fgets(); one call reads at most
   LINEMAX - 1 characters of a line. */
#define LINEMAX 8192

/*
 * One record per input line.  An array of these records is what qsort()
 * rearranges; the line text itself is never moved.
 */
typedef struct _line_info {
    /* Tie-breaker for equal keys: the line index when the whole file is
       held in memory, otherwise the ftell() offset of the line in the
       input file (also used to seek back to the line when writing). */
    long ptr;
    /* Sort key string compared by ustrcmp() ("midasi" is Japanese for
       headword); it starts at the beginning of the line's text. */
    char *midasi;
} line_info;

/*
 * Block pool used by cha_malloc_char(): memory is obtained in blocks of
 * BLOCK_SIZE bytes, at most BLOCK_MAX of them, and handed out sequentially
 * from the most recently allocated block.
 */
#define BLOCK_SIZE (1024 * 256)
#define BLOCK_MAX 1024
/* Blocks allocated so far. */
static char *buffer_ptr[BLOCK_MAX];
/* Number of entries of buffer_ptr[] that are in use. */
static int  buffer_ptr_num = 0;
/* Offset of the next free byte in the newest block.  It starts at
   BLOCK_SIZE so that the very first request allocates a block. */
static int  buffer_idx = BLOCK_SIZE;

/*
 * cha_malloc_char - hand out `size' bytes from the block pool.
 *
 * Requests are served sequentially from the newest block.  When the
 * request does not fit, a fresh BLOCK_SIZE block is obtained with
 * cha_malloc(); once BLOCK_MAX blocks exist, cha_exit() is called.
 * Returns a pointer to the start of the reserved bytes.  The memory is
 * never released individually.
 */
static char *cha_malloc_char(size)
    int size;
{
    char *chunk;

    /* Not enough room left in the current block: start a new one. */
    if (buffer_idx + size >= BLOCK_SIZE) {
	if (buffer_ptr_num == BLOCK_MAX)
	  cha_exit(1, "Can't allocate memory");
	buffer_ptr[buffer_ptr_num] = cha_malloc(BLOCK_SIZE);
	buffer_ptr_num++;
	buffer_idx = 0;
    }

    /* Reserve the bytes at the current offset, then advance the offset. */
    chunk = buffer_ptr[buffer_ptr_num - 1] + buffer_idx;
    buffer_idx += size;
    return chunk;
}

/*
 * ustrcmp - compare two NUL-terminated strings byte by byte, treating
 * every byte as unsigned.
 *
 * Returns zero when the strings are equal; otherwise the difference
 * between the first pair of bytes that differ (negative when s1 sorts
 * before s2, positive when it sorts after).
 */
static int ustrcmp(s1, s2)
    unsigned char *s1, *s2;
{
    /* Skip the common prefix; stop at the end of s1 or at a mismatch. */
    while (*s1 != '\0' && *s1 == *s2) {
	s1++;
	s2++;
    }
    return (int)*s1 - (int)*s2;
}

/*
 * midasi_compare - qsort() comparison callback for line_info records.
 *
 * Records are ordered by their key string (unsigned byte order, see
 * ustrcmp()); records with equal keys are ordered by `ptr', which keeps
 * lines with the same key in their original input order.
 *
 * The parameters are declared as `const void *' because that is the type
 * qsort() passes to its callback; they must point to line_info objects.
 * Returns a negative, zero or positive value in the usual way.
 */
static int midasi_compare(l1, l2)
    const void *l1, *l2;
{
    const line_info *a = (const line_info *)l1;
    const line_info *b = (const line_info *)l2;
    int rc;

    rc = ustrcmp((unsigned char *)a->midasi, (unsigned char *)b->midasi);
    if (rc != 0)
	return rc;

    /* Three-way compare instead of subtracting: the difference of two
       longs (file offsets) need not fit in an int, and truncating it
       could flip the sign. */
    return (a->ptr > b->ptr) - (a->ptr < b->ptr);
}

/*
 * sortdic - sort the lines of `infile' and write them to `outfile'.
 *
 * infile:  path of the dictionary to read.
 * outfile: path of the file to write, or NULL to write to stdout.
 *
 * Lines are sorted with midasi_compare() and a line that is identical to
 * the previously written line is dropped.
 *
 * Unless VGRAM is defined, the sort key is the text before the first TAB
 * of each line (a line without a TAB is an error) and every line is
 * written with its newline replaced by a NUL byte.  With VGRAM defined the
 * key starts at the beginning of the line and lines are written unchanged.
 *
 * Two strategies are used: if a buffer for the whole file can be
 * allocated, the file is read into it once; otherwise only the keys are
 * kept in memory and each line is re-read from `infile' when writing.
 *
 * Every line must end with a newline and fit into a LINEMAX buffer.
 * Progress messages go to stderr.  Errors are reported through cha_exit(),
 * which is assumed not to return.
 */
static void sortdic(infile, outfile)
    char *infile, *outfile;
{
    FILE *fpi, *fpo;
    struct stat st;
    line_info *line;
    int nline, l, failed;
    char buf[LINEMAX], *filebuf, *limit, *fb, *fbend, *s, *eol;
    size_t room, len, prelen;

    fpi = cha_fopen(infile, "r", 1);
    fpo = outfile ? cha_fopen(outfile, "w", 1) : stdout;

    /* One extra byte: fgets() stores a NUL after the last line read. */
    filebuf = NULL;
    fbend = NULL;
    if (fstat(fileno(fpi), &st) == 0 && st.st_size >= 0)
	filebuf = malloc((size_t)st.st_size + 1);

    if (filebuf != NULL) {
	fprintf(stderr, "reading... ");
	fb = filebuf;
	limit = filebuf + (size_t)st.st_size + 1;
	for (nline = 0; ; nline++) {
	    /* A line may use at most sizeof(buf) bytes, and never more
	       than what is left of the allocation. */
	    room = (size_t)(limit - fb);
	    if (room > sizeof(buf))
		room = sizeof(buf);
	    if (room < 2) {
		/* Buffer exhausted: the file must be exhausted as well. */
		if (getc(fpi) != EOF)
		    cha_exit(1, "%s: file is larger than its reported size.",
			     infile);
		break;
	    }
	    if (fgets(fb, (int)room, fpi) == NULL)
		break;
	    /* strchr, not memchr: bytes after the NUL are uninitialized. */
	    if ((eol = strchr(fb, '\n')) == NULL)
		cha_exit(1, "%s:%d: line too long or not terminated by a newline.",
			 infile, nline + 1);
	    fb = eol + 1;
	}
	/* End of the data read; always inside the allocation. */
	fbend = fb;
	*fbend = '\0';
    } else {
	/* cannot malloc */
	fprintf(stderr, "counting lines... ");
	for (nline = 0; fgets(buf, sizeof(buf), fpi) != NULL; nline++)
	    ;
    }

    fprintf(stderr, "(%d lines) ", nline);
    line = (line_info *)cha_malloc(sizeof(line_info) * nline);

    if (filebuf != NULL) {
	fb = filebuf;
	for (l = 0; l < nline; l++) {
	    line[l].midasi = fb;
	    line[l].ptr = l;
	    /* Never NULL: the read loop checked every line for '\n'. */
	    eol = (char *)memchr(fb, '\n', (size_t)(fbend - fb));
#ifndef VGRAM
	    /* Search for the TAB inside this line only; the lines are
	       stored back to back without separating NULs. */
	    if ((s = (char *)memchr(fb, '\t', (size_t)(eol - fb))) == NULL)
	      cha_exit(1, "%s:%d: does not contain TAB characters.", infile, l + 1);
	    *s = '\0';
#endif
	    fb = eol + 1;
	}
    } else {
	/* cannot malloc */
	fprintf(stderr, "reading... ");
	rewind(fpi);
	for (l = 0; l < nline; l++) {
	    line[l].ptr = ftell(fpi);
	    if (line[l].ptr < 0 || fgets(buf, sizeof(buf), fpi) == NULL)
		cha_exit(1, "%s:%d: read error.", infile, l + 1);
	    if (strchr(buf, '\n') == NULL)
		cha_exit(1, "%s:%d: line too long or not terminated by a newline.",
			 infile, l + 1);
#ifndef VGRAM
	    if ((s = strchr(buf, '\t')) == NULL)
	      cha_exit(1, "%s:%d: does not contain TAB characters.", infile, l + 1);
	    *s = '\0';
#endif
	    line[l].midasi = cha_malloc_char((int)(strlen(buf) + 1));
	    strcpy(line[l].midasi, buf);
	}
    }

    fprintf(stderr, "sorting... ");
    if (nline > 1)
	qsort(line, nline, sizeof(line_info),
	      (int (*)(const void *, const void *))midasi_compare);

    fprintf(stderr, "writing... ");

    /* prelen is the length of the last line written; 0 means "none yet".
       Comparing lengths first keeps memcmp() inside both lines. */
    prelen = 0;
    if (filebuf != NULL) {
	char *b, *pre = NULL;
	for (l = 0; l < nline; l++) {
	    b = line[l].midasi;
#ifndef VGRAM
	    /* Put back the TAB that was replaced by a NUL for sorting. */
	    b[strlen(b)] = '\t';
#endif
	    eol = (char *)memchr(b, '\n', (size_t)(fbend - b));
	    len = (size_t)(eol - b) + 1;
	    if (pre == NULL || len != prelen || memcmp(pre, b, len) != 0) {
#ifdef VGRAM
		fwrite(b, len, 1, fpo);
#else
		fwrite(b, len - 1, 1, fpo);
		fputc('\0', fpo);
#endif
		pre = b;
		prelen = len;
	    }
	}
    } else {
	char prebuf[LINEMAX];
	for (l = 0; l < nline; l++) {
	    if (fseek(fpi, line[l].ptr, SEEK_SET) != 0
		|| fgets(buf, sizeof(buf), fpi) == NULL
		|| (eol = strchr(buf, '\n')) == NULL)
		cha_exit(1, "%s: cannot re-read line at offset %ld.",
			 infile, line[l].ptr);
	    len = (size_t)(eol - buf) + 1;
	    if (len != prelen || memcmp(prebuf, buf, len) != 0) {
#ifdef VGRAM
		fwrite(buf, len, 1, fpo);
#else
		fwrite(buf, len - 1, 1, fpo);
		fputc('\0', fpo);
#endif
		memcpy(prebuf, buf, len);
		prelen = len;
	    }
	}
    }
    fclose(fpi);

    /* Do not report success if the output could not be written. */
    failed = ferror(fpo);
    if (fclose(fpo) != 0)
	failed = 1;
    if (failed)
	cha_exit(1, "%s: write error.", outfile ? outfile : "(stdout)");

    free(filebuf);

    fprintf(stderr, "done.\n");
}

/*
 * main - command-line entry point.
 *
 * usage: sortdic input-file [ output-file ]
 *
 * Sorts input-file into output-file (stdout when no output file is
 * given) and reports the elapsed wall-clock time on stderr.  Exits with
 * status 1 when no input file is given; returns 0 otherwise.
 */
int main(argc, argv)
    int argc;
    char *argv[];
{
    time_t t0, t1;

    if (argc < 2) {
	fprintf(stderr, "usage: sortdic input-file [ output-file ]\n");
	exit(1);
    }

    set_progpath(argv[0]);

    time(&t0);
    /* argv[argc] is a null pointer, so argv[2] is NULL when only the
       input file was given; sortdic() then writes to stdout. */
    sortdic(argv[1], argv[2]);
    time(&t1);
    fprintf(stderr, "processing time: %d sec\n", (int)(t1 - t0));

    return 0;
}
