/***********************************************************************
  IISA(Inverted Index with Suffix Array) Search Engine
                                    Copyright (C) 1998, Takuya NAKAYAMA
***********************************************************************/
#define VERSION "0.4"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include "sufary.h"
#include "ix_code.h"
#include "keys.h"
#include "strop.h"

/* Bookkeeping for one file that was opened read-only and mapped with mmap(). */
typedef struct {
    int fd;              /* descriptor returned by open() */
    caddr_t map;         /* base address returned by mmap() */
    unsigned long size;  /* file length in bytes divided by sizeof(long) */
} MMAPF;  /* state of an mmap()ed file */

/* One entry of the result array filled in by search_index(). */
typedef struct {
    unsigned long fid;  /* file id; main() turns it into a name with id2fname() */
    float score;        /* ranking score; score_cmp() orders results by it, highest first */
    int   s0;           /* per-file value copied from the decoded index entry (info.score[j]) */
} SearchResult;  /* one search hit */

MMAPF npt, nx;    /* mappings of "<index>.npt" and "<index>.nx"; read by id2fname() */
int*  ID_Check;   /* one counter per file id, updated by search_index() */
char  buf[1024];  /* NOTE(review): not referenced by any code visible in this file */

char* INDEX_NAME = "MKIND";  /* index base name; replaced by the -f option */
int   FILE_MAX_NUM;          /* number of file ids; main() sets it from get_file_max_num() */

/* Command-line settings; parse_args() assigns the defaults and the overrides. */
struct {
    char* OUTPUT_SEP;    /* -s: separator text printed before each result */
    int   NOSORT;        /* -nosort: non-zero skips sorting the results by score */
    char* not_key_str;   /* -not: negative keyword string */
    char* key_str;       /* search keyword: the last argument not starting with '-' */
    int   Ngram;         /* -ngram: N used for N-gram key splitting; 0 leaves it off */
    char* mozdic_name;   /* -moz: name handed to sa_openfiles() */
    SUFARY* mozdic;      /* handle opened from mozdic_name, or NULL */
    int   range[2];      /* -r start:end of the results to print; end < 0 means up to the last hit */
} Option;   /* option settings */

/***********************************************************************
  Comparison function for sorting search results by score
***********************************************************************/
/*
 * qsort() comparator that orders SearchResult entries by descending score.
 * Returns -1 when a ranks before b, 0 when the scores are equal and 1
 * in every other case.
 */
static int score_cmp (SearchResult* a, SearchResult* b)
{
    if (a->score > b->score) {
        return -1;              /* higher score sorts first */
    }
    if (a->score == b->score) {
        return 0;
    }
    return 1;
}
/***********************************************************************
  Convert a file id to a file name
***********************************************************************/
char* id2fname (unsigned long id)
{
    long *pt;

    if (id > npt.size) return NULL;
    pt = (long *)npt.map +id;
    return (char*)nx.map + *pt;
}
/***********************************************************************
  Open a file and mmap() it
***********************************************************************/
/*
 * Open fname read-only and map the whole file into memory.
 *
 * On success returns 1 with ret->fd holding the open descriptor, ret->map
 * the start of the mapping and ret->size the file length in units of
 * sizeof(long).
 * On failure returns 0; a descriptor that was already opened is closed
 * again and ret->fd is left negative.
 */
int open_mmap_file (char* fname, MMAPF *ret)
{
    struct stat st;
    void *addr;

    ret->fd = open(fname, O_RDONLY);
    if (ret->fd < 0) {
        return 0;
    }

    /* fstat() already reports the length, so there is no need to reopen
       the file through stdio just to measure it */
    if (fstat(ret->fd, &st) < 0) {
        goto fail;
    }

    addr = mmap(NULL, (size_t)st.st_size, PROT_READ, MAP_SHARED, ret->fd, 0);
    if (addr == MAP_FAILED) {
        goto fail;
    }

    ret->map = addr;
    ret->size = (unsigned long)st.st_size / sizeof(long);
    return 1;

fail:
    /* do not leak the descriptor when the file cannot be mapped */
    close(ret->fd);
    ret->fd = -1;
    return 0;
}
/***********************************************************************
  Search routine
***********************************************************************/
/*
 * Look up one key in the index and update the per-file counters ID_Check[].
 *
 *   ret    - result array to fill, or NULL to update the counters only
 *   sa     - opened index handle
 *   key    - key string; an empty key returns 0 without touching anything
 *   offset - counter value a file must currently have to be affected
 *   nots   - 0 for a positive key (matching files are incremented),
 *            non-zero for a negative key (matching files are decremented);
 *            a negative value additionally resets every counter that is
 *            still >= offset back to 0 afterwards
 *
 * Returns the number of hits.  For a positive key that is the number of
 * files whose counter was incremented.  For a negative key with ret != NULL
 * it is the number of files whose counter is still >= offset; with
 * ret == NULL a negative key always returns 0.
 *
 * NOTE(review): when ret != NULL, nots == 0 and offset != 0, the final loop
 * reads ret[i].score before this function has written it, so the caller
 * must pass an array whose scores are already initialised.
 * NOTE(review): id comes straight from the decoded index entry and is not
 * range-checked against FILE_MAX_NUM before it indexes ID_Check[].
 */
int search_index (SearchResult* ret, SUFARY* sa, char* key,
		  int offset, int nots)
{
    int i, j, id, hit = 0;
    unsigned char *xpt;
    long       pos;
    IxCode     code;
    IxCodeInfo info;
    double idf;

    if (*key == '\0') return 0;

    sa_reset(sa);
    /* presumably narrows sa to the array entries that match key -- the
       sa_* helpers are declared elsewhere, confirm against sufary.h */
    if (sa_sel(sa, key) == CONT) {
	for (i = sa_left(sa); i <= sa_right(sa) ; i++){
	    pos = sa_aryidx2txtidx(sa, i);
	    xpt = sa->txtmap + pos;

	    /* decode the entry found at that text position */
	    code.buf = xpt;
	    info = ix_decode(code);

	    for (j = 0; j < info.num; j++) {
		id = info.files[j];
		/* only files whose counter equals offset are affected */
		if (ID_Check[id] == offset) {
		    if (nots != 0) {
			ID_Check[id]--;
		    } else {
			ID_Check[id]++;
			if (ret != NULL) {
			    ret[hit].fid = id;
			    ret[hit].s0 = info.score[j];
			}
			hit++;
		    }
		}
	    }
	    free(info.files);
	}
    }
    /* negative nots: put every counter that is still >= offset back to 0 */
    if (nots < 0) {
	for (i = 0; i < FILE_MAX_NUM; i++) {
	    if (ID_Check[i] >= offset) ID_Check[i] = 0;
	}
    }
    if (ret != NULL) {
	if (nots != 0) {
	    /* negative key: the result is every file whose counter is
	       still >= offset, each with score 0 */
	    hit = 0;
	    for (i = 0; i < FILE_MAX_NUM; i++) {
		if (ID_Check[i] >= offset) {
		    ret[hit].fid = i;
		    ret[hit].score = 0;
		    hit++;
		}
	    }
	} else {
	    /* idf = log2((FILE_MAX_NUM+1) / (hit+1)) */
	    idf = log((double)(FILE_MAX_NUM+1)/(hit+1))/log(2);
	    for (i = 0; i < hit; i++) {
		/* with offset != 0 the value already stored in score is added */
		ret[i].score =
		    idf * ret[i].s0 + ((offset != 0)? ret[i].score:0);
	    }
	}
    }
/*    fprintf (stderr, "[%s] %d hits\n", key, hit);*/
    return hit;
}
/***********************************************************************
  Get the number of indexed files
***********************************************************************/
/*
 * Read the file count from "<fname>.num".
 *
 * The leading lines that start with '#' are scanned for "#LAST_ID=<n>";
 * the result is n + 1.  Scanning stops at the first line that does not
 * start with '#'.
 *
 * Returns 0 when no "#LAST_ID=" line is found.  Prints a message and
 * exits with status 1 when the file cannot be opened.
 */
int get_file_max_num (char* fname) {
    char *num_path;
    FILE* fp;
    char line[1000];
    int ret = 0;    /* defined result even when the header has no LAST_ID */

    num_path = str_append(fname,".num");
    if ((fp = fopen(num_path, "r")) == NULL) {
        printf ("cannot open %s\n", num_path);
        exit(1);
    }
    free(num_path);

    while (fgets(line, sizeof line, fp) != NULL) {
        if (*line != '#') break;
        if (strncmp(line, "#LAST_ID=", 9) == 0) {
            ret = atoi(line+9) + 1;
            break;
        }
    }
    fclose(fp);     /* the stream was never closed before */
    return ret;
}
/***********************************************************************
  Help message
***********************************************************************/
/*
 * Print the usage text, showing the current value of every option in
 * parentheses, then exit with status 1.
 */
void print_help(void)
{
    const char *moz_name = Option.mozdic_name;

    /* show an empty string when no dictionary name has been given */
    if (moz_name == NULL) {
        moz_name = "";
    }

    printf("iisa version %s\n"
           "  usage: iisa [options] keyword\n"
           "  options:\n", VERSION);
    printf ("    -f ..     : set index name (%s)\n", INDEX_NAME);
    printf ("    -s ..     : output separator (%s)\n", Option.OUTPUT_SEP);
    printf ("    -r ..     : range of the result (%d:%d)\n",
            Option.range[0], Option.range[1]);
    printf ("    -nosort   : output without sorting (%d)\n", Option.NOSORT);
    printf ("    -not ..   : negative key word\n");
    printf ("    -ngram .. : N-gram mode & specify N=(%d)\n", Option.Ngram);
    printf ("    -moz ..   : MOZ mode & specify mozdic=(%s)\n", moz_name);

    exit(1);
}
/***********************************************************************
  Command-line argument processing
***********************************************************************/
/* Return the argument following argv[*i] and advance *i; when there is no
   such argument, show the usage text and exit. */
static char* option_value(int argc, char* argv[], int *i)
{
    if (*i + 1 >= argc) {
        print_help();
        exit(1);
    }
    *i += 1;
    return argv[*i];
}

/*
 * Fill in Option (and INDEX_NAME) from the command line.
 *
 * Every argument that does not start with '-' becomes the search keyword;
 * the last one wins.  -f, -s and -r are recognised by their first letter
 * only; -moz, -nosort, -not and -ngram must match exactly.  An unknown
 * option, or an option whose value is missing, prints the usage text and
 * exits.
 *
 * -r accepts "start:end", ":end", "start:" or "start"; anything else
 * resets the range to its default of 0:-1.  A negative start becomes 0.
 */
void parse_args(int argc, char* argv[])
{
    int i;
    const char *opt;
    const char *range_arg;

    /* default values */
    Option.OUTPUT_SEP = "----------------------------";
    Option.NOSORT = 0;
    Option.key_str = NULL;
    Option.not_key_str = NULL;
    Option.Ngram = 0;
    Option.mozdic_name = NULL;
    Option.mozdic = NULL;
    Option.range[0] = 0; Option.range[1] = -1;

    /* parse arguments */
    for (i = 1; i < argc; i++) {
        opt = argv[i];
        if (opt[0] != '-') {
            Option.key_str = argv[i];
            continue;
        }
        switch (opt[1]) {
          case 'f':
            INDEX_NAME = option_value(argc, argv, &i);
            break;
          case 's':
            Option.OUTPUT_SEP = option_value(argc, argv, &i);
            break;
          case 'm':
            /* an unknown -m... option used to fall through into the -n
               and -r handling; reject it instead */
            if (strcmp("-moz", opt) != 0) {
                print_help();
                break;
            }
            Option.mozdic_name = option_value(argc, argv, &i);
            if ((Option.mozdic = sa_openfiles(Option.mozdic_name, NULL))
                == NULL) {
                fprintf(stderr, "cannot open moz dic files\n");
                exit(1);
            }
            break;
          case 'n':
            if (strcmp("-nosort", opt) == 0) {
                Option.NOSORT = 1;
            } else if (strcmp("-not", opt) == 0) {
                Option.not_key_str = option_value(argc, argv, &i);
            } else if (strcmp("-ngram", opt) == 0) {
                Option.Ngram = atoi(option_value(argc, argv, &i));
            } else {
                /* an unknown -n... option used to be treated as -r */
                print_help();
            }
            break;
          case 'r':
            range_arg = option_value(argc, argv, &i);
            if (sscanf(range_arg, "%d:%d",
                       &(Option.range[0]), &(Option.range[1])) != 2) {
                if (sscanf(range_arg, ":%d", &(Option.range[1])) == 1) {
                    Option.range[0] = 0;            /* ":end" */
                } else if (sscanf(range_arg, "%d:",
                                  &(Option.range[0])) == 1) {
                    Option.range[1] = -1;           /* "start:" or "start" */
                } else {
                    Option.range[0] = 0; Option.range[1] = -1;
                }
            }
            if (Option.range[0] < 0) Option.range[0] = 0;
            break;
          default:
            print_help();
        }
    }

}
/***********************************************************************
  Main
***********************************************************************/
int main (int argc, char* argv[])
{
    SUFARY* sa;
    KeyBuffer keys;
    KeyBuffer nkeys;
    SearchResult* result;
    FILE* ifp;
    int   i,offset,id,len,hit_num, start, end;
    char  *tmp1, *tmp2;

    
    /* parse arguments */
    parse_args(argc, argv);
    if (Option.key_str == NULL) print_help();

    /* ѿ */
    FILE_MAX_NUM = get_file_max_num(INDEX_NAME);
    ID_Check = (int*)malloc(sizeof(int)*FILE_MAX_NUM);
    memset(ID_Check, 0, sizeof(int)*FILE_MAX_NUM);
    init_key_buf(&keys); init_key_buf(&nkeys);
    result = (SearchResult*)malloc(sizeof(SearchResult)*FILE_MAX_NUM);
    /*  parse */
    if (Option.Ngram > 0)
	parse_by_ngram(&keys, Option.key_str, Option.Ngram);
    else if (Option.mozdic != NULL)
	parse_by_mozdic(&keys, Option.key_str, Option.mozdic);
    else
	parse_by_chartype(&keys, Option.key_str);
    if (Option.not_key_str == NULL) {
	Option.not_key_str = "\0";
    } else {
	parse_by_chartype(&nkeys, Option.not_key_str);
    }

    /* ƼեΥץ */
    tmp1 = str_append(INDEX_NAME, ".nx");
    if (open_mmap_file(tmp1, &nx) == 0) {
	printf ("cannot open %s\n", tmp1);
	exit(1);
    }
    tmp2 = str_append(INDEX_NAME, ".npt");
    if (open_mmap_file(tmp2, &npt) == 0) {
	printf ("cannot open %s\n", tmp1);
	exit(1);
    }
    free(tmp1); free(tmp2);
    tmp1 = str_append(INDEX_NAME,".ix");
    tmp2 = str_append(INDEX_NAME,".ix.ary");
    if ((sa = sa_openfiles(tmp1, tmp2)) == NULL) {
	printf ("cannot open %s or %s\n", tmp1, tmp2);
	exit(1);
    }
    free(tmp1); free(tmp2);
    /*  */
    offset = 0;
    for (i = 0; i < nkeys.num; i++) {
	search_index(NULL, sa, nkeys.str[i], offset,
		     (i == (nkeys.num-1))? -1:1);
	offset--;
    }
    offset = 0;
    for (i = 0; i < keys.num-1; i++) {
	search_index(NULL, sa, keys.str[i], offset, 0);
	offset++;
    }
/*    printf ("key = (%d,%d) offset = %d\n", keys.num, nkeys.num, offset);*/
    if (keys.num > 0)
	hit_num = search_index(result, sa, keys.str[keys.num-1], offset, 0);
    else if (nkeys.num > 0)
	hit_num = search_index(result, sa, nkeys.str[nkeys.num-1], offset, 1);
    else
	hit_num = 0;
    
    if (!Option.NOSORT) {
	qsort((char*)result, hit_num, sizeof(SearchResult),
	      (int (*)(const void *,const void *))score_cmp);
    }

    printf(" [%s/%s] = ", Option.key_str, Option.not_key_str);
    for (i = 0; i < keys.num; i++)  printf ("%s ", keys.str[i]);
    for (i = 0; i < nkeys.num; i++) printf ("!%s ", nkeys.str[i]);
    printf("(found %d from %d files)\n", hit_num, FILE_MAX_NUM);

    start = Option.range[0];
    end = (Option.range[1] >= 0 & Option.range[1] <= hit_num)?
	Option.range[1] : (hit_num-1);
    printf("[%d-%d]\n", start, end);
    
    
    for (i = start; i <= end; i++) {
	printf ("%s %f\n", Option.OUTPUT_SEP, result[i].score);
	puts(id2fname(result[i].fid));
    }
    printf ("%s -1\n", Option.OUTPUT_SEP);
    /* ƼեΥ */
    sa_closefiles(sa);

    exit(0);
}
