/*
 *	Sherlock Indexer -- Final Lexicon Sorting
 *
 *	(c) 2001--2003 Martin Mares <mj@ucw.cz>
 *
 *	This module sorts all words to an order convenient for the search server.
 */

#include "sherlock/sherlock.h"
#include "lib/conf.h"
#include "lib/fastbuf.h"
#include "lib/unaligned.h"
#include "lib/mempool.h"
#include "lib/unicode.h"
#include "charset/unicat.h"
#include "indexer/indexer.h"
#include "indexer/lexicon.h"

#include <stdlib.h>
#include <fcntl.h>

/*
 * word_array_orig holds the lexicon entries in the order they were read from
 * the input file; word_array is a copy of the same pointers which gets sorted.
 */
static struct lex_entry **word_array, **word_array_orig;
static uns n_words, n_cplx;	/* Here words include complexes */
static uns *ctxt_counter;	/* For each context, the number of complexes read so far */

/*
 * Parameters for the lib/arraysort.h template included after word_lt();
 * main() calls the resulting word_sort() to sort word_array[].
 */
#define ASORT_PREFIX(x) word_##x
#define ASORT_KEY_TYPE struct lex_entry *
#define ASORT_LT(x,y) word_lt(x,y)
#define ASORT_ELT(i) word_array[i]

/*
 * Ordering predicate for word_sort(): returns 1 if x should precede y,
 * 0 otherwise.
 *
 * With CONFIG_CONTEXTS, complexes follow all other entries and are ordered
 * among themselves by their context and then by the 16-bit order number
 * stored just behind the lex_entry.
 *
 * All other entries are ordered by their number of UTF-8 characters first
 * (the word which runs out of characters earlier goes first), then by their
 * unaccented characters and finally by the characters themselves.
 */
static int
word_lt(struct lex_entry *x, struct lex_entry *y)
{
#ifdef CONFIG_CONTEXTS
  int x_cplx = (x->class == WC_COMPLEX);
  int y_cplx = (y->class == WC_COMPLEX);

  if (x_cplx != y_cplx)
    return y_cplx;		/* Exactly one of them is a complex and it goes last */
  if (x_cplx)
    {
      uns x_ctxt = GET_CONTEXT(&x->ctxt);
      uns y_ctxt = GET_CONTEXT(&y->ctxt);
      if (x_ctxt != y_ctxt)
	return x_ctxt < y_ctxt;
      /* Same context: decide by the order number kept just after the lex_entry */
      uns x_ord = GET_U16(x+1);
      uns y_ord = GET_U16(y+1);
      return x_ord < y_ord;
    }
#endif

  byte *xp = x->w;
  byte *yp = y->w;
  byte *xstop = xp + x->length;
  byte *ystop = yp + y->length;
  int unacc_lt = -1;		/* Verdict of the unaccented comparison, -1 = no difference seen yet */
  int raw_lt = -1;		/* The same for the comparison of raw characters */

  while (xp < xstop && yp < ystop)
    {
      uns xc, yc, xbase, ybase;

      /* Take one character from each word */
      GET_UTF8(xp, xc);
      GET_UTF8(yp, yc);
      xbase = Uunaccent(xc);
      ybase = Uunaccent(yc);
      /* Only the first difference of each kind counts */
      if (unacc_lt < 0 && xbase != ybase)
	unacc_lt = (xbase < ybase);
      if (raw_lt < 0 && xc != yc)
	raw_lt = (xc < yc);
    }

  /* Different number of characters: the shorter word wins regardless of contents */
  if (yp < ystop)
    return 1;
  if (xp < xstop)
    return 0;

  /* Same number of characters: unaccented difference first, raw difference second */
  if (unacc_lt >= 0)
    return unacc_lt;
  if (raw_lt >= 0)
    return raw_lt;

  /* Identical words are expected to be the very same entry */
  ASSERT(x == y);
  return 0;
}

#include "lib/arraysort.h"

#ifdef CONFIG_LANG
static u32 *xlat_table;  /* Original word index -> new word ID; shares space with word_array (set up in main) */

/*
 * Translate an old word ID to the ID the word has got after sorting.
 * The old ID divided by 8, minus one, is the index into xlat_table[];
 * the lowest three bits of the ID are carried over unchanged.
 */
static inline uns
id_renumber(uns id)
{
  uns slot = (id >> 3) - 1;

  /* IDs below 8 wrap around here and get caught by the range check */
  ASSERT(slot < n_words && xlat_table[slot]);
  return (id & 7) | xlat_table[slot];
}

/* One (stem, derived) pair as read from the ordered stems file by stems_renumber() */
struct stem_pair {
  u32 stem;		/* First word of the pair; primary sort key */
  u32 derived;		/* Second word of the pair; secondary sort key */
};

/* Pairs of the block currently being processed by stems_renumber() */
static struct stem_pair *stem_array;

/*
 * Parameters for the lib/arraysort.h template included after stem_lt();
 * stems_renumber() calls the resulting stem_sort() to sort stem_array[].
 */
#define ASORT_PREFIX(x) stem_##x
#define ASORT_KEY_TYPE struct stem_pair
#define ASORT_LT(x,y) stem_lt(x,y)
#define ASORT_ELT(i) stem_array[i]

/*
 * Ordering predicate for stem_sort(): pairs are ordered by the stem,
 * pairs with equal stems by the derived word.  Returns 1 if x precedes y.
 */
static int
stem_lt(struct stem_pair x, struct stem_pair y)
{
  if (x.stem != y.stem)
    return x.stem < y.stem;
  return x.derived < y.derived;
}

#include "lib/arraysort.h"

static void
stems_renumber(void)
{
  struct fastbuf *in = index_bopen(fn_stems_ordered, O_RDONLY);
  struct fastbuf *out = index_bopen(fn_stems, O_WRONLY|O_CREAT|O_TRUNC);
  uns i, j, stid, lama, x;
  while ((stid = bgetl(in)) != ~0U)
    {
      lama = bgetl(in);
      bputl(out, stid);
      bputl(out, lama);
      sh_off_t start_pos = btell(in);
      while ((x = bgetl(in)) != ~0U)
	bgetl(in);
      uns cnt = (btell(in) - start_pos) / 8;
      stem_array = xmalloc(sizeof(struct stem_pair) * cnt);
      bsetpos(in, start_pos);
      for (i=0; i<cnt; i++)
	{
	  stem_array[i].stem = bgetl(in);
	  if (stid != 0x80000000)
	    stem_array[i].stem = id_renumber(stem_array[i].stem);
	  stem_array[i].derived = bgetl(in);
	  if (stid != 0x80000001)
	    stem_array[i].derived = id_renumber(stem_array[i].derived);
	}
      bgetl(in);
      stem_sort(cnt);
      i = 0;
      while (i < cnt)
	{
	  j = i;
	  while (j < cnt && stem_array[j].stem == stem_array[i].stem)
	    j++;
	  bputl(out, 0x80000000 | stem_array[i].stem);
	  while (i < j)
	    bputl(out, stem_array[i++].derived);
	}
      bputl(out, ~0U);
      xfree(stem_array);
    }
  bclose(in);
  bclose(out);
}
#endif

int
main(int argc, char **argv)
{
  log_init(argv[0]);
  if (cf_getopt(argc, argv, CF_SHORT_OPTS, CF_NO_LONG_OPTS, NULL) >= 0 ||
      optind < argc)
  {
    fputs("This program supports only the following command-line arguments:\n" CF_USAGE, stderr);
    exit(1);
  }

  /* Read the input */
  struct mempool *pool = mp_new(65536);
  struct fastbuf *in = index_bopen(fn_lex_words, O_RDONLY);
  n_words = bgetl(in);
  log(L_INFO, "Reading lexicon: %d words", n_words);
  word_array_orig = xmalloc(n_words * sizeof(struct lex_entry *));
  ctxt_counter = xmalloc_zero(lex_context_slots * sizeof(ctxt_counter[0]));
  for (uns i=0; i<n_words; i++)
    {
      struct lex_entry le, *e;
      breadb(in, &le, sizeof(le));
      if (le.class == WC_COMPLEX)
	{
	  /* We put the context slot after the lex_entry as an u16
	   * (beware, it doesn't need to fit in a context_t!)
	   */
	  uns ctxt = GET_CONTEXT(&le.ctxt);
	  e = mp_alloc_fast_noalign(pool, sizeof(le) + 2);
	  PUT_U16(e+1, ctxt_counter[ctxt]);
	  ctxt_counter[ctxt]++;
	  n_cplx++;
	}
      else
	e = mp_alloc_fast_noalign(pool, sizeof(le) + le.length);
      memcpy(e, &le, sizeof(le));
      breadb(in, e->w, le.length);
      word_array_orig[i] = e;
    }
  bclose(in);

  /* Prepare output file */
  struct fastbuf *out = index_bopen(fn_lexicon, O_WRONLY|O_CREAT|O_TRUNC);
  bputl(out, n_words - n_cplx);
  bputl(out, n_cplx);

  /* Sort and dump words */
  log(L_INFO, "Sorting words");
  word_array = xmalloc(sizeof(struct lex_entry *) * n_words);
  memcpy(word_array, word_array_orig, sizeof(struct lex_entry *) * n_words);
  word_sort(n_words);
  for (uns i=0; i<n_words; i++)
    {
      bwrite(out, word_array[i], sizeof(struct lex_entry) + word_array[i]->length);
      PUT_U32(word_array[i]->ref_pos, 8*i+8);  /* Misuse ref_pos for new word ID */
    }
  bclose(out);

#ifdef CONFIG_LANG
  /* Renumber, sort and dump stem expansions */
  log(L_INFO, "Sorting stem expansions");
  xlat_table = (u32 *) word_array;
  for (uns j=0; j<n_words; j++)
    xlat_table[j] = GET_U32(word_array_orig[j]->ref_pos);
  xfree(word_array_orig);
  stems_renumber();
#endif

  return 0;
}
