/*
 *	Sherlock Indexer -- Sorting of Word Index
 *
 *	(c) 2001--2003 Martin Mares <mj@ucw.cz>
 *	(c) 2005 Robert Spalek <robert@ucw.cz>
 */

#include "sherlock/sherlock.h"
#include "lib/getopt.h"
#include "lib/fastbuf.h"
#include "lib/unaligned.h"
#include "lib/mempool.h"
#include "lib/unicode.h"
#include "lib/ff-utf8.h"
#include "indexer/indexer.h"
#include "indexer/lexicon.h"
#include "indexer/params.h"

#include <stdlib.h>

/*
 * Configuration of the generic sorter that is instantiated by including
 * lib/sorter.h further below.
 *
 * NOTE(review): the precise meaning of each switch is defined by
 * lib/sorter.h, which is not visible from this file.  The notes below are
 * inferred from how this file uses the generated code -- confirm them
 * against that header.
 *
 *   SORT_KEY		key type; the ws_* callbacks below take u32 keys
 *   SORT_PREFIX	name prefix; the callbacks and ws_sort() are all ws_*
 *   SORT_UNIFY		presumably asks for records with equal keys to be
 *			merged (see ws_merge_data())
 *   SORT_DELETE_INPUT	presumably decides whether the input file is removed;
 *			sort_delete_src is defined elsewhere
 *   SORT_INPUT_FILE	input is passed by name (main() hands ws_sort() the
 *			result of index_name())
 *   SORT_OUTPUT_FB	output is a fastbuf (main() receives a
 *			struct fastbuf * from ws_sort())
 */
#define SORT_KEY u32
#define SORT_PREFIX(x) ws_##x
#define SORT_UNIFY
#define SORT_DELETE_INPUT sort_delete_src
#define SORT_INPUT_FILE
#define SORT_OUTPUT_FB

/*
 * Sorter callback: three-way comparison of two keys.
 *
 * The result for unequal keys is produced by COMPARE(), a macro defined
 * outside this file (presumably it returns a negative or positive value
 * straight from this function -- confirm against its definition).  When
 * the keys are equal, control falls through and 0 is returned.
 */
static inline int
ws_compare(u32 *xx, u32 *yy)
{
  u32 left = *xx;
  u32 right = *yy;

  COMPARE(left, right);
  return 0;
}

/*
 * Sorter callback: read the next 32-bit key from F and store it in *X.
 *
 * Returns 1 when a real key was read and 0 when the value read is ~0U,
 * which serves as the end-of-input marker.  *X is written in both cases.
 */
static inline int
ws_fetch_key(struct fastbuf *f, u32 *x)
{
  u32 key = bgetl(f);

  *x = key;
  if (key == ~0U)
    return 0;
  return 1;
}

/*
 * Sorter callback: copy one record whose key *X has already been fetched.
 *
 * The rest of the record in SRC starts with a 32-bit length.  The key and
 * that length are written to DEST, and the length is then passed to
 * bbcopy() to transfer the record data from SRC to DEST.
 */
static inline void
ws_copy_data(struct fastbuf *src, struct fastbuf *dest, u32 *x)
{
  uns payload_size = bgetl(src);
  u32 key = *x;

  bputl(dest, key);
  bputl(dest, payload_size);
  bbcopy(src, dest, payload_size);
}

#include "indexer/refmerger.h"

/*
 * Sorter callback (presumably invoked for two records that share a key,
 * since SORT_UNIFY is set -- confirm against lib/sorter.h).
 *
 * Writes the key *X to DEST once and delegates the combination of the data
 * from SRC1 and SRC2 to refchain_merge().  The second key Y is not used.
 */
static void
ws_merge_data(struct fastbuf *src1, struct fastbuf *src2, struct fastbuf *dest, u32 *x, u32 *y UNUSED)
{
  u32 shared_key = *x;

  bputl(dest, shared_key);
  refchain_merge(src1, src2, dest);
}

#include "lib/sorter.h"

/*
 * Split the sorted word index into the final lexicon and the reference file.
 *
 * Walks the ordered lexicon (fn_lex_ordered) word by word, in step with
 * SORTED, a stream of reference chains keyed by word ID.  For each word a
 * record is written to fn_lex_words:
 *
 *   - current offset in the reference file,
 *   - size of the word's reference chain (including its 4-byte terminator)
 *     rounded up to 4096-byte units, or 0 if the word has no chain,
 *   - word class (low 3 bits of the ID read from the ordered lexicon),
 *   - word frequency (only with CONFIG_SPELL),
 *   - context, word length and the word itself.
 *
 * If SORTED holds a chain for the word, the chain is appended to
 * fn_references and terminated by a zero 32-bit value.
 *
 * SORTED is only read here; closing it is left to the caller.
 * Dies on a reference chain of 256MB or more and on a word length that
 * does not fit the local buffer.
 */
static void
split(struct fastbuf *sorted)
{
  struct fastbuf *lex_tmp, *lex, *refs;
  u32 wid_lex;
  uns wid_ref, wlen, wcount;
  enum word_class wclass;
  byte word[MAX_WORD_LEN+1];

  lex_tmp = index_bopen(fn_lex_ordered, O_RDONLY);
  refs = index_bopen(fn_references, O_WRONLY | O_CREAT | O_APPEND);
  lex = index_bopen(fn_lex_words, O_WRONLY | O_CREAT | O_TRUNC);
  wid_ref = bgetl(sorted);		/* ID of the first word having references */
  wid_lex = 1;
  wcount = bgetl(lex_tmp);
  bputl(lex, wcount);
  while (wid_lex <= wcount)
    {
      u32 in_id = bgetl(lex_tmp);
      ASSERT(in_id/8 == wid_lex);	/* Words must arrive in ID order */
      uns wfreq = bgetl(lex_tmp);
      wclass = in_id & 7;
      uns ctxt = bget_context(lex_tmp);
      wlen = bgetc(lex_tmp);
      /*
       * Guard the stack buffer: this also catches an EOF from bgetc(),
       * which turns into a huge value once stored in the unsigned wlen.
       */
      if (wlen > MAX_WORD_LEN)
	die("Word #%u in the ordered lexicon has invalid length %u", wid_lex, wlen);
      breadb(lex_tmp, word, wlen);
      bputo(lex, btell(refs));
      if (wid_lex >= wid_ref)
	{
	  uns rsize;
	  rsize = bgetl(sorted);
	  if (rsize >= 0x10000000)
	    {
	      word[wlen] = 0;
	      die("Reference chain #%u for word %s too long, maximum is 256MB. Ask a wizard to enlarge me.", wid_ref, word);
	    }
	  /* Chain size plus the 4-byte terminator, rounded up to 4KB units */
	  bputw(lex, (rsize + 4 + 0xfff) >> 12U);
	  /* Each sub-chain costs its 4-byte length header plus what bbcopy_chain() reports */
	  while (rsize)
	    rsize -= 4 + bbcopy_chain(sorted, refs, bgetl(sorted));
	  bputl(refs, 0);
	  wid_ref = bgetl(sorted);
	}
      else
	bputw(lex, 0);
      bputc(lex, wclass);
#ifdef CONFIG_SPELL
      bputc(lex, wfreq);
#endif
      bput_context(lex, ctxt);
      bputc(lex, wlen);
      bwrite(lex, word, wlen);
      wid_lex++;
    }
  /* Every reference chain must have been consumed: only the end marker may remain */
  ASSERT(wid_ref == 0xffffffff);
  bclose(lex_tmp);
  bclose(refs);
  bclose(lex);
}

/*
 * Entry point: sort the word index, then split the sorted stream into the
 * lexicon and reference files.
 *
 * No positional arguments are accepted.  If cf_getopt() returns a
 * non-negative value or any argument is left over, the usage text is
 * printed to stderr and the program exits with status 1.
 */
int
main(int argc, char **argv)
{
  log_init(argv[0]);

  int opt = cf_getopt(argc, argv, CF_SHORT_OPTS, CF_NO_LONG_OPTS, NULL);
  if (opt >= 0 || optind < argc)
    {
      fputs("This program supports only the following command-line arguments:\n" CF_USAGE, stderr);
      exit(1);
    }

  log(L_INFO, "Sorting word index");
  struct fastbuf *sorted_index = ws_sort(index_name(fn_word_index));

  log(L_INFO, "Splitting word index");
  split(sorted_index);
  bclose(sorted_index);

  return 0;
}
