/*
 *	Sherlock Indexer -- Sorting of String Index
 *
 *	(c) 2001 Martin Mares <mj@ucw.cz>
 *	(c) 2005 Robert Spalek <robert@ucw.cz>
 */

#include "sherlock/sherlock.h"
#include "lib/getopt.h"
#include "lib/fastbuf.h"
#include "lib/unicode.h"
#include "lib/ff-utf8.h"
#include "indexer/indexer.h"
#include "indexer/params.h"

#include <stdlib.h>

/*
 * Sort key of the string index: used as SORT_KEY for the sorter below.
 * Records are ordered by a byte-wise comparison of this structure's
 * fingerprint (see ss_compare()).
 */
struct ssk {
  struct fingerprint fp;	/* Key proper; read from and written to the streams verbatim */
};

/*
 * Configuration of the sorter template pulled in by "lib/sorter.h" further
 * down.  SORT_KEY selects the key type and SORT_PREFIX gives all generated
 * and user-supplied routines the ss_ prefix (main() calls ss_sort()).
 *
 * The remaining switches are interpreted by lib/sorter.h, which is not
 * visible in this file.  Judging by their names and by how ss_sort() is
 * used, they presumably request merging of records with equal keys
 * (SORT_UNIFY, served by ss_merge_data()), deletion of the input governed
 * by sort_delete_src, a file name as input and a fastbuf as output --
 * NOTE(review): confirm against lib/sorter.h.
 */
#define SORT_KEY struct ssk
#define SORT_PREFIX(x) ss_##x
#define SORT_UNIFY
#define SORT_DELETE_INPUT sort_delete_src
#define SORT_INPUT_FILE
#define SORT_OUTPUT_FB

static inline int
ss_compare(struct ssk *x, struct ssk *y)
{
  return memcmp(&x->fp, &y->fp, sizeof(struct fingerprint));
}

/*
 * Read the next key (exactly one fingerprint) from f into x.
 * Returns whatever breadb() returns; callers in this file treat zero
 * as "no more records" (see the loop in split()).
 */
static inline int
ss_fetch_key(struct fastbuf *f, struct ssk *x)
{
  struct fingerprint *dst = &x->fp;

  return breadb(f, dst, sizeof(*dst));
}

/*
 * Copy one record to dest.  The key is taken from x rather than read
 * from src; src supplies a length word followed by that many bytes of
 * payload.  The output is: fingerprint, length word, payload.
 */
static inline void
ss_copy_data(struct fastbuf *src, struct fastbuf *dest, struct ssk *x)
{
  uns payload_len;

  payload_len = bgetl(src);
  bwrite(dest, &x->fp, sizeof(x->fp));
  bputl(dest, payload_len);
  bbcopy(src, dest, payload_len);
}

#include "indexer/refmerger.h"

/*
 * Produce a single output record from two input records: the fingerprint
 * of x followed by whatever refchain_merge() emits for src1 and src2.
 * The second key is not consulted.  Presumably called by the sorter only
 * for records whose keys compare equal -- confirm against lib/sorter.h.
 */
static void
ss_merge_data(struct fastbuf *src1, struct fastbuf *src2, struct fastbuf *dest, struct ssk *x, struct ssk *y UNUSED)
{
  struct fingerprint *merged_fp = &x->fp;

  bwrite(dest, merged_fp, sizeof(*merged_fp));
  refchain_merge(src1, src2, dest);
}

#include "lib/sorter.h"

/*
 * Split the sorted string index into the string map and the references file.
 *
 * For every record read from `sorted`:
 *   - the fingerprint and the current write position of the references
 *     file are appended to the string map (truncated on open);
 *   - the record's reference chains are appended to the references file
 *     (opened for appending), followed by a zero word written by bputl().
 *
 * After the last record a sentinel map entry is written: an all-ones
 * fingerprint paired with the final position of the references file.
 *
 * `sorted` is only read here; closing it is left to the caller.
 *
 * Returns the number of records processed (the sentinel is not counted).
 * Dies if that count wraps around the range of `uns`.
 */
static uns
split(struct fastbuf *sorted)
{
  struct fastbuf *smap, *refs;
  struct ssk key;
  uns entries = 0;

  smap = index_bopen(fn_string_map, O_WRONLY | O_CREAT | O_TRUNC);
  refs = index_bopen(fn_references, O_WRONLY | O_CREAT | O_APPEND);

  /* Position at the end so that btell(refs) reports real file offsets. */
  bseek(refs, 0, SEEK_END);
  while (ss_fetch_key(sorted, &key))
    {
      uns rsize;
      bwrite(smap, &key.fp, sizeof(key.fp));
      bputo(smap, btell(refs));
      /*
       * rsize is the size of the record's payload.  Each chain in it is
       * preceded by a length word, which is accounted for by the constant 4
       * in addition to what bbcopy_chain() reports.
       */
      rsize = bgetl(sorted);
      while (rsize)
	rsize -= 4 + bbcopy_chain(sorted, refs, bgetl(sorted));
      bputl(refs, 0);
      entries++;
      /* Unsigned wrap-around to zero means the counter has overflowed. */
      if (unlikely(!entries))
	die("Too many strings indexed. Try decreasing Chewer.StringMax as a work-around.");
    }
  memset(&key.fp, 255, sizeof(key.fp));
  bwrite(smap, &key.fp, sizeof(key.fp));
  bputo(smap, btell(refs));

  bclose(smap);
  bclose(refs);
  /* entries is unsigned and may legitimately exceed INT_MAX, hence %u. */
  log(L_INFO, "Indexed %u strings", entries);
  return entries;
}

/*
 * Build the hash file for the string map.
 *
 * The bucket of a fingerprint is given by the top `shift` bits of
 * fp_hash(), where `shift` is the smallest value >= 1 for which
 * (cnt >> shift) does not exceed string_avg_bucket.  For each of the
 * 1 << shift buckets the index of the first map record belonging to it is
 * written with bputl(), followed by one extra closing entry.
 *
 * The loop only ever advances the bucket number, so it relies on the map
 * records arriving in non-decreasing hash order -- presumably guaranteed
 * by the preceding sort; verify against fp_hash().
 */
static void
mk_hash(uns cnt)
{
  struct fastbuf *map_in, *hash_out;
  struct fingerprint fp;
  sh_off_t pos;
  uns shift;
  uns cur_fill = 0, max_fill = 0;
  int buck = -1, nbuck;
  u32 hh;

  for (shift = 1; (cnt >> shift) > string_avg_bucket; shift++)
    ;
  nbuck = 1 << shift;

  map_in = index_bopen(fn_string_map, O_RDONLY);
  hash_out = index_bopen(fn_string_hash, O_WRONLY | O_CREAT | O_TRUNC);

  while (1)
    {
      /* Ordinal of the record about to be read; after EOF, the record count. */
      pos = btell(map_in) / (sizeof(struct fingerprint) + BYTES_PER_O);
      if (breadb(map_in, &fp, sizeof(fp)) == 0)
	break;
      bskip(map_in, BYTES_PER_O);
      hh = fp_hash(&fp) >> (32 - shift);
      /* Open every bucket up to and including the one this record falls into. */
      for (; buck < (int) hh; buck++)
	{
	  if (cur_fill > max_fill)
	    max_fill = cur_fill;
	  cur_fill = 0;
	  bputl(hash_out, pos);
	}
      cur_fill++;
    }
  if (cur_fill > max_fill)
    max_fill = cur_fill;

  /* buck started at -1, so this emits nbuck + 1 entries in total; the last one is the end marker. */
  for (; buck < nbuck; buck++)
    bputl(hash_out, pos);

  bclose(map_in);
  bclose(hash_out);
  log(L_INFO, "Hashed string references to %d buckets, %d entries/bucket max", nbuck, max_fill);
}

/*
 * Entry point.  Accepts only the configuration options handled by
 * cf_getopt(); any other option or positional argument prints the usage
 * text and exits with status 1.  Then sorts the string index, splits the
 * result into the string map and references, and builds the hash table.
 */
int
main(int argc, char **argv)
{
  struct fastbuf *sorted_fb;
  uns nstrings;
  int opt_rc;

  log_init(argv[0]);

  opt_rc = cf_getopt(argc, argv, CF_SHORT_OPTS, CF_NO_LONG_OPTS, NULL);
  if (opt_rc >= 0 || optind < argc)
    {
      fputs("This program supports only the following command-line arguments:\n" CF_USAGE, stderr);
      exit(1);
    }

  log(L_INFO, "Sorting string index");
  sorted_fb = ss_sort(index_name(fn_string_index));

  log(L_INFO, "Splitting string index");
  nstrings = split(sorted_fb);
  bclose(sorted_fb);

  mk_hash(nstrings);
  return 0;
}
