/*
 *	Sherlock Indexer -- Processing of Reference Texts
 *
 *	(c) 2002--2006 Martin Mares <mj@ucw.cz>
 *	(c) 2004 Robert Spalek <robert@ucw.cz>
 */

#undef LOCAL_DEBUG

#include "sherlock/sherlock.h"
#include "lib/getopt.h"
#include "lib/fastbuf.h"
#include "lib/heap.h"
#include "lib/mempool.h"
#include "lib/unicode.h"
#include "lib/bbuf.h"
#include "lib/math.h"
#include "lib/url.h"
#include "sherlock/object.h"
#include "charset/unicat.h"
#include "indexer/indexer.h"
#include "indexer/merges.h"

#include <stdlib.h>
#include <fcntl.h>
#include <alloca.h>

/*
 * Hash a zero-terminated UTF-8 reference text.
 *
 * Only characters accepted by Ualnum() contribute, each after passing
 * through Utoupper(). Every run of such characters is preceded by a single
 * ' ' in the hashed stream, so all other characters act merely as word
 * separators, no matter how many of them there are.
 */
static uns
rt_hf(byte *x)
{
  uns hash = 0;
  uns in_word = 0;		/* non-zero while inside a run of alphanumeric characters */
  uns c;

  for (;;)
    {
      GET_UTF8(x, c);
      if (!c)
        return hash;
      if (!Ualnum(c))
        {
          in_word = 0;
          continue;
        }
      if (!in_word)
        {
          /* First character of a new word: hash the separator first */
          hash = 67*hash + ' ';
          in_word = 1;
        }
      hash = 67*hash + Utoupper(c);
    }
}

/*
 * One record per reference text, as produced by analyse_reftexts() and
 * sorted by rt_fp_sort().  The structure is written to the temporary file
 * as raw bytes, so its layout must not change.
 */
struct rt_fp {
  /* fp.cardid = card ID read from the reftext file, fp.fp = fingerprint read right after it */
  struct card_print fp;
  /* rt_hf() of the reference text */
  u32 hash;
  /* Position of the reftext among consecutive records with the same card ID (starting at 0) */
  u16 index;
} PACKED;

/*
 * Handed to the sorter as SORT_UP_TO.  main() sets it from
 * resolve_optimize_run_length() just before calling rt_fp_sort().
 */
static sh_off_t fpsort_bs;

/*
 * Parameters for the inclusion of lib/sorter.h below.  With this prefix the
 * comparison callback is rt_fp_compare() and the entry point used by main()
 * is rt_fp_sort().
 * NOTE(review): the precise meaning of the individual SORT_xxx switches is
 * defined by lib/sorter.h -- check it before changing any of them.
 */
#define SORT_KEY struct rt_fp
#define SORT_PREFIX(x) rt_fp_##x
#define SORT_PRESORT
#define SORT_INPUT_FB
#define SORT_OUTPUT_FB
#define SORT_REGULAR
#define SORT_UP_TO fpsort_bs

/*
 * Comparison callback for the rt_fp sorter: records are ordered by the raw
 * bytes of their fingerprints, nothing else is taken into account.
 */
static int
rt_fp_compare(struct rt_fp *x, struct rt_fp *y)
{
  const void *left = &x->fp.fp;
  const void *right = &y->fp.fp;

  return memcmp(left, right, sizeof(struct fingerprint));
}

#include "lib/sorter.h"

/*
 * Record read from the output of resolve_fastbuf() by prune_references() and
 * select_best(), and sorted by rt_id_sort().
 * NOTE(review): hash and index are presumably the bytes following the
 * card_print in struct rt_fp, carried through resolve_fastbuf() (main()
 * passes their total size to it) -- confirm against the resolver.
 */
struct rt_id {
  /* res.src and res.dest are used as card IDs indexing merges[] */
  struct resolve_output res;
  u32 hash;
  u16 index;
} PACKED;

/*
 * Parameters for the inclusion of lib/sorter.h below.  With this prefix the
 * comparison callback is rt_id_compare() and the entry point used by main()
 * is rt_id_sort().
 * NOTE(review): the precise meaning of the individual SORT_xxx switches is
 * defined by lib/sorter.h -- check it before changing any of them.
 */
#define SORT_KEY struct rt_id
#define SORT_PREFIX(x) rt_id_##x
#define SORT_PRESORT
#define SORT_INPUT_FB
#define SORT_OUTPUT_FB
#define SORT_REGULAR

/*
 * Comparison callback for the rt_id sorter.
 *
 * The primary key is the destination card after following merges[] twice,
 * the secondary key is the hash of the reference text, so records pointing
 * to the same final card with the same hash end up next to each other.
 */
static int
rt_id_compare(struct rt_id *x, struct rt_id *y)
{
  uns final_x = merges[merges[x->res.dest]];
  uns final_y = merges[merges[y->res.dest]];

  COMPARE(final_x, final_y);
  COMPARE(x->hash, y->hash);
  return 0;
}

#include "lib/sorter.h"

/*
 * A selected reference text: filled in by heap_insert_best(), written out by
 * heap_flush(), sorted by rt_best_sort() and consumed by dump_reftexts().
 * The structure is written to the temporary file as raw bytes.
 */
struct rt_best {
  /* Source and destination copied from the best record of the group */
  struct resolve_output res;
  /* Index of the chosen reftext among the reftexts of the source card */
  u16 index;
  /* Number of records merged into this entry (total_cnt) */
  u16 count;
  /* Half of 8*log2 of the summed source weights, clamped to 0..255; dump_reftexts() doubles it again */
  byte weight;
} PACKED;

/*
 * Parameters for the inclusion of lib/sorter.h below.  With this prefix the
 * comparison callback is rt_best_compare() and the entry point used by main()
 * is rt_best_sort().
 * NOTE(review): the precise meaning of the individual SORT_xxx switches is
 * defined by lib/sorter.h -- check it before changing any of them.
 */
#define SORT_KEY struct rt_best
#define SORT_PREFIX(x) rt_best_##x
#define SORT_PRESORT
#define SORT_INPUT_FB
#define SORT_OUTPUT_FB
#define SORT_REGULAR

/*
 * Comparison callback for the rt_best sorter: order by source card first and
 * by the index of the reftext within that card second.  This is the order in
 * which dump_reftexts() walks the reftext file.
 */
static int
rt_best_compare(struct rt_best *x, struct rt_best *y)
{
  /* Primary key: source card */
  COMPARE(x->res.src, y->res.src);

  /* Secondary key: position of the reftext within the source card */
  COMPARE(x->index, y->index);

  return 0;
}

#include "lib/sorter.h"

static struct fastbuf *
analyse_reftexts(void)
{
  struct fastbuf *ref = index_bopen(fn_ref_texts, O_RDONLY);
  uns cut = 0;
  struct fastbuf *fps = bopen_tmp(indexer_fb_size);
  struct rt_fp rt1;
  bb_t bb;
  uns last_id = ~0, last_index = ~0;
  bb_init(&bb);
  while ((rt1.fp.cardid = bgetl(ref)) != ~0U)
  {
    ASSERT(!(rt1.fp.cardid & ETYPE_MASK));
    breadb(ref, &rt1.fp.fp, sizeof(struct fingerprint));
    uns l = bgetw(ref);
    bb_grow(&bb, l+1);
    bread(ref, bb.ptr, l);
    bb.ptr[l] = 0;
    rt1.index = last_index = (rt1.fp.cardid==last_id) ? last_index+1 : 0;
    rt1.hash = rt_hf(bb.ptr);
    last_id = rt1.fp.cardid;
    if (last_index < 0x10000)
      bwrite(fps, &rt1, sizeof(struct rt_fp));
    else
      cut++;
  }
  brewind(fps);
  bclose(ref);
  bb_done(&bb);
  log(L_INFO, "Cut %u reftexts", cut);
  return fps;
}

static struct fastbuf *
prune_references(struct fastbuf *ids)
{
  struct fastbuf *ids2 = bopen_tmp(indexer_fb_size);
  struct rt_id rt2;
  uns pruning_in = 0, pruning_out = 0;
  while (bread(ids, &rt2, sizeof(struct rt_id)))
  {
    pruning_in++;
    uns src = rt2.res.src, dest= rt2.res.dest;
    ASSERT(dest < FIRST_ID_NEW);
    if (merges[src] != ~0U && merges[dest] != ~0U			/* src/dest card not dropped */
	&& merges[merges[src]] != ~0U && merges[merges[dest]] != ~0U	/* final src/dest card is nonempty */
	&& merges[merges[src]] != merges[merges[dest]]			/* not a self-link */
       )
    {
      bwrite(ids2, &rt2, sizeof(struct rt_id));
      pruning_out++;
    }
  }
  bclose(ids);
  brewind(ids2);
  log(L_INFO, "Taken %u from %u reftexts", pruning_out, pruning_in);
  return ids2;
}

/*
 * State shared by select_best(), heap_insert_best() and heap_flush().
 */

/* Heap of candidates for the current destination; entries live in heap[1..heap_tail] */
static struct rt_best *heap;
/* Number of entries currently stored in the heap */
static uns heap_tail;
/* Record with the largest source weight seen so far in the current (destination, hash) group */
static struct rt_id best_rt;

/* Destination of the current group, ~0U before the first record has been seen */
static uns last_dest;				// always merges[merges[..]]
/* Number of records accumulated in the current (destination, hash) group */
static uns total_cnt;
/* Sum of the source weights in the current group; weight belonging to best_rt */
static float total_wt, best_wt;
/* Records read by select_best() / records written by heap_flush() */
static uns select_in, select_out;

/*
 * Close the current (destination, hash) group and put its representative
 * into the heap of candidates for the current destination.
 *
 * The entry takes source, destination and index from best_rt, the number of
 * merged records from total_cnt and a weight computed as 8*log2(total_wt),
 * halved so that it fits in a byte.
 *
 * The heap is ordered with the lowest weight on top.  If it already holds
 * ref_max_count+1 entries, the lowest one is thrown away first, so that
 * after the insertion the heap contains at most ref_max_count+1 entries.
 * NOTE(review): the trimming happens before the insertion, hence the most
 * recently inserted entry is never compared with the discarded one --
 * confirm that keeping ref_max_count+1 entries is intended.
 */
static inline void
heap_insert_best(void)
{
#define	WORST_LESS(a,b) (a.weight < b.weight)
  if (heap_tail >= ref_max_count+1)
    {
      DBG("- %d %d->%d %d", heap[1].res.src, heap[1].res.dest, merges[merges[heap[1].res.dest]], heap[1].weight);
      HEAP_DELMIN(struct rt_best, heap, heap_tail, WORST_LESS, HEAP_SWAP);
    }
  uns i = ++heap_tail;
  heap[i].res = best_rt.res;
  heap[i].index = best_rt.index;
  float x = logf(total_wt) / M_LN2 * 8.;
  heap[i].count = total_cnt;
  heap[i].weight = CLAMP(x/2, 0, 255);		// divide by 2 to fit into byte
  HEAP_INSERT(struct rt_best, heap, heap_tail, WORST_LESS, HEAP_SWAP);
}

/*
 * Finish the current destination: insert the pending group into the heap and
 * write all entries of the heap to @rts, leaving the heap empty.
 *
 * Does nothing when no record has been processed yet (last_dest == ~0U).
 * Before writing, the heap is rebuilt with the comparison reversed, so the
 * entry at heap[1] which gets written in each step is the one with the
 * highest weight of those remaining.  The number of written entries is added
 * to select_out.
 */
static inline void
heap_flush(struct fastbuf *rts)
{
#define	BEST_LESS(a,b) (a.weight > b.weight)
  if (last_dest == ~0U)
    return;
  heap_insert_best();
  HEAP_INIT(struct rt_best, heap, heap_tail, BEST_LESS, HEAP_SWAP);
  select_out += heap_tail;
  while (heap_tail)
  {
    bwrite(rts, heap+1, sizeof(struct rt_best));
    DBG("! %d %d->%d %d", heap[1].res.src, heap[1].res.dest, merges[merges[heap[1].res.dest]], heap[1].weight);
    HEAP_DELMIN(struct rt_best, heap, heap_tail, BEST_LESS, HEAP_SWAP);
  }
}

static struct fastbuf *
select_best(struct fastbuf *ids)
{
  struct fastbuf *rts = bopen_tmp(indexer_fb_size);
  heap = alloca((ref_max_count+2) * sizeof(struct rt_best));
  heap_tail = 0;
  last_dest = ~0U;
  total_cnt = 0;
  total_wt = best_wt = 0.;
  select_in = select_out = 0;

  attrs_part_map(0);
  byte *card_weights;
  READ_ATTR(card_weights, weight);
  attrs_part_unmap();

  struct rt_id rt2;
  while (bread(ids, &rt2, sizeof(struct rt_id)))
  {
    select_in++;
    uns src = rt2.res.src;
    uns dest = merges[merges[rt2.res.dest]];
    float wt = expf(card_weights[src] * M_LN2 / 8.);
    if (dest == last_dest && rt2.hash == best_rt.hash)
    {
      if (wt > best_wt)
      {
	best_rt = rt2;
	best_wt = wt;
      }
    }
    else
    {
      if (dest != last_dest)
      {
	heap_flush(rts);
	last_dest = dest;
      }
      else
	heap_insert_best();
      best_rt = rt2;
      best_wt = wt;
      total_cnt = 0;
      total_wt = 0.;
    }
    total_cnt++;
    total_wt += wt;
  }
  heap_flush(rts);
  bclose(ids);
  brewind(rts);
  xfree(card_weights);
  log(L_INFO, "Selected %u reftexts from %u", select_out, select_in);
  return rts;
}

static void
dump_reftexts(struct fastbuf *rts)
{
  struct fastbuf *ref = index_bopen(fn_ref_texts, O_RDONLY);
  struct fastbuf *urls = index_bopen(fn_urls, O_RDONLY);
  struct fastbuf *labels = index_bopen(fn_labels_by_id, O_WRONLY | O_APPEND);
  put_attr_set_type(BUCKET_TYPE_V33);
  struct rt_best rt3;
  uns curr_url = ~0U, curr_index = ~0U;
  uns dumped = 0;
  byte url[MAX_URL_SIZE];
  bb_t bb;
  bb_init(&bb);
  while (bread(rts, &rt3, sizeof(struct rt_best)))
  {
    uns cardid, len;
    DBG("Looking for %x %d", rt3.res.src, rt3.index);
    if (rt3.res.src != curr_url)
      curr_index = ~0U;
    while (1)
    {
      cardid = bgetl(ref);
      ASSERT(cardid != ~0U);
      bskip(ref, sizeof(struct fingerprint));
      len = bgetw(ref);
      bb_grow(&bb, len+1);
      bread(ref, bb.ptr, len);
      bb.ptr[len] = 0;
      if (cardid == rt3.res.src)
      {
	curr_index++;
	if (curr_index == rt3.index)
	  break;
      }
      else
	curr_index = ~0U;
      DBG("Skipped %x %d", cardid, curr_index);
    }
    while (curr_url != rt3.res.src)
    {
      bgets(urls, url, MAX_URL_SIZE);
      curr_url++;
    }
    bputl(labels, rt3.res.dest);
    bputc(labels, LABEL_TYPE_BODY);
    bput_attr_format(labels, 'x', "%s %d %d %s", url, rt3.weight*2, rt3.count, bb.ptr);
    bput_attr_separator(labels);
    dumped++;
  }
  bb_done(&bb);
  bclose(ref);
  bclose(urls);
  bclose(labels);
  bclose(rts);
  log(L_INFO, "Dumped %u reftexts", dumped);
}

/*
 * Entry point: run the whole pipeline of reference text processing.
 *
 * No arguments except for the standard configuration options are accepted.
 * The merges[] array stays mapped from the first stage until the selection
 * of the best reftexts has finished; every stage consumes the fastbuf
 * produced by the previous one.
 */
int
main(int argc, char **argv)
{
  struct fastbuf *stream;
  int opt;

  log_init(argv[0]);
  opt = cf_getopt(argc, argv, CF_SHORT_OPTS, CF_NO_LONG_OPTS, NULL);
  if (opt >= 0 || optind < argc)
    {
      fputs("This program supports only the following command-line arguments:\n" CF_USAGE, stderr);
      exit(1);
    }

  merges_map(0);

  log(L_INFO, "Analysing reference texts");
  stream = analyse_reftexts();

  log(L_INFO, "Sorting by fingerprints");
  fpsort_bs = resolve_optimize_run_length(stream);
  stream = rt_fp_sort(stream);

  /* Everything in struct rt_fp behind the card_print is declared to the resolver by its size */
  log(L_INFO, "Resolving the fingerprints");
  stream = resolve_fastbuf(stream, RESOLVE_SKIP_UNKNOWN | RESOLVE_SKIP_NEW, sizeof(struct rt_fp) - sizeof(struct card_print));

  log(L_INFO, "Pruning references");
  stream = prune_references(stream);

  log(L_INFO, "Sorting by merged destination and hashes");
  stream = rt_id_sort(stream);

  log(L_INFO, "Selecting best reftexts");
  stream = select_best(stream);
  merges_unmap();

  log(L_INFO, "Sorting by source id and index");
  stream = rt_best_sort(stream);

  log(L_INFO, "Dumping reftexts and urls into labels");
  dump_reftexts(stream);

  return 0;
}
