/*
 *	Sherlock Indexer -- Merging of Identical Documents
 *
 *	(c) 2001--2003 Martin Mares <mj@ucw.cz>
 *	(c) 2003 Robert Spalek <robert@ucw.cz>
 *
 *	This module identifies equivalence classes defined by the Merges
 *	array and merges each class to a single primary card (the card
 *	with largest weight in the class).
 *	The primary card is marked with CARD_FLAG_MERGED and its attributes
 *	are combined with attributes of the other cards which receive
 *	CARD_FLAG_DUP. The mapping secondary -> primary is stored to the
 *	Merges array (which is also used as a work-space), cards with
 *	CARD_FLAG_EMPTY set are not touched (their positions in the array
 *	are used to denote redirects).
 */

#include "sherlock/sherlock.h"
#include "lib/conf.h"
#include "lib/url.h"
#include "lib/fastbuf.h"
#include "indexer/indexer.h"
#include "indexer/attrs.h"
#include "indexer/merges.h"

#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

/*
 *  Configuration variables bound to names by merger_config[] below.
 *
 *  fn_class_log, class_log_threshold
 *	Registered as "ClassLog" and "LogThreshold"; not used by the code
 *	visible in this file.
 *  giant_documents
 *	If non-zero, merge_cards() marks with CARD_NOTE_GIANT every non-empty
 *	card whose class contains at least this many non-empty cards.
 *  giant_redirects
 *	If non-zero, the same penalty is applied, but the class size is
 *	counted over all cards, i.e., including the empty ones whose
 *	merges[] entries denote redirects.
 */
static byte *fn_class_log;
static uns class_log_threshold, giant_documents, giant_redirects;

/*
 *  Configuration table passed to cf_register() in main(). The first entry
 *  (CT_SECTION) carries the name "Merger", the following ones bind an item
 *  name and type (CT_STRING / CT_INT) to the variable receiving the value,
 *  and the CT_STOP entry terminates the table.
 */
static struct cfitem merger_config[] = {
  { "Merger",		CT_SECTION,	NULL },
  { "ClassLog",		CT_STRING,	&fn_class_log },
  { "LogThreshold",	CT_INT,		&class_log_threshold },
  { "GiantDocuments",	CT_INT,		&giant_documents },
  { "GiantRedirects",	CT_INT,		&giant_redirects },
  { NULL,		CT_STOP,	NULL }
};

/*
 *  Number of cards whose note carries CARD_NOTE_GIANT; recomputed by
 *  merge_cards() and reported by show_stats().
 */
static uns giant_cnt;

/*
 *  Merge every equivalence class recorded in merges[] to a single primary
 *  card (the one with the largest weight; on a tie the card with the lowest
 *  index wins, because a later card replaces the current best only if it is
 *  strictly heavier) and apply the giant-class penalties.
 *
 *  attrs	per-card attributes; CARD_FLAG_DUP is set on every non-empty
 *		card which is not its own primary, CARD_FLAG_MERGED on the
 *		primary of each such card
 *  card_count	number of cards; attrs[], merges[] and the card notes are all
 *		indexed by card number below this bound
 *
 *  Side effects:
 *    - merges[i] of every non-empty card is rewritten to its primary card
 *      (primaries point to themselves); entries of empty cards are left
 *      alone,
 *    - the card-notes file is updated: CARD_NOTE_GIANT is set on members
 *      of classes reaching giant_redirects / giant_documents (a zero
 *      threshold disables the respective check) and weight_merged is set
 *      to the card's weight for every card,
 *    - giant_cnt is set to the number of notes carrying CARD_NOTE_GIANT.
 *
 *  Returns an array of card_count counters allocated by xmalloc(): for each
 *  primary card the number of non-empty cards in its class, zero for all
 *  other cards. The array is not freed here, it belongs to the caller.
 */
static u32 *
merge_cards(struct card_attr *attrs, uns card_count)
{
  u32 *best = xmalloc(sizeof(*best) * card_count);

  /* For each class (identified by its root), find the best card there */
  for (uns i=0; i<card_count; i++)
    best[i] = ~0U;
  for (uns i=0; i<card_count; i++)
    if (!(attrs[i].flags & CARD_FLAG_EMPTY))
      {
	uns root = merges_find_root(i);
	uns j = best[root];
	if (j == ~0U || attrs[i].weight > attrs[j].weight)
	  best[root] = i;
      }

  /* Merge each class to its best card */
  for (uns i=0; i<card_count; i++)
    if (!(attrs[i].flags & CARD_FLAG_EMPTY))
      {
	/* We assume path compression has been done by the previous pass */
	uns dest;
	if (merges[i] == ~0U)		/* Unlinked card forms a class on its own */
	  merges[i] = i;
	dest = best[merges[i]];
	ASSERT(dest != ~0U);
	merges[i] = dest;
      }

  /* Update card attributes */
  for (uns i=0; i<card_count; i++)
    if (!(attrs[i].flags & CARD_FLAG_EMPTY) && merges[i] != i)
      {
	attrs[i].flags |= CARD_FLAG_DUP;
	attrs[merges[i]].flags |= CARD_FLAG_MERGED;
      }

  /* Mmap card-notes for saving giant penalties */
  struct partmap *notesmap = partmap_open(index_name(fn_notes), 1);
  ASSERT(partmap_size(notesmap) == (sh_off_t) (sizeof(struct card_note) * card_count));

  if (giant_redirects)
    {
      /* Re-use "best" for class sizes including redirects */
      memset(best, 0, sizeof(*best) * card_count);
      for (uns i=0; i<card_count; i++)
	{
	  /*
	   *  Non-empty cards always have a valid link by now, but empty
	   *  cards were not touched by the passes above and can still
	   *  carry the ~0U "no link" mark, which must not be used as
	   *  an index.
	   *  NOTE(review): a redirect pointing to a non-primary card is
	   *  counted on that card, not on the primary of its class --
	   *  confirm whether this is intended.
	   */
	  if (merges[i] != ~0U)
	    best[merges[i]]++;
	}

      /* Apply penalties */
      for (uns i=0; i<card_count; i++)
	if (!(attrs[i].flags & CARD_FLAG_EMPTY) && best[merges[i]] >= giant_redirects)
	  {
	    struct card_note *note = bring_note(notesmap, i);
	    note->flags |= CARD_NOTE_GIANT;
	  }
    }

  /* Re-use "best" for class sizes (this is also what we return) */
  memset(best, 0, sizeof(*best) * card_count);
  for (uns i=0; i<card_count; i++)
    if (!(attrs[i].flags & CARD_FLAG_EMPTY))
      best[merges[i]]++;

  /* Apply penalties */
  if (giant_documents)
    {
      for (uns i=0; i<card_count; i++)
	if (!(attrs[i].flags & CARD_FLAG_EMPTY) && best[merges[i]] >= giant_documents)
	  {
	    struct card_note *note = bring_note(notesmap, i);
	    note->flags |= CARD_NOTE_GIANT;
	  }
    }

  /* Take a note on the weights and count the penalized cards */
  giant_cnt = 0;
  for (uns i=0; i<card_count; i++)
    {
      struct card_note *note = bring_note(notesmap, i);
      if (note->flags & CARD_NOTE_GIANT)
	giant_cnt++;
      note->weight_merged = attrs[i].weight;
    }
  partmap_close(notesmap);

  return best;
}

/*
 *  Log a one-line summary of the merge.
 *
 *  attrs	per-card attributes with CARD_FLAG_DUP / CARD_FLAG_MERGED
 *		already set
 *  card_count	number of cards
 *  sizes	per-card class sizes, indexed by card number
 *
 *  Empty cards are skipped. A card flagged as a duplicate is counted as
 *  a duplicate only; otherwise a card flagged as merged is counted as one
 *  non-trivial class. The reported maximum is the largest sizes[] value
 *  seen on a non-empty card; the number of penalized cards is taken from
 *  giant_cnt.
 */
static void
show_stats(struct card_attr *attrs, uns card_count, u32 *sizes)
{
  uns merged_classes = 0;
  uns duplicates = 0;
  uns largest = 0;

  for (uns idx=0; idx<card_count; idx++)
    {
      struct card_attr *a = &attrs[idx];

      if (a->flags & CARD_FLAG_EMPTY)
	continue;

      if (a->flags & CARD_FLAG_DUP)
	duplicates++;
      else if (a->flags & CARD_FLAG_MERGED)
	merged_classes++;

      if (sizes[idx] > largest)
	largest = sizes[idx];
    }

  log(L_INFO, "Merged %d cards: %d non-trivial classes (max %d), %d duplicates, %d penalized", card_count, merged_classes,
      largest, duplicates, giant_cnt);
}

/*
 *  Entry point of the merger: register the configuration table, check the
 *  command line, map the attributes and merges, run merge_cards(), log
 *  the statistics and unmap both again.
 *
 *  Exits with status 1 after printing the usage message when cf_getopt()
 *  returns a non-negative value or when any argument is left over after
 *  option parsing; returns 0 otherwise.
 */
int
main(int argc, char **argv)
{
  log_init(argv[0]);
  cf_register(merger_config);

  int opt = cf_getopt(argc, argv, CF_SHORT_OPTS, CF_NO_LONG_OPTS, NULL);
  if (opt >= 0 || optind < argc)
    {
      fputs("This program supports only the following command-line arguments:\n" CF_USAGE, stderr);
      exit(1);
    }

  log(L_INFO, "Merging cards");

  attrs_map(1);
  merges_map(1);

  /* Pick the primary cards, rewrite merges[] and obtain the class sizes */
  u32 *class_sizes = merge_cards(attrs, card_count);

  /* Report what has been done */
  show_stats(attrs, card_count, class_sizes);

  attrs_unmap();
  merges_unmap();

  return 0;
}
