/*
 *	Sherlock Indexer -- Digestive Processes
 *
 *	(c) 2001--2004 Martin Mares <mj@ucw.cz>
 *	(c) 2003--2004 Robert Spalek <robert@ucw.cz>
 */

#include "sherlock/sherlock.h"
#include "lib/lists.h"
#include "lib/conf.h"
#include "lib/fastbuf.h"
#include "lib/mempool.h"
#include "lib/url.h"
#include "sherlock/object.h"
#include "lib/hashfunc.h"
#include "sherlock/tagged-text.h"
#include "lib/unicode.h"
#include "sherlock/lizard-fb.h"
#include "sherlock/bucket.h"
#include "charset/unicat.h"
#include "indexer/indexer.h"
#include "indexer/lexicon.h"

#include <string.h>
#include <stdlib.h>
#include <fcntl.h>
#include <alloca.h>

/* In-memory buffer collecting the card being built, and the output card file it is copied to */
static struct fastbuf *cards_mem, *cards_out;
/* ID of the card currently being processed (incremented once per chewed card) */
static uns card_id;

/* Profiling timers, one per processing stage switched between in chew_card() */
#undef PROFILE_TSC
#include "lib/profile.h"
static prof_t pr_fetch, pr_preproc, pr_strings, pr_words, pr_prints, pr_cards;

/*
 *  Configuration
 */

/* Values of the "Chewer" configuration section (see chewer_config[]) with their defaults */
static uns word_buf_size = 65536;		/* Used to derive lentry_limit in word_init() */
static uns string_buf_size = 65536;		/* Size of the string entry buffer in bytes */
static uns string_cats = ~0;			/* Bit mask of string types accepted by string_add() */
static uns string_max = 4096;			/* Head-room in entries kept free below the buffer end */
static uns phrase_limit = 4095;			/* Word positions at or above this are stored as 4095 */
static byte *card_attr_list = "U";		/* Attribute letters copied to the card by card_card() */
static uns excerpt_max = ~0;			/* Limit on the length of the stored 'X' text */
static uns doc_buf_size = 65536;		/* Size of doc_buf in bytes */
static uns giant_ban_meta, giant_penalty;	/* Meta types masked out / weight penalty for CARD_NOTE_GIANT cards */
static uns type_weights[WT_MAX];		/* Per word type weights used by average_weight() */
static uns swindler_threshold = ~0;		/* Average weight above which a card is a "swindler" */
static uns no_contents_penalty, no_contents_threshold;
static struct list hypertext_types;		/* List of struct hypertext_type */
static uns no_links_penalty;
static uns no_title_penalty;
static uns min_compression;			/* Passed to lizard_set_type() divided by 100 */

/* One content type registered by AddHyperTextType; member of the hypertext_types list */
struct hypertext_type
{
  struct node n;
  byte *content_type;			/* Compared by strcmp() with the 'T' attribute of a card header */
};

/*
 *  Configuration hook for "TypeWeights": the value must split (by wordsplit())
 *  to exactly WT_MAX fields, each of which is converted by atol() and stored
 *  to type_weights[]. Returns an error message or NULL on success.
 */
static byte *
cf_type_weights(struct cfitem *a UNUSED, byte *c)
{
  byte *tokens[WT_MAX];

  if (wordsplit(c, tokens, WT_MAX) != WT_MAX)
    return "Invalid weight definition";

  int k = 0;
  while (k < WT_MAX)
    {
      type_weights[k] = (uns) atol(tokens[k]);
      k++;
    }
  return NULL;
}

/*
 *  Configuration hook for "AddHyperTextType": appends the given content type
 *  to the hypertext_types list. The string is stored by reference, not copied.
 *  Always returns NULL (success).
 */
static byte *
add_hypertext_type(struct cfitem *a UNUSED, byte *c)
{
  struct hypertext_type *entry = cfg_malloc(sizeof(*entry));

  entry->content_type = c;
  add_tail(&hypertext_types, &entry->n);
  return NULL;
}

/*
 *  Description of the "Chewer" configuration section: maps item names to the
 *  static variables above or to parser hooks. Registered by chewconf_init().
 */
static struct cfitem chewer_config[] = {
  { "Chewer",		CT_SECTION,	NULL },
  { "WordBufSize",	CT_INT,		&word_buf_size },
  { "StringBufSize",	CT_INT,		&string_buf_size },
  { "StringCats",	CT_INT,		&string_cats },
  { "StringMax",	CT_INT,		&string_max },
  { "PhraseLimit",	CT_INT,		&phrase_limit },
  { "CardAttrs",	CT_STRING,	&card_attr_list },
  { "ExcerptMax",	CT_INT,		&excerpt_max },
  { "DocBufSize",	CT_INT,		&doc_buf_size },
  { "GiantBanMeta",	CT_INT,		&giant_ban_meta },
  { "GiantPenalty",	CT_INT,		&giant_penalty },
  { "TypeWeights",	CT_FUNCTION,	&cf_type_weights },
  { "SwindlerThreshold",CT_INT,		&swindler_threshold },
  { "NoContentsPenalty",CT_INT,		&no_contents_penalty },
  { "NoContentsThreshold",CT_INT,	&no_contents_threshold },
  { "AddHyperTextType",	CT_FUNCTION,	&add_hypertext_type },
  { "NoLinksPenalty",	CT_INT,		&no_links_penalty },
  { "NoTitlePenalty",	CT_INT,		&no_title_penalty },
  { "MinCompression",	CT_INT,		&min_compression },
  { NULL,		CT_STOP,	NULL }
};

/* Constructor: registers the configuration section and initializes the empty hypertext_types list */
static void CONSTRUCTOR chewconf_init(void)
{
  cf_register(chewer_config);
  init_list(&hypertext_types);
}

/*
 *  Indexing of strings
 */

/* Output file of the string index */
static struct fastbuf *string_index;

/* A single buffered occurrence of a string: its fingerprint, the card it belongs to and the string type */
struct sentry {
  struct fingerprint fp;
  u32 id;
  u32 type;
};

/* Buffer of string entries waiting for string_flush() */
static struct sentry *string_buf;
/* Number of entries used, capacity of the buffer, and fill level at which string_card() flushes */
static uns sbuf_count, sbuf_size, sbuf_limit;
/* Statistics: number of flushed runs, strings written and strings dropped because the buffer was full */
static uns string_runs;
static u64 string_cnt, string_dropped_cnt;

static void
string_init(void)
{
  string_index = index_bopen(fn_string_index, O_WRONLY | O_CREAT | O_TRUNC);
  sbuf_size = string_buf_size / sizeof(struct sentry);
  if (sbuf_size < string_max)
    {
      log(L_WARN, "StringMax entries don't fit in StringBufSize, increasing StringBufSize to %d", string_max * sizeof(struct sentry));
      sbuf_size = string_max;
    }
  string_buf = xmalloc(sbuf_size * sizeof(struct sentry));
  sbuf_limit = sbuf_size - string_max;
  DBG("Allocated string pool with %d entries", sbuf_size);
}

/*
 *  "Less than" predicate for sorting string entries: orders by fingerprint
 *  (bytewise), then by card ID, then by string type. Returns 1 if a sorts
 *  before b, 0 otherwise.
 */
static inline int
string_cmp(const struct sentry *a, const struct sentry *b)
{
  int fp_order = memcmp(&a->fp, &b->fp, sizeof(struct fingerprint));

  if (fp_order)
    return fp_order < 0;
  COMPARE_LT(a->id, b->id);
  COMPARE_LT(a->type, b->type);
  return 0;
}

/* Instantiate the array sorter over string_buf[] ordered by string_cmp(); string_flush() calls it as string_sort() */
#define ASORT_PREFIX(x) string_##x
#define ASORT_KEY_TYPE struct sentry
#define ASORT_ELT(i) string_buf[i]
#define ASORT_LT(x,y) string_cmp(&(x), &(y))
#include "lib/arraysort.h"

/*
 *  Sort the buffered string entries and append them to the string index as one run.
 *
 *  Entries sharing a fingerprint form a chain, which is written as: the fingerprint,
 *  a 32-bit size of the rest of the chain in bytes, and then for each distinct card
 *  a 32-bit card ID, a 16-bit number of distinct types and one 16-bit word
 *  (type << 12) | 0xfff per distinct type.
 *
 *  Does nothing when the buffer is empty. Resets sbuf_count and updates the statistics.
 */
static void
string_flush(void)
{
  if (!sbuf_count)
    return;
  string_sort(sbuf_count);

  uns i = 0;
  while (i < sbuf_count)
    {
      /* calculate chain length; the first entry always costs ID + count + one type = 8 bytes */
      uns start = i++;
      uns size = 8;
      struct sentry *first = &string_buf[start];
      /* Never look beyond the valid entries: slots at sbuf_count and above are stale or not allocated at all */
      while (i < sbuf_count)
	{
	  struct sentry *e = &string_buf[i];
	  struct sentry *prev = e-1;
	  if (memcmp(&first->fp, &e->fp, sizeof(struct fingerprint)))
	    break;
	  if (e->id != prev->id)
	    size += 8;			/* New card: ID + count + first type */
	  else if (e->type != prev->type)
	    size += 2;			/* Same card, another type */
	  i++;
	}

      /* output the chain */
      bwrite(string_index, &first->fp, sizeof(struct fingerprint));
      bputl(string_index, size);
      sh_off_t expected_end = btell(string_index) + size;
      struct sentry *cfirst = &string_buf[start];
      struct sentry *clast = &string_buf[i];
      while (cfirst < clast)
	{
	  /* Find the end of the group belonging to a single card and count its distinct types */
	  struct sentry *e = cfirst+1;
	  size = 1;
	  while (e < clast && e->id == cfirst->id)
	    {
	      if (e->type != (e-1)->type)
		size++;
	      e++;
	    }
	  bputl(string_index, cfirst->id);
	  bputw(string_index, size);
	  /* Entries are sorted by type within a card, so duplicates are adjacent */
	  uns last_type = ~0U;
	  while (cfirst < e)
	    {
	      if (cfirst->type != last_type)
		{
		  bputw(string_index, (cfirst->type << 12) | 0xfff);
		  last_type = cfirst->type;
		}
	      cfirst++;
	    }
	}
      /* The precomputed chain size must match what has been written */
      ASSERT(btell(string_index) == expected_end);
    }
  string_runs++;
  string_cnt += sbuf_count;
  sbuf_count = 0;
}

/* Flush the remaining buffered strings, close the string index and log the statistics */
static void
string_end(void)
{
  string_flush();
  bclose(string_index);
  log(L_INFO, "Generated %Ld strings in %d runs; %Ld strings trimmed", string_cnt, string_runs, string_dropped_cnt);
}

/*
 *  Remember string s of the given type for the current card.
 *  Types not enabled in the string_cats mask are ignored; when the buffer
 *  is full, the string is dropped and only counted in string_dropped_cnt.
 */
static void
string_add(byte *s, uns type)
{
  uns wanted = string_cats & (1 << type);

  if (!wanted)
    return;
  if (sbuf_count < sbuf_size)
    {
      struct sentry *slot = string_buf + sbuf_count;
      sbuf_count++;
      fingerprint(s, &slot->fp);
      slot->id = card_id;
      slot->type = type;
    }
  else
    string_dropped_cnt++;
}

/* Item of a zero-terminated table for string_add_attrs(): attribute letter and the string type to index it as */
struct str_list {
  uns attr, type;
};

/*
 *  For each item of the table strs (terminated by attr == 0), walk all values
 *  of the given attribute in object o and index the part of each value up to
 *  the first space (or the whole value if it contains none).
 *  The value is cut temporarily in place and restored afterwards.
 */
static void
string_add_attrs(struct odes *o, struct str_list *strs)
{
  for (; strs->attr; strs++)
    {
      struct oattr *cur = obj_find_attr(o, strs->attr);
      while (cur)
	{
	  byte *end = cur->val;
	  while (*end && *end != ' ')
	    end++;
	  if (!*end)
	    string_add(cur->val, strs->type);
	  else
	    {
	      *end = 0;
	      string_add(cur->val, strs->type);
	      *end = ' ';
	    }
	  cur = cur->same;
	}
    }
}

/*
 *  Index all strings of a single card. For each header in the hdr chain it adds
 *  the 'U' URL (if ST_URL is defined), the host and domain parsed from it (if
 *  ST_HOST and ST_DOMAIN are defined) and the 'y' attributes of the header's
 *  redirects. Then it adds the reference attributes of the object o (if ST_REF
 *  is defined) and lets custom_index_strings() add its own strings.
 *  Finally, the buffer is flushed if it has reached sbuf_limit.
 */
static void
string_card(struct odes *o, struct card_hdr *hdr)
{
  for (struct card_hdr *h=hdr; h; h=h->next)
    {
      byte *url = obj_find_aval(h->odes, 'U');
      ASSERT(url);
      byte buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE];
      struct url u;
#ifdef ST_URL
      string_add(url, ST_URL);
#endif
#if defined(ST_HOST) && defined(ST_DOMAIN)
      if (!url_canon_split(url, buf1, buf2, &u) && u.host)
	{
	  /* The domain is the host name without its first component, indexed only if it still contains a dot */
	  byte *dot = strchr(u.host, '.');
	  string_add(u.host, ST_HOST);
	  if (dot && strchr(dot+1, '.'))
	    string_add(dot+1, ST_DOMAIN);
	}
#endif
#ifdef ST_URL
      for (struct card_hdr *r=h->redirects; r; r=r->next)
	string_add_attrs(r->odes, (struct str_list []) {
	    { 'y', ST_URL },
	    { 0, 0 }
	    });
#endif
    }

#ifdef ST_REF
  string_add_attrs(o, (struct str_list []) {
    { 'A', ST_REF },
    { 'F', ST_REF },
    { 'I', ST_REF },
    { 'R', ST_REF },
    { 'Y', ST_REF },
    { 'a', ST_REF },
    { 'd', ST_REF },
    { 'f', ST_REF },
    { 0, 0 }
    });
#endif

  custom_index_strings(o, string_add);

  /* Flushing is checked only between cards, so that all entries of a card end up in the same run */
  if (sbuf_count >= sbuf_limit)
    string_flush();
}

/*
 *  Preprocessing of documents
 */

/* Buffer with the preprocessed text of the current card (filled by preproc_card()) */
static byte *doc_buf;
/* Length of the text in doc_buf and the maximum seen so far */
static uns doc_length, doc_length_max;
/* Number of cards whose text did not fit in doc_buf */
static uns trim_counter;
/* Per word type character counts of the current card, and the type remapping set by detect_swindler() */
static uns char_counter[WT_MAX], translate_type[WT_MAX];
/* Per-card flags */
static uns has_title, is_hypertext, is_in_catalog;
static uns swindler_detected;
/* Compression statistics */
static uns cards_compr, cards_uncompr;
static sh_off_t total_compr, total_uncompr;

/*
 *  Allocate the document buffer. One extra byte is allocated in front of it
 *  and set to zero, so that doc_buf[-1] is a valid, zero-valued location.
 */
static void
preproc_init(void)
{
  doc_buf = xmalloc(doc_buf_size+1) + 1;
  doc_buf[-1] = 0;
}

/* Log the trimming and compression statistics gathered while chewing cards */
static void
preproc_end(void)
{
  log(L_INFO, "Trimmed %d cards, longest was %d chars", trim_counter, doc_length_max);
  /* "? : 1" avoids division by zero when nothing has been written */
  log(L_INFO, "Compressed %d cards, uncompressed %d, compressed to %d%%", cards_compr, cards_uncompr, (uns) ( 100. * total_compr / (total_uncompr ? : 1)));
}

/*
 *  Concatenate all 'X' attributes of object o to doc_buf, normalizing the
 *  control bytes on the way, and count characters per word type to char_counter[].
 *
 *  Bytes below 0x80 are copied as they are, bytes from 0xc0 up are copied together
 *  with their trailing bytes as a single character, and runs of bytes 0x80..0xbf
 *  (category changers, breaks and brackets) are compressed to at most one byte.
 *  Processing stops (and trim_counter is increased) once less than MAX_ATTR_SIZE
 *  bytes of the buffer remain before the next attribute. Sets doc_length and
 *  updates doc_length_max.
 */
static void
preproc_card(struct odes *o)
{
  byte *w = doc_buf;
  byte *stop = doc_buf + doc_buf_size - MAX_ATTR_SIZE;

  bzero(char_counter, sizeof(char_counter));
  uns cur_type = WT_TEXT;
  struct oattr *oa = obj_find_attr(o, 'X');
  byte *r;
  if (oa && (oa->val[0] < 0x80 || oa->val[0] >= 0xa0))
    {
      /*
       *  If by any accident the card doesn't start with a category switch, add it.
       *  It doesn't mean the card is invalid -- it can start with a ref bracket followed
       *  by a category switch, but it's better to be safe than sorry. Also, we'd like
       *  to keep the "first come category switch" invariant for the search server.
       */
      *w++ = 0x80 | WT_TEXT;
      r = oa->val;
      goto run_for_it;
    }
  for (; oa; oa=oa->same)
    {
      r = oa->val;
      uns c;

      if (w >= stop)
	{
	  trim_counter++;
	  break;
	}
      /*
       *  Separate attributes not starting with a category switch by a space.
       *  NOTE(review): r[-1] reads the byte stored just before the attribute value;
       *  confirm this is what was intended.
       */
      if ((*r < 0x80 || *r >= 0xa0) && r[-1])
	*w++ = ' ';
    run_for_it:
      while (c = *r++)
	{
	  if (c < 0x80)
	  {
	    *w++ = c;
	    char_counter[cur_type]++;
	  }
	  else if (c >= 0xc0)
	    {
	      /* Copy one trailing byte for each extra leading 1-bit (presumably a UTF-8 sequence), counting it as one character */
	      *w++ = c;
	      while (*r && (c & 0x40))
		{
		  *w++ = *r++;
		  c <<= 1;
		}
	      char_counter[cur_type]++;
	    }
	  else				/* Process a sequence of brackets and category changers */
	    {
	      uns cat = 0x80;		/* Bit 7 set means that no category change or break has been seen yet */
	      for(;;)
		{
		  if (c < 0xa0)		/* Category changes and breaks are compressed */
		    cat = (cat & 0x10) | (c & 0x1f);
		  else
		    {			/* Brackets are to be removed */
		      if (c < 0xb0 && *r)
			r++;		/* Bytes 0xa0..0xaf are followed by one more byte, which is skipped as well */
		    }
		  c = *r;
		  if (c < 0x80 || c >= 0xc0)
		    break;
		  r++;
		}
	      if (!(cat & 0x80))
		{
		  *w++ = 0x80 + cat;
		  cur_type = cat & 0x0f;
		}
	    }
	}
    }
  *w = 0;
  doc_length = w - doc_buf;
  if (doc_length > doc_length_max)
    doc_length_max = doc_length;
}

/*
 *  Compute the average of type_weights[] weighted by char_counter[], taking
 *  into account only the types with non-zero weight. Returns 0 when there
 *  are no characters of such types.
 */
static uns
average_weight(void)
{
  uns chars = 0, weighted = 0;

  for (uns t=0; t<WT_MAX; t++)
    {
      if (!type_weights[t])
	continue;
      chars += char_counter[t];
      weighted += char_counter[t] * type_weights[t];
    }
  return chars ? weighted / chars : weighted;
}

/* Names of word types, indexed by type number; used in swindler diagnostics and penalty notes */
static byte *wt_names[8] = { WORD_TYPE_USER_NAMES };

/*
 *  Detect cards whose average word type weight exceeds swindler_threshold and
 *  neutralize them: repeatedly take the heaviest type in use and remap it
 *  (via translate_type[]) to the type with the nearest smaller weight, moving
 *  its character count along, until the average weight drops to the threshold.
 *
 *  Sets swindler_detected and always resets translate_type[] to identity first.
 *  Dies if the only lighter type left has zero weight.
 */
static void
detect_swindler(void)
{
  /* By default, we do not translate any type */
  for (uns i=0; i<WT_MAX; i++)
    translate_type[i] = i;

  /* Compute the initial average weight */
  uns wt = average_weight();
  swindler_detected = (wt > swindler_threshold);

#undef	DEBUG_SWINDLERS
#ifdef	DEBUG_SWINDLERS
  static uns id = 0;
  id++;
  byte buf[500];
  uns len = sprintf(buf, "Frequencies of document %x: ", id);
  uns total_cnt = 0;
  for (uns i=0; i<WT_MAX; i++)
    if (char_counter[i])
    {
      len += sprintf(buf+len, "%s:%d ", wt_names[i], char_counter[i]);
      total_cnt += char_counter[i];
    }
  len += sprintf(buf+len, "TOTAL:%d avg-wt:%d", total_cnt, wt);
  if (swindler_detected)
    len += sprintf(buf+len, " SWINDLER!");
  puts(buf);
#endif

  if (!swindler_detected)
    return;

  do
  {
    int i;
    /* Find a used type with the biggest weight and the type the nearest
     * smaller weight */
    int max = -1, max2 = -1;
    for (i=0; i<WT_MAX; i++)
      if (char_counter[i] && (max<0 || type_weights[i] > type_weights[max]))
	max = i;
    ASSERT(max >= 0 && type_weights[max] > 0);
    for (i=0; i<WT_MAX; i++)
      if (type_weights[i] < type_weights[max]
	  && (max2<0 || type_weights[i] >= type_weights[max2]))
	max2 = i;
    /* Beware!  There's a little magic with > and >= on type_weights, do not change it!  */
    ASSERT(max2 >= 0);
    if (!type_weights[max2])
      die("Invalid configuration of Chewer.TypeWeights, you must have a type with weight smaller than SwindlerThreshold");
    /* Remap the type */
#ifdef	DEBUG_SWINDLERS
    printf("Type %s with frequency %d remapped to %s, ", wt_names[max], char_counter[max], wt_names[max2]);
#endif
    /* Redirect also all types which have been remapped to max in previous rounds */
    for (i=0; i<WT_MAX; i++)
      if (translate_type[i] == (uns)max)
      {
	translate_type[i] = max2;
	char_counter[max2] += char_counter[i];
	char_counter[i] = 0;
      }
    wt = average_weight();
#ifdef	DEBUG_SWINDLERS
    printf("new average weight is %d\n", wt);
#endif
  }
  while (wt > swindler_threshold);

#ifdef	DEBUG_SWINDLERS
  len = sprintf(buf, "New types: ");
  for (uns i=0; i<WT_MAX; i++)
    if (translate_type[i] != i)
      len += sprintf(buf+len, "%s->%s ", wt_names[i], wt_names[translate_type[i]]);
  puts(buf);
#endif
}

/*
 *  Apply the final penalties and bonuses to the weight of a card and record
 *  each of them as a '.' attribute in cards_mem.
 *
 *  attr		card attributes; attr->weight is read and replaced by the result
 *  note		card notes (flags and useful_size are consulted)
 *  filter_bonus	signed bonus assigned by filters, added to the weight
 *  is_image	non-zero for cards exempt from the "no contents" penalty
 *
 *  The resulting weight is clamped to 0..255.
 */
static void
penalize_card(struct card_attr *attr, struct card_note *note, int filter_bonus, uns is_image)
{
  int wt = attr->weight;
  if (note->flags & CARD_NOTE_GIANT)
  {
    /* The attribute letter is passed separately, so the message itself carries no leading dot */
    bput_attr_format(cards_mem, '.', "Penalized by %d: giant class", giant_penalty);
    wt -= giant_penalty;
  }
  if (swindler_detected)
  {
    /* Swindlers are not penalized directly, we only record how their word types have been remapped */
    byte mappings[WT_MAX * 32], *ptr = mappings;
    *ptr = 0;				/* Keep the string valid even if no type differs */
    for (uns i=0; i<WT_MAX; i++)
      if (translate_type[i] != i)
	ptr += sprintf(ptr, "%s->%s ", wt_names[i], wt_names[translate_type[i]]);
    bput_attr_format(cards_mem, '.', "Penalized by 0: swindling, remapped %s", mappings);
  }
  if (!is_image && !is_in_catalog)
  {
    if (no_contents_penalty && note->useful_size < no_contents_threshold)
    {
      bput_attr_format(cards_mem, '.', "Penalized by %d: no contents (%d chars)", no_contents_penalty, note->useful_size);
      wt -= no_contents_penalty;
    }
  }
  if (is_hypertext && !is_in_catalog)
  {
    if (no_links_penalty && !(note->flags & CARD_NOTE_HAS_LINKS))
    {
      bput_attr_format(cards_mem, '.', "Penalized by %d: no links", no_links_penalty);
      wt -= no_links_penalty;
    }
#ifdef	MT_TITLE
    if (no_title_penalty && !has_title)
    {
      bput_attr_format(cards_mem, '.', "Penalized by %d: no title", no_title_penalty);
      wt -= no_title_penalty;
    }
#endif
  }
  if (filter_bonus)
  {
    /* A bonus is reported as a negative penalty */
    bput_attr_format(cards_mem, '.', "Penalized by %d: filters", -filter_bonus);
    wt += filter_bonus;
  }
  attr->weight = CLAMP(wt, 0, 255);
}

/*
 *  Indexing of words
 */

/* Output file of the word index */
struct fastbuf *word_index;

#define LENT_QUANTUM 3			/* Number of references stored in a single lentry */
#define LENT_BITE 4096			/* Number of lentries per chunk of word_pool */

/* A block of up to LENT_QUANTUM references of a single word in a single card; blocks of a word are linked newest first */
typedef struct lentry {
  struct lentry *next;
  u32 id;				/* Card ID */
  u16 count;				/* Number of used items in w[] */
  u16 w[LENT_QUANTUM];			/* Packed references */
} lentry;

/* Number of lentries allocated since the last flush and the level at which word_card() flushes */
static uns lentry_count, lentry_limit;
/* For each context slot, an array of verba for complex words (allocated in lex_load()) */
static struct verbum **context_words;
/* Array of words having at least one lentry in the current run, and a pointer past its last used item */
static struct verbum **used_words, **used_words_last;
static struct mempool *word_pool;
/* Statistics */
static uns word_runs;
static u64 word_entries, word_cnt;
/* Non-zero while meta information is being indexed (see lm_got_word()) */
static uns meta_static_part;
/* Number of entries in the lexicon file */
static uns numerus_verba;

#define LH_CHEWER
#include "indexer/lexhash.h"

/*
 *  Load the ordered lexicon into memory. Ordinary words are inserted to the
 *  lexicon hash table; entries of class WC_COMPLEX carry no text to insert and
 *  take the next free slot of the array belonging to their context instead.
 *  That array is allocated when the corresponding WC_CONTEXT word is read.
 *  Dies on duplicate words.
 */
static void
lex_load(void)
{
  struct fastbuf *b;
  struct verbum *v, **ctxt_last=NULL;

  b = index_bopen(fn_lex_ordered, O_RDONLY);
  numerus_verba = bgetl(b);
  if (lex_context_slots)
    {
      context_words = xmalloc_zero(sizeof(struct verbum *) * lex_context_slots);
      /* ctxt_last[] points to the next free verbum of each context; needed only during loading */
      ctxt_last = alloca(sizeof(struct verbum *) * lex_context_slots);
    }
  for (uns i=0; i<numerus_verba; i++)
    {
      u32 id = bgetl(b);
      u32 cnt = bgetl(b);
      uns ctxt = bget_context(b);
      enum word_class class = id & 7;	/* The lowest 3 bits of the ID hold the word class */
      uns len = bgetc(b);
      if (class == WC_COMPLEX)
	v = ctxt_last[ctxt]++;
      else
	{
	  byte buf[MAX_WORD_LEN+1];
	  breadb(b, buf, len);
	  buf[len] = 0;
	  v = lh_insert(buf, 0);
	  if (!v)
	    die("Malformed lexicon: Duplicate word <%s>", buf);
	  if (class == WC_CONTEXT)
	    ctxt_last[ctxt] = context_words[ctxt] = xmalloc_zero(sizeof(struct verbum)*2*lex_context_slots);
	}
      v->id = id;
      v->u.count = cnt;
      PUT_CONTEXT(&v->context_class, ctxt);
    }
  bclose(b);
  lh_rehash(lh_hash_count);		/* Sort the chains by counts */
  /* The counts are no longer needed, start with empty reference chains for all hashed words */
  LH_WALK(v)
    v->u.first_lent = NULL;
  log(L_INFO, "Read lexicon with %d words (%d total entries)", lh_hash_count, numerus_verba);
}

/*
 *  Write all references of a single word to the word index.
 *
 *  id		word ID to store in the block header
 *  head	chain of lentries of the word, newest first; it is reversed in place
 *
 *  The block consists of the word ID, a 32-bit size of the body in bytes and
 *  for each card its 32-bit ID, a 16-bit number of references and the 16-bit
 *  references themselves. Dies if the written size doesn't match the computed one.
 */
static void
word_flush_single(uns id, lentry *head)
{
  /* The chain is built newest first, put it back to chronological order */
  lentry *ordered = NULL;
  while (head)
    {
      lentry *rest = head->next;
      head->next = ordered;
      ordered = head;
      head = rest;
    }

  /* Size of the body: 6 bytes of header per card plus 2 bytes per reference */
  uns block_size = 0;
  for (lentry *e = ordered; e; )
    {
      u32 card = e->id;
      block_size += 6;
      for (; e && e->id == card; e = e->next)
	block_size += 2*e->count;
    }

  /* Block header */
  bputl(word_index, id);
  bputl(word_index, block_size);
  sh_off_t expected_end = btell(word_index) + block_size;

  /* Body: one group per card */
  for (lentry *e = ordered; e; )
    {
      u32 card = e->id;
      uns refs = 0;
      for (lentry *p = e; p && p->id == card; p = p->next)
	refs += p->count;
      bputl(word_index, card);
      bputw(word_index, refs);
      for (; e && e->id == card; e = e->next)
	bwrite(word_index, e->w, 2*e->count);
    }

  /* Check that the size announced in the header was right */
  if (expected_end != btell(word_index))
    die("word_flush_single: Internal error, size mismatch");
}

/* Instantiate the array sorter over used_words[] keyed by word ID; word_flush() calls it as word_sort() */
#define ASORT_PREFIX(x) word_##x
#define ASORT_KEY_TYPE u32
#define ASORT_ELT(i) used_words[i]->id
#define ASORT_SWAP(i,j) do { struct verbum *tmp=used_words[i]; used_words[i]=used_words[j]; used_words[j]=tmp; } while(0)
#include "lib/arraysort.h"

static void
word_flush(void)
{
  struct verbum **w, *v;

  word_sort(used_words_last - used_words);
  *used_words_last = NULL;
  for (w=used_words; v = *w; w++)
    {
      word_flush_single(v->id/8, v->u.first_lent);
      v->u.first_lent = NULL;
    }

  mp_flush(word_pool);
  word_runs++;
  word_entries += lentry_count;
  lentry_count = 0;
  used_words_last = used_words;
}

/*
 *  Check whether the reference pack has already been recorded for word v in
 *  the current card. The references are scanned from the newest one and the
 *  search gives up at the first reference whose lowest 12 bits differ from
 *  those of pack. Returns 1 if found, 0 otherwise.
 */
static inline int
word_check(struct verbum *v, u16 pack)
{
  for (lentry *e = v->u.first_lent; e && e->id == card_id; e = e->next)
    for (int k = e->count; k-- > 0; )
      {
	u16 seen = e->w[k];
	if (seen == pack)
	  return 1;
	if ((seen ^ pack) & 0xfff)
	  return 0;
      }
  return 0;
}

/*
 *  Record reference pack of word v in the current card. A new lentry is
 *  started when the word has none yet, when the newest one belongs to
 *  a different card or when it is full. A word getting its first lentry
 *  in this run is also appended to used_words.
 */
static inline void
word_add(struct verbum *v, u16 pack)
{
  lentry *cur = v->u.first_lent;
  int need_new = !cur || cur->id != card_id || cur->count >= LENT_QUANTUM;

  if (need_new)
    {
      /* Trick: the pool contains nothing but lentries, so the allocation is always aligned */
      lentry *fresh = mp_alloc_fast_noalign(word_pool, sizeof(lentry));
      if (!cur)
	*used_words_last++ = v;
      fresh->next = cur;
      fresh->id = card_id;
      fresh->count = 0;
      v->u.first_lent = fresh;
      lentry_count++;
      cur = fresh;
    }
  cur->w[cur->count++] = pack;
  word_cnt++;
}

/*
 *  Lexmap callback: translate a word to its lexicon entry.
 *
 *  orig_class	class assigned to the word by the caller; anything but WC_NORMAL is returned unchanged
 *  uni, ulen	the word and its length, passed to lh_lookup()
 *  idp		receives the lexicon entry when the word is found
 *
 *  Returns the class stored in the lowest 3 bits of the entry's ID, or
 *  WC_IGNORED (leaving *idp untouched) for words missing from the lexicon.
 */
static enum word_class
lm_lookup(enum word_class orig_class, word *uni, uns ulen, word_id_t *idp)
{
  struct verbum *v;

  if (orig_class != WC_NORMAL)
    return orig_class;
  v = lh_lookup(uni, ulen);
  /* Presumably lh_lookup() returns NULL for unknown words; never dereference it then */
  if (!v)
    return WC_IGNORED;
  *idp = v;
  return v->id & 7;
}

/*
 *  Lexmap callback: a word w of category cat has been found at position pos.
 *
 *  When meta information is being indexed (meta_static_part is non-zero),
 *  only positions below 0x200 are recorded, packed together with the static
 *  part, and words of category MT_TITLE set has_title.
 *
 *  Otherwise the category is remapped through translate_type[] and the
 *  reference is packed as (category << 12) | position. All positions from
 *  phrase_limit up are stored as 4095 and recorded only once per card.
 */
static void
lm_got_word(uns pos, uns cat, word_id_t w)
{
  if (meta_static_part)
    {
      if (pos < 0x200)
	word_add(w, meta_static_part | (pos << 2));
#ifdef	MT_TITLE
      if (cat == MT_TITLE)
	has_title = 1;
#endif
      return;
    }
  cat = translate_type[cat];
  if (pos >= phrase_limit)
    {
      if (!word_check(w, (cat << 12) | 4095))
	word_add(w, (cat << 12) | 4095);
    }
  else
    word_add(w, (cat << 12) | pos);
}

/*
 *  Lexmap callback for complex words. With CONFIG_CONTEXTS, the verbum is
 *  picked from the array belonging to the context class of root, indexed by
 *  the context class of w; a non-zero dir selects the second half of the
 *  array. The result is then handled like an ordinary word. Without
 *  CONFIG_CONTEXTS, complex words are ignored.
 */
#ifdef CONFIG_CONTEXTS
static inline void
lm_got_complex(uns pos, uns cat, word_id_t root, word_id_t w, uns dir)
{
  struct verbum *v = context_words[root->context_class] + w->context_class + (dir ? lex_context_slots : 0);
  lm_got_word(pos, cat, v);
}
#else
static inline void
lm_got_complex(uns pos UNUSED, uns cat UNUSED, word_id_t root UNUSED, word_id_t w UNUSED, uns dir UNUSED)
{
}
#endif

#include "indexer/lexmap.h"

/* A single piece of meta information waiting for indexing */
struct meta_info {
  struct meta_info *next;
  uns static_part;			/* Constant bits of the packed reference (see word_meta_add()) */
  byte *text;				/* The text, starting with its type byte */
};
/* Singly linked lists of meta information by type: first and last item of each list */
struct meta_info *meta_first[16], *meta_last[16];

/*
 *  Fill in the meta information record m for text t and append it to the
 *  list of its type. The text may start with a digit '0' to '3', which is
 *  stored in the lowest bits of the static part (0 if missing); it must
 *  continue with a byte 0x90..0x9f whose low 4 bits give the type.
 */
static inline void
word_meta_add(byte *t, struct meta_info *m)
{
  uns prefix = 0;

  if (*t >= '0' && *t <= '3')
    prefix = *t++ - '0';
  m->static_part = prefix;
  ASSERT(*t >= 0x90 && *t < 0xa0);

  uns type = *t & 0x0f;
  m->static_part |= 0x8000 | (type << 11);
  if (!meta_first[type])
    meta_first[type] = m;
  else
    meta_last[type]->next = m;
  meta_last[type] = m;
  m->next = NULL;
  m->text = t;
}

/*
 *  Index meta information of a card. It gathers the 'M' attributes of all
 *  headers, of their redirects and of the object o itself, plus the texts of
 *  the 'x' attributes of o, sorts them to lists by type and then maps each
 *  permitted type as a separate document. Cards flagged CARD_NOTE_GIANT have
 *  the types in giant_ban_meta excluded. Also recalculates has_title.
 *
 *  The records are allocated by alloca(), so they live until this function returns.
 */
static void
word_meta(struct odes *o, struct card_hdr *hdr, struct card_note *note)
{
  bzero(meta_first, sizeof(meta_first));
  for(struct card_hdr *h=hdr; h; h=h->next)
  {
    for (struct oattr *a=obj_find_attr(h->odes, 'M'); a; a=a->same)
      word_meta_add(a->val, alloca(sizeof(struct meta_info)));
    for (struct card_hdr *r=h->redirects; r; r=r->next)
      for (struct oattr *a=obj_find_attr(r->odes, 'M'); a; a=a->same)
	word_meta_add(a->val, alloca(sizeof(struct meta_info)));
  }
  for(struct oattr *a=obj_find_attr(o, 'M'); a; a=a->same)
    word_meta_add(a->val, alloca(sizeof(struct meta_info)));
  for(struct oattr *a=obj_find_attr(o, 'x'); a; a=a->same)
    {
      /* Skip the first three space-terminated fields, the rest is the text */
      byte *c = a->val;
      for (uns i=0; i<3; i++)
	while (*c++ != ' ')
	  ;
      word_meta_add(c, alloca(sizeof(struct meta_info)));
    }

  uns permit_types = (note->flags & CARD_NOTE_GIANT) ? ~giant_ban_meta : ~0U;
  has_title = 0;
  for (uns type=0; type<16; type++)
    if (meta_first[type] && (permit_types & (1 << type)))
      {
	lm_doc_start();
	for (struct meta_info *m=meta_first[type]; m; m=m->next)
	  {
	    /* A non-zero meta_static_part switches lm_got_word() to the meta mode */
	    meta_static_part = m->static_part;
	    lm_map_text(m->text, m->text + str_len(m->text));
	  }
      }
}

/*
 *  Index all words of a card: first the preprocessed text in doc_buf,
 *  then the meta information. The word buffer is flushed only here, after
 *  the whole card has been processed, when it has reached lentry_limit.
 */
static void
word_card(struct odes *o, struct card_hdr *hdr, struct card_note *note)
{
  lm_doc_start();
  meta_static_part = 0;
  lm_map_text(doc_buf, doc_buf + doc_length);
  word_meta(o, hdr, note);
  if (lentry_count >= lentry_limit)
    word_flush();
}

/*
 *  Initialize word indexing: set up the lexicon hash and the lexical mapper,
 *  open the word index for writing, load the lexicon and allocate the
 *  memory pool for lentries and the array of used words.
 */
static void
word_init(void)
{
  lh_init();
  lm_init();
  word_index = index_bopen(fn_word_index, O_WRONLY | O_CREAT | O_TRUNC);
  lex_load();
  word_pool = mp_new(sizeof(lentry) * LENT_BITE);
  lentry_limit = word_buf_size / (sizeof(lentry) + sizeof(struct lentry *));
  /*
   * The used_words array is large, but most of it will be probably left unmapped.
   * Unfortunately, we don't have a better bound since lentry_count >= lentry_limit
   * testing is deferred to card end (which is faster and it also guarantees that no
   * ref chain for a (word,card) pair will be split, which is required by wsort).
   */
  used_words_last = used_words = xmalloc(sizeof(struct lentry *) * (numerus_verba + 1));
  DBG("Allocated word pool, lentry_limit=%d", lentry_limit);
}

/* Flush the last run of the word index, close the file and log the statistics */
static void
word_end(void)
{
  word_flush();
  bclose(word_index);
  log(L_INFO, "Generated %Ld word refs in %d runs", word_cnt, word_runs);
  DBG("Used %Ld entries, that is %Ld bytes", word_entries, word_entries * sizeof(lentry));
}

/*
 *  Processing of cards
 */

/* Output file with one struct card_attr per card */
static struct fastbuf *card_attrs;

/*
 *  Select the bucket formats, open the card file and the card attribute file
 *  for writing and write an all-zero attribute record for document 0.
 */
static void
cards_init(void)
{
  struct card_attr a;

  put_attr_set_type(BUCKET_TYPE_V33);
  lizard_set_type(BUCKET_TYPE_V33_LIZARD, min_compression / 100.);
  cards_out = index_bopen(fn_cards, O_WRONLY | O_CREAT | O_TRUNC);
  card_attrs = index_bopen(fn_card_attrs, O_WRONLY | O_CREAT | O_TRUNC);
  bzero(&a, sizeof(a));			/* Create dummy document 0 to make all read ID's be >0 */
  bwrite(card_attrs, &a, sizeof(a));
  card_id++;
}

/*
 *  Finish the previous card and start a new one. The contents of cards_mem
 *  gathered for the previous card (if any) are copied, possibly compressed,
 *  to cards_out; before the very first card, a header consisting of
 *  BUCKET_TYPE_V33 and a zero is written instead. Then a fresh cards_mem is
 *  created, the output is padded by zeros to a multiple of
 *  (1 << CARD_POS_SHIFT) and the resulting position, shifted right by
 *  CARD_POS_SHIFT, is stored to attr->card.
 *  Dies if the shifted position doesn't fit in 32 bits.
 */
static void
card_write_start(struct card_attr *attr)
{
  if (cards_mem)
    {
      uns len_in = btell(cards_mem);
      struct fastbuf *cards_in = fbmem_clone_read(cards_mem);
      uns type = lizard_bbcopy_compress(cards_out, cards_in, len_in);
      if (type == BUCKET_TYPE_V33_LIZARD)
	cards_compr++;
      else
	cards_uncompr++;
      total_uncompr += len_in;
      bclose(cards_in);
      bclose(cards_mem);
    }
  else
    {
      bputl(cards_out, BUCKET_TYPE_V33);
      bputl(cards_out, 0);
    }
  /* NOTE(review): with the default excerpt_max of ~0 the product wraps around in unsigned arithmetic -- confirm this is acceptable for fbmem_create() */
  cards_mem = fbmem_create(2 * excerpt_max);

  uns align = (1 << CARD_POS_SHIFT) - 1;
  sh_off_t pos = btell(cards_out);
  while (pos & align)
    {
      bputc(cards_out, 0);
      pos++;
    }
  if ((u64)(pos >> CARD_POS_SHIFT) >= 0xffffffff)
    die("Card file too large. You need to increase CARD_POS_SHIFT in sherlock/index.h.");
  attr->card = pos >> CARD_POS_SHIFT;
}

/*
 *  Finish the card file: write out the last card, append a terminating
 *  attribute record pointing to the end of the card file, remember the
 *  total size of the card file for the statistics and close both files.
 */
static void
cards_end(void)
{
  struct card_attr a;

  bzero(&a, sizeof(a));			/* Append fake attribute marking end of card file */
  card_write_start(&a);
  bwrite(card_attrs, &a, sizeof(a));
  /* card_write_start() has created a new buffer which will never be used */
  if (cards_mem)
    bclose(cards_mem);
  total_compr = btell(cards_out);
  bclose(cards_out);
  bclose(card_attrs);
  log(L_INFO, "Generated %d cards", card_id);
}

static void
probe_content_type(struct odes *o)
{
  byte *ctype = obj_find_aval(o, 'T');
  if (is_hypertext || !ctype)
    return;
  struct hypertext_type *ht;
  WALK_LIST(ht, hypertext_types)
    if (!strcmp(ctype, ht->content_type))
    {
      is_hypertext = 1;
      break;
    }
}

/*
 *  Write the reference texts (attributes 'x' of object o) to the card.
 *  The value is expected to consist of a first field, two numbers and
 *  a text, separated by spaces. Each reference text becomes a nested "(x"
 *  block with the text as 'M', the first field as 'z' and the two numbers
 *  as 'W'.
 *
 *  The attribute value is modified in place: the first space is replaced
 *  by a zero byte and never restored.
 */
static void
card_dump_reftexts(struct odes *o)
{
  for (struct oattr *a=obj_find_attr(o, 'x'); a; a=a->same)
  {
    byte *c = strchr(a->val, ' ');
    /* A value without any space is malformed; fail cleanly instead of writing through NULL */
    ASSERT(c);
    *c++ = 0;
    uns wt, cnt, rd, len;
    rd = sscanf(c, "%d %d %n", &wt, &cnt, &len);
    ASSERT(rd == 2);
    c += len;
    bput_attr(cards_mem, '(', "x", 1);
    bput_attr_str(cards_mem, 'M', c);
    bput_attr_str(cards_mem, 'z', a->val);
    bput_attr_format(cards_mem, 'W', "x%d %d", wt, cnt);
    bput_attr(cards_mem, ')', "", 0);
  }
}

/*
 *  Generate the contents of a card to cards_mem: nested blocks with all
 *  headers and their redirects, the attributes of object o listed in
 *  card_attr_list, notes on the evolution of the weight, the excerpt of the
 *  document text and the reference texts.
 *
 *  As a side effect, it recalculates is_hypertext and is_in_catalog and
 *  replaces attr->weight by the penalized weight.
 */
static void
card_card(struct odes *o, struct card_hdr *hdr, struct card_attr *attr, struct card_note *note, int bonus)
{
  /* First of all, dump all headers */
  is_hypertext = is_in_catalog = 0;
  for (struct card_hdr *h=hdr; h; h=h->next)
    {
      bput_attr(cards_mem, '(', "U", 1);
      {
	/*
	 * XXX: We do this only for compatibility with old code which
	 * didn't know anything about nesting and expected per-URL
	 * attributes to follow the URL.
	 */
	obj_move_attr_to_head(h->odes, 'U');
      }
      obj_write_nocheck(cards_mem, h->odes);
      probe_content_type(h->odes);
      /* A 'K' attribute in any header or redirect marks the card as being in the catalog */
      if (!is_in_catalog && obj_find_aval(h->odes, 'K'))
	is_in_catalog = 1;
      for (struct card_hdr *r=h->redirects; r; r=r->next)
      {
	bput_attr(cards_mem, '(', "y", 1);
	obj_move_attr_to_head(r->odes, 'y');
	obj_write_nocheck(cards_mem, r->odes);
	if (!is_in_catalog && obj_find_aval(r->odes, 'K'))
	  is_in_catalog = 1;
	bput_attr(cards_mem, ')', "", 0);
      }
      bput_attr(cards_mem, ')', "", 0);
    }

  /* "N" attributes must go last (needed e.g. by the multiplexer) */
  obj_move_attr_to_tail(o, 'N');

  /* Then dump all other attributes */
  for (struct oattr *at=o->attrs; at; at=at->next)
    if (strchr(card_attr_list, at->attr))
      for (struct oattr *b=at; b; b=b->same)
	bput_attr_str(cards_mem, at->attr, b->val);

  /* Then take a note on evolution of the weight */
  bput_attr_format(cards_mem, 'W', "s%d", note->weight_scanner);
  bput_attr_format(cards_mem, 'W', "m%d", note->weight_merged);

  /* Perform final penalization and dump the penalized weight */
  penalize_card(attr, note, bonus, !!obj_find_attr(o, 'N'));
  bput_attr_format(cards_mem, 'W', "p%d", attr->weight);

  /* Document contents, but limited to the useful part */
  uns l = excerpt_max;
  if (l && doc_length)
  {
    if (l < doc_length)
    {
      /* Prefer cutting at a space, if there is one within the next 256 bytes */
      uns i, maxl=MIN(l+256, doc_length);
      for (i=l; i<maxl; i++)
	if (doc_buf[i] == ' ')
	  break;
      if (i < maxl)
	l = i;
    }
    else
      l = doc_length;
    bput_attr(cards_mem, 'X', doc_buf, l);
  }

  /* And, finally, reftexts */
  card_dump_reftexts(o);
}

/*
 *  Processing of card fingerprints
 */

/* Output file with card fingerprints; prints_card() expects that it may be NULL */
static struct fastbuf *card_prints;

/* Open the file of card fingerprints for writing by index_maybe_bopen() */
static void
prints_init(void)
{
  card_prints = index_maybe_bopen(fn_card_prints, O_WRONLY | O_CREAT | O_TRUNC);
}

/*
 *  Close the file of card fingerprints. The file may be NULL (prints_card()
 *  checks for that as well), in which case there is nothing to close.
 */
static void
prints_end(void)
{
  if (card_prints)
    bclose(card_prints);
}

/* Write a record pairing the fingerprint of the given URL with the ID of the current card */
static void
prints_add(byte *url)
{
  struct card_print e;
  fingerprint(url, &e.fp);
  e.cardid = card_id;
  bwrite(card_prints, &e, sizeof(e));
}

/*
 *  Record fingerprints of all URLs leading to the current card: the 'U'
 *  attribute of each header in the hdr chain and the 'y' attribute of each
 *  of its redirects. Does nothing when the fingerprint file is not open.
 */
static void
prints_card(struct card_hdr *hdr)
{
  if (!card_prints)
    return;

  while (hdr)
    {
      byte *url;
      if (url = obj_find_aval(hdr->odes, 'U'))
	prints_add(url);
      /* The redirect URL lives in the redirect's own object, not in the header's one */
      for (struct card_hdr *r = hdr->redirects; r; r=r->next)
	if (url = obj_find_aval(r->odes, 'y'))
	  prints_add(url);
      hdr = hdr->next;
    }
}

/*
 *  Main loop
 */

/*
 *  Process a single card; passed to fetch_cards() as a callback. The card goes
 *  through all stages in turn (preprocessing and swindler detection, strings,
 *  fingerprints, words, card contents), with the profiling timers switched
 *  at each stage boundary. Finally the attribute record is written and
 *  card_id is advanced.
 */
static void
chew_card(struct card_attr *attr, struct odes *o, struct card_hdr *hdr, struct card_note *note, int bonus)
{
  /* Writes out the previous card and records the position of this one in attr */
  card_write_start(attr);
  prof_switch(&pr_fetch, &pr_preproc);
  preproc_card(o);
  detect_swindler();
  prof_switch(&pr_preproc, &pr_strings);
  string_card(o, hdr);
  prof_switch(&pr_strings, &pr_prints);
  prints_card(hdr);
  prof_switch(&pr_prints, &pr_words);
  word_card(o, hdr, note);
  prof_switch(&pr_words, &pr_cards);
  card_card(o, hdr, attr, note, bonus);
  prof_switch(&pr_cards, &pr_fetch);
  bwrite(card_attrs, attr, sizeof(*attr));
  card_id++;
  PROGRESS(card_id, "chewer: %d cards", card_id);
}

int
main(int argc, char **argv)
{
  log_init(argv[0]);
  setproctitle_init(argc, argv);
  if (cf_getopt(argc, argv, CF_SHORT_OPTS, CF_NO_LONG_OPTS, NULL) >= 0 ||
      optind < argc)
  {
    fputs("This program supports only the following command-line arguments:\n" CF_USAGE, stderr);
    exit(1);
  }

  cards_init();
  preproc_init();
  word_init();
  string_init();
  prints_init();

  prof_init(&pr_fetch);
  prof_init(&pr_strings);
  prof_init(&pr_words);
  prof_init(&pr_prints);
  prof_init(&pr_cards);

  prof_start(&pr_fetch);
  log(L_INFO, "Chewing cards and creating indices");
  fetch_cards(chew_card);
  prof_stop(&pr_fetch);

  cards_end();
  preproc_end();
  word_end();
  string_end();
  prints_end();

#ifdef PROFILER
  log(L_DEBUG, "Profile: fetch %s, preproc %s, strings %s, words %s, prints %s, cards %s", PROF_STR(pr_fetch),
      PROF_STR(pr_preproc), PROF_STR(pr_strings), PROF_STR(pr_words), PROF_STR(pr_prints), PROF_STR(pr_cards));
#endif

  return 0;
}
