/*
 *	Sherlock Indexer -- Lexicon Builder
 *
 *	(c) 2001--2003 Martin Mares <mj@ucw.cz>
 *	(c) 2001--2004 Robert Spalek <robert@ucw.cz>
 */

#include "sherlock/sherlock.h"
#include "lib/conf.h"
#include "lib/unaligned.h"
#include "lib/fastbuf.h"
#include "lib/hashfunc.h"
#include "sherlock/object.h"
#include "sherlock/tagged-text.h"
#include "lib/unicode.h"
#include "charset/unicat.h"
#include "indexer/indexer.h"
#include "indexer/lexicon.h"
#include "indexer/params.h"

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <fcntl.h>

#undef PROFILE_TSC
#include "lib/profile.h"
/*
 * Profiling counters used in this file:
 *   pr_fetch -- started in main() around fetch_cards(), resumed when mklex_card() returns
 *   pr_map   -- active while mklex_card() maps the texts of a card
 *   pr_lex   -- active inside lm_lookup() around the lexicon hash lookup
 *   pr_comp  -- only initialized and reported here; presumably switched
 *               by code in indexer/lexmap.h (TODO confirm)
 */
static prof_t pr_fetch, pr_map, pr_lex, pr_comp;

#define LH_MKLEX
#include "indexer/lexhash.h"

/*
 * Look up a word in the lexicon hash and account for one more occurrence.
 *
 * Words whose class is anything other than WC_NORMAL are passed through:
 * the original class is returned and *idp is left untouched. For normal
 * words, the entry returned by lh_lookup() gets its counter incremented,
 * is stored to *idp, and the low three bits of its id are returned as
 * the resulting class.
 *
 * NOTE(review): the result of lh_lookup() is dereferenced without a check,
 * so it is assumed never to return NULL here -- confirm against lexhash.h.
 * Presumably this is a hook called from indexer/lexmap.h -- confirm.
 */
static enum word_class
lm_lookup(enum word_class orig_class, word *uni, uns ulen, word_id_t *idp)
{
  if (orig_class == WC_NORMAL)
    {
      /* Charge the time spent in the hash table to pr_lex instead of pr_map. */
      prof_switch(&pr_map, &pr_lex);
      struct verbum *entry = lh_lookup(uni, ulen);
      entry->u.count++;
      *idp = entry;
      prof_switch(&pr_lex, &pr_map);
      return entry->id & 7;
    }

  return orig_class;
}

/*
 * Intentionally empty: every argument is marked UNUSED and nothing is done.
 * Presumably a per-word callback required by indexer/lexmap.h (included
 * below) for which the lexicon builder has no use -- TODO confirm.
 */
static inline void
lm_got_word(uns pos UNUSED, uns cat UNUSED, word_id_t w UNUSED)
{
}

/*
 * Intentionally empty: every argument is marked UNUSED and nothing is done.
 * Presumably a callback for complex (root + context) words required by
 * indexer/lexmap.h (included below) and ignored by the lexicon builder --
 * TODO confirm.
 */
static inline void
lm_got_complex(uns pos UNUSED, uns cat UNUSED, word_id_t wroot UNUSED, word_id_t wcont UNUSED, uns dir UNUSED)
{
}

#include "indexer/lexmap.h"

/*
 * Map a single zero-terminated meta text as a document of its own:
 * lm_doc_start() is called and then the whole string is handed to
 * lm_map_text(). If the string starts with a character in '0'..'3',
 * that one character is skipped first (presumably a one-digit type
 * prefix -- confirm against the producer of these attributes).
 */
static inline void
mklex_meta(byte *x)
{
  byte *text = x;

  lm_doc_start();
  if ('0' <= *text && *text <= '3')
    text++;

  byte *end = text + str_len(text);
  lm_map_text(text, end);
}

/*
 * Pass the value of every 'M' attribute of object o to mklex_meta(),
 * walking the chain of attributes linked through `same'.
 */
static void
mklex_metas(struct odes *o)
{
  struct oattr *meta = obj_find_attr(o, 'M');

  while (meta)
    {
      mklex_meta(meta->val);
      meta = meta->same;
    }
}

/*
 * Process every 'x' attribute of object o: skip the first three
 * space-terminated fields of its value and hand the remainder to
 * mklex_meta().
 *
 * The scan for the three separators stops at the terminating zero byte.
 * A value containing fewer than three spaces is considered malformed and
 * is skipped, instead of running past the end of the string while looking
 * for a space that is not there.
 */
static void
mklex_reftexts(struct odes *o)
{
  for (struct oattr *a=obj_find_attr(o, 'x'); a; a=a->same)
    {
      byte *t = a->val;
      uns skipped = 0;

      /* Advance just past the third space, but never beyond the terminator. */
      while (skipped < 3 && *t)
        if (*t++ == ' ')
          skipped++;
      if (skipped < 3)
        continue;
      mklex_meta(t);
    }
}

/*
 * Per-card callback handed to fetch_cards() by main().
 *
 * Feeds all texts of one card to the lexical mapper:
 *   - every 'X' attribute of the card object o,
 *   - the 'M' attributes of each header in the hdr list and of each of
 *     its redirects,
 *   - the 'M' and 'x' attributes of the card object itself.
 *
 * Time spent here is accounted to pr_map; pr_fetch is resumed on return.
 * A static counter of processed cards drives the PROGRESS report.
 * The attr, note and bonus arguments are not used.
 */
static void
mklex_card(struct card_attr *attr UNUSED, struct odes *o, struct card_hdr *hdr, struct card_note *note UNUSED, int bonus UNUSED)
{
  static uns ccnt;
  struct oattr *text;
  struct card_hdr *h, *r;

  prof_switch(&pr_fetch, &pr_map);

  /* Texts stored in 'X' attributes of the card itself */
  lm_doc_start();
  text = obj_find_attr(o, 'X');
  while (text)
    {
      lm_map_text(text->val, text->val + str_len(text->val));
      text = text->same;
    }

  /* Meta texts of all headers and of everything redirected to them */
  for (h = hdr; h; h = h->next)
    {
      mklex_metas(h->odes);
      for (r = h->redirects; r; r = r->next)
        mklex_metas(r->odes);
    }

  /* Meta texts and reference texts of the card itself */
  mklex_metas(o);
  mklex_reftexts(o);

  prof_switch(&pr_map, &pr_fetch);

  ccnt++;
  PROGRESS(ccnt, "mklex: %d cards, %d words in %d buckets",
           ccnt, lh_hash_count, lh_hash_size);
}

/*
 * Write one lexicon entry to fastbuf b: its id and occurrence count
 * (both via bputl), a zero context (bput_context), the length of the
 * word and finally the bytes of the word without the terminating zero.
 *
 * NOTE(review): the length is emitted with a single bputc(), so words are
 * presumably limited to a length that fits one byte -- confirm.
 */
static inline void
lex_write_verbum(struct fastbuf *b, struct verbum *l)
{
  uns len = str_len(l->word);

  bputl(b, l->id);
  bputl(b, l->u.count);
  bput_context(b, 0);
  bputc(b, len);
  bwrite(b, l->word, len);
}

/*
 * Dump the lexicon to file `name' (created or truncated): first the
 * number of entries (lh_hash_count), then one record per entry visited
 * by LH_WALK, each written by lex_write_verbum().
 */
static void
lex_write(byte *name)
{
  struct fastbuf *out = bopen(name, O_WRONLY | O_CREAT | O_TRUNC, indexer_fb_size);

  bputl(out, lh_hash_count);
  LH_WALK(l)
    lex_write_verbum(out, l);

  bclose(out);
}

/*
 * Store the current lexicon configuration in the index parameters file:
 * the whole struct index_params is read from fn_parameters, its lex_config
 * member is overwritten with lexicon_config and the structure is written
 * back from the start of the same file.
 */
static void
write_params(void)
{
  struct fastbuf *fb = index_bopen(fn_parameters, O_RDWR);
  struct index_params params;

  /* Fetch the existing record and patch in our part of it */
  breadb(fb, &params, sizeof(params));
  memcpy(&params.lex_config, &lexicon_config, sizeof(lexicon_config));

  /* Overwrite the record in place */
  brewind(fb);
  bwrite(fb, &params, sizeof(params));
  bclose(fb);
}

/*
 * Entry point of the lexicon builder.
 *
 * Parses the command line (a non-negative result of cf_getopt() or any
 * leftover argument is reported as a usage error with exit code 1),
 * initializes the lexical mapper and the lexicon hash, records the lexicon
 * configuration in the index parameters, runs mklex_card() over all cards
 * via fetch_cards() and finally writes the raw lexicon to fn_lex_raw.
 */
int
main(int argc, char **argv)
{
  int bad_usage;

  log_init(argv[0]);
  setproctitle_init(argc, argv);

  bad_usage = cf_getopt(argc, argv, CF_SHORT_OPTS, CF_NO_LONG_OPTS, NULL) >= 0;
  if (!bad_usage)
    bad_usage = optind < argc;
  if (bad_usage)
    {
      fputs("This program supports only the following command-line arguments:\n" CF_USAGE, stderr);
      exit(1);
    }

  /* Set up the mapper and the hash table, then record our configuration */
  lm_init();
  lh_init();
  write_params();

  prof_init(&pr_fetch);
  prof_init(&pr_map);
  prof_init(&pr_lex);
  prof_init(&pr_comp);

  /* Everything not switched elsewhere by mklex_card() is charged to pr_fetch */
  prof_start(&pr_fetch);
  log(L_INFO, "Creating lexicon");
  fetch_cards(mklex_card);
  prof_stop(&pr_fetch);

#ifdef PROFILER
  log(L_DEBUG, "Profile: fetch %s, map %s, lex %s, comp %s", PROF_STR(pr_fetch), PROF_STR(pr_map), PROF_STR(pr_lex), PROF_STR(pr_comp));
#endif

  lex_write(index_name(fn_lex_raw));
  log(L_INFO, "Built lexicon with %d words", lh_hash_count);

  return 0;
}
