/*
 *	Sherlock SubStrings Analyser -- Substring Search
 *
 *	(c) 2006 Pavel Charvat <pchar@ucw.cz>
 */

#undef LOCAL_DEBUG

#include "sherlock/sherlock.h"
#include "sherlock/object.h"
#include "analyser/analyser.h"
#include "lib/conf.h"
#include "lib/mempool.h"
#include "lib/fastbuf.h"
#include "lib/chartype.h"
#include "sherlock/tagged-text.h"
#include "sherlock/conf-parse.h"
#include "charset/unicat.h"

#include <string.h>

/*
 * Build-time parameters for the KMP automaton template that is included
 * from lib/kmp.h further below.
 *
 * NOTE(review): the precise meaning of each KMP_* knob is defined by
 * lib/kmp.h, which is not visible from this file; the remarks here only
 * describe what the definitions themselves contain.
 *
 *  - Generated identifiers get the kmp_ prefix (this file later uses
 *    kmp_init, kmp_add, kmp_build, kmp_search, struct kmp_struct and
 *    struct kmp_search).
 *  - Every automaton state carries an extra u32 `mask' and kmp_add() takes
 *    an extra `uns mask' argument.
 *  - When a string is added, its mask is OR-ed into the state's mask; the
 *    same action is used for both KMP_ADD_NEW and KMP_ADD_DUP, so adding
 *    the same string twice merges the masks.
 *  - KMP_BUILD_STATE additionally ORs in the mask of s->next when that
 *    link is set (presumably the fallback state -- confirm in lib/kmp.h).
 *
 * The debug message prints the mask with an unsigned hexadecimal conversion:
 * `mask' is an `uns', so the former "%d" did not match the argument type,
 * and hexadecimal agrees with how the mask is reported elsewhere in this
 * file.
 */
#define KMP_PREFIX(x) kmp_##x
#define KMP_CHAR u16
#define KMP_STATE_VARS u32 mask;
#define KMP_ADD_EXTRA_ARGS uns mask
#define KMP_ADD_INIT(kmp,src) DBG("Adding string %s with mask 0x%x", src, mask)
#define KMP_ADD_NEW(kmp,src,s) s->u.mask |= mask
#define KMP_ADD_DUP KMP_ADD_NEW
#define KMP_BUILD_STATE(kmp,s) do{ if (s->next) s->u.mask |= s->next->u.mask; }while(0)
#define KMP_CONTROL_CHAR ':'
#define KMP_USE_UTF8
#define KMP_TOLOWER
#define KMP_USE_POOL cf_pool
#define KMP_WANT_SEARCH
/*
 * Search-time parameters for the KMP template.
 *
 * Each search context carries two extra fields:
 *   phrase -- number of synthetic characters still to be emitted before
 *             more input is read (see KMPS_GET_CHAR)
 *   mask   -- OR of the masks of all automaton states visited so far
 *             (see KMPS_STEP)
 * The search input is a struct fastbuf *.
 */
#define KMPS_VARS uns phrase; uns mask;
#define KMPS_SOURCE struct fastbuf *
/*
 * Synthetic characters emitted after a sentence-break code. The table is
 * indexed by the pre-decremented `phrase' counter, so with phrase == 2 the
 * characters come out in reverse order: '.' first, then ':'.
 */
static const KMP_CHAR kmp_sentence_break[] = { ':', '.' };
/*
 * Fetch the next character for the automaton into s->c.
 * Evaluates to 1 when a character was produced and to 0 at end of input.
 *
 *  - While s->u.phrase is non-zero, no input is read; the next pending
 *    character from kmp_sentence_break[] is produced instead.
 *  - Otherwise one code is read with bget_tagged_char():
 *      * EOF ends the search;
 *      * codes >= 0x80000000 are replaced by ':' (the KMP_CONTROL_CHAR
 *        defined above); if additionally (w & 0xf0) == 0x90, two more
 *        synthetic characters are scheduled, so the automaton sees
 *        ':' '.' ':' in total.
 *        NOTE(review): these codes are presumably the tagged-text control
 *        values, with 0x90 marking a sentence break -- confirm against
 *        sherlock/tagged-text.h;
 *      * any other code is lower-cased with Utolower() and, unless
 *        Ualpha() accepts it, replaced by ':'.
 */
#define KMPS_GET_CHAR(kmp,src,s) ({				\
  int result = 1;						\
  if (s->u.phrase)						\
    s->c = kmp_sentence_break[--s->u.phrase];			\
  else								\
    {								\
      uns w = bget_tagged_char(src);				\
      if ((int)w == EOF)					\
        result = 0;						\
      else if (w >= 0x80000000)					\
        {							\
          if ((w & 0xf0) == 0x90)				\
            s->u.phrase = 2;					\
          s->c = ':';						\
        }							\
      else							\
        {							\
          w = Utolower(w);					\
          if (!Ualpha(w))					\
            w = ':';						\
          s->c = w;						\
        }							\
      }								\
    result; })
/* A search starts with no synthetic characters pending; the mask is NOT reset here. */
#define KMPS_INIT(kmp,src,s) s->u.phrase = 0
/* After every step, accumulate the mask of the current automaton state. */
#define KMPS_STEP(kmp,src,s) s->u.mask |= s->s->u.mask
/*
 * NOTE(review): presumably these make the search feed control characters at
 * the input boundaries and collapse runs of them -- confirm in lib/kmp.h.
 */
#define KMPS_ADD_CONTROLS
#define KMPS_MERGE_CONTROLS
#include "lib/kmp.h"

/* Object attribute that receives the result; 0 is treated as "not configured". */
static uns substr_attr;
/* Mask copied into each string item at the moment the item is created (SubStrings.Mask). */
static uns substr_mask = 1;
/* List of struct string_item filled from the SubStrings.Strings configuration list. */
static clist strings;
/* Search automaton built by an_substr_init(); NULL until built and after each config commit. */
static struct kmp_struct *kmp;

/* One configured substring together with the mask bits it contributes when found. */
struct string_item {
  cnode n;		/* Linkage in the `strings' list */
  byte *str;		/* The substring to search for; NULL until set by the "String" item */
  uns mask;		/* Bits OR-ed into the result when the substring occurs */
};

/*
 * Init hook of the string item section: gives a new item its defaults.
 *
 * The item starts without a string and takes over the value that
 * substr_mask holds at the time of the call.
 *
 * Always returns NULL (presumably "no error" for the configuration
 * parser -- confirm against lib/conf.h).
 */
static byte *
string_item_init(struct string_item *p)
{
  p->mask = substr_mask;
  p->str = NULL;

  return NULL;
}

/*
 * Commit hook of the SubStrings section.
 *
 * Drops the reference to the current automaton so that the next call of
 * an_substr_init() builds a new one from the current string list. The
 * variable is passed to CF_JOURNAL_VAR before it is overwritten
 * (presumably so that the configuration journal can restore the old
 * pointer on rollback -- confirm against lib/conf.h).
 *
 * Always returns NULL.
 */
static byte *
substr_commit(void *p UNUSED)
{
  CF_JOURNAL_VAR(kmp);
  kmp = NULL;
  return NULL;
}

/*
 * Configuration layout of one entry of the "Strings" list.
 * Only the string itself is configurable per entry; the mask is assigned
 * by string_item_init() from the current value of substr_mask.
 */
static struct cf_section string_item_config = {
  CF_TYPE(struct string_item),
  CF_INIT(string_item_init),
  CF_ITEMS {
    CF_STRING("String", PTR_TO(struct string_item, str)),
    CF_END
  }
};

/*
 * Configuration layout of the SubStrings section:
 *   Attr    -- attribute that receives the result (stored in substr_attr)
 *   Mask    -- mask given to string items created afterwards (substr_mask)
 *   Strings -- list of string items (see string_item_config)
 * substr_commit() runs as the section's commit hook.
 */
static struct cf_section an_substr_config = {
  CF_COMMIT(substr_commit),
  CF_ITEMS {
    CF_USER("Attr", &substr_attr, &cf_type_attr),
    CF_UNS("Mask", &substr_mask),
    CF_LIST("Strings", &strings, &string_item_config),
    CF_END
  }
};

/* Registers the "SubStrings" configuration section; marked CONSTRUCTOR, so no explicit call appears in this file. */
static void CONSTRUCTOR
an_substr_config_init(void)
{
  cf_declare_section("SubStrings", &an_substr_config, 0);
}

/*
 * Analyser init hook: builds the search automaton from the configured
 * strings.
 *
 * Does nothing when an automaton already exists or when no strings are
 * configured. Otherwise SubStrings.Attr must be set; the process dies if
 * it is not. List entries with a zero mask, a missing string or an empty
 * string are not added to the automaton.
 */
static void
an_substr_init(struct an_hook *h UNUSED)
{
  if (kmp)
    return;
  if (clist_empty(&strings))
    return;

  if (!substr_attr)
    die("Undefined SubStrings.Attr");

  kmp = cf_malloc_zero(sizeof(*kmp));
  kmp_init(kmp);

  CLIST_FOR_EACH(struct string_item *, item, strings)
    {
      if (item->str && *item->str && item->mask)
        kmp_add(kmp, item->str, item->mask);
    }

  DBG("Building a-substr KMP structure");
  kmp_build(kmp);
}

/*
 * Analyser "need" hook: reports whether the object still lacks the result
 * attribute.
 *
 * Returns non-zero when the attribute substr_attr is not present in
 * ai->obj, zero otherwise.
 *
 * Also returns zero when no result attribute is configured at all.
 * an_substr_init() validates substr_attr only if some strings are
 * configured, so with an empty string list this hook could otherwise be
 * reached with substr_attr == 0 and cause an attribute with the undefined
 * id 0 to be looked up and subsequently written.
 */
static int
an_substr_need(struct an_hook *h UNUSED, struct an_iface *ai)
{
  if (!substr_attr)
    return 0;
  return obj_find_attr(ai->obj, substr_attr) == NULL;
}

/*
 * Analyser hook: searches the document for the configured substrings and
 * stores the combined mask of everything found.
 *
 * When an automaton exists, ai->text and ai->metas are searched in this
 * order (each only if present) using one shared search context, so the
 * mask collects matches from both. The resulting mask is written to the
 * attribute substr_attr as a hexadecimal number; without an automaton the
 * stored value is 0.
 */
static void
an_substr_analyse(struct an_hook *h UNUSED, struct an_iface *ai)
{
  struct kmp_search state;

  /* The mask is cleared once here, so both searches accumulate into it. */
  state.u.mask = 0;

  if (kmp && ai->text)
    kmp_search(kmp, &state, ai->text);
  if (kmp && ai->metas)
    kmp_search(kmp, &state, ai->metas);

  /* The mask can be non-zero only if at least one search has run. */
  if (state.u.mask)
    DBG("Found document with mask 0x%x", state.u.mask);

  obj_set_attr_format(ai->obj, substr_attr, "%x", state.u.mask);
}

/*
 * Descriptor of the "substr" analyser: it requests both the text and the
 * metas of a document and is driven by the three hooks defined above.
 */
struct analyser an_substr = {
  .name = "substr",
  .need_mask = AN_NEED_METAS | AN_NEED_TEXT,
  .init = an_substr_init,
  .need = an_substr_need,
  .analyse = an_substr_analyse,
};
