/*
 *	Sherlock Indexer -- Lexical Mapping
 *
 *	(c) 2001--2005 Martin Mares <mj@ucw.cz>
 *	(c) 2001--2005 Robert Spalek <robert@ucw.cz>
 *
 *	You define:
 *	   - word_id_t: a type representing a single word
 *	     If word_id_t is a compound type, you have to define WORD_ID_NULL
 *	     and WORD_ID_DEFINED_P(x).
 *	   - LM_TRACK_TEXT if you want lexmap to keep track of position of
 *	     the words in the original text. Expects the text to contain
 *	     no URL brackets.
 *	   - LM_SEARCH if you are search/words.c
 *	   - LM_CARDS if you are search/cards.c
 *	You call:
 *	   - lm_init() before doing anything else
 *	   - lm_doc_start() at the start of each document
 *	   - lm_map_text() for each piece of document text
 *	   - lm_map_break() if you want to insert an explicit sentence break
 *	We call:
 *	   - lm_lookup() to look up and categorize a word
 *	   - lm_got_word() for each word found together with its position
 *	   - lm_got_complex() for each word complex
 */

#include "lib/chartype.h"
#include "lib/unicode.h"
#include "charset/unicat.h"
#include "indexer/alphabet.h"

#include <string.h>

/*
 *  Internal state variables of the mapping automaton.  All of them except
 *  lm_context_cat are reset by lm_doc_start(); lm_map_word() keeps them
 *  up to date.
 */
static uns lm_current_cat;	/* Category reported with each word; changed by word type tags in the text */
static uns lm_pos;		/* Position which will be assigned to the next word */
static uns lm_garb_cnt;		/* Weight of garbage and breaks seen since the last accepted word */
static word_id_t lm_context_base, lm_last_word;	/* Pending context word and the previous word (WORD_ID_NULL if none) */
static uns lm_context_cat;	/* Value of lm_current_cat at the moment lm_context_base was recorded */

/*
 *  TRACK_TEXT(e) expands to `e' when LM_TRACK_TEXT is defined and to an empty
 *  statement otherwise, so the bookkeeping of original text positions
 *  disappears completely when it is not requested.
 */
#ifdef LM_TRACK_TEXT
#define TRACK_TEXT(e) e
#else
#define TRACK_TEXT(e) do { } while(0)
#endif

/*
 *  Defaults for a scalar word_id_t: zero stands for "no word" and any
 *  non-zero value is a defined word.  Users with a compound word_id_t
 *  have to supply both macros themselves.
 */
#ifndef WORD_ID_NULL
#define WORD_ID_NULL (word_id_t)0
#define WORD_ID_DEFINED_P(x) (x)
#endif

/*
 *  Reset the automaton at the start of a document: position zero, the
 *  default WT_TEXT category, no remembered words and no pending garbage.
 */
static inline void
lm_doc_start(void)
{
  lm_pos = 0;
  lm_current_cat = WT_TEXT;
  lm_context_base = WORD_ID_NULL;
  lm_last_word = WORD_ID_NULL;
  lm_garb_cnt = 0;
}

/*
 *  Set the position which will be assigned to the next word.
 */
static inline void
lm_set_pos(uns pos)
{
  /* Beware that this does not restore other state variables.  You should only
   * call this procedure after lm_doc_start().  */
  lm_pos = pos;
}

/*
 *  Return the position which will be assigned to the next word.
 */
static inline uns
lm_get_pos(void)
{
  return lm_pos;
}

/*
 *  Feed a single word (or pseudo-word) into the mapping automaton.
 *
 *  uni, ulen	characters of the word and their count (NULL and 0 for the
 *		pseudo-words generated for breaks and garbage)
 *  class	preliminary class of the word; lm_lookup() returns the final one
 *		and fills in the word ID
 *  ostart, olen	the word in the original text; handed over to lm_lookup()
 *		only when LM_TRACK_TEXT is defined
 *
 *  According to the final class, the automaton advances lm_pos and reports
 *  words by lm_got_word() and word complexes by lm_got_complex().
 */
static inline void
lm_map_word(word *uni, uns ulen, enum word_class class, byte *ostart UNUSED, uns olen UNUSED)
{
  word_id_t thisw = WORD_ID_NULL;

#ifdef LM_TRACK_TEXT
  class = lm_lookup(class, uni, ulen, &thisw, ostart, olen);
#else
  class = lm_lookup(class, uni, ulen, &thisw);
#endif
#ifdef LOCAL_DEBUG
  {
    /* Convert the word to UTF-8 just for the debugging message */
    byte w[3*MAX_WORD_LEN+1], *p=w;
    uns i;
    for (i=0; i<ulen; i++)
      PUT_UTF8(p, uni[i]);
    *p = 0;
    DBG("step: @%d <%s> class=%d cat=%d", lm_pos, w, class, lm_current_cat);
  }
#endif
  /*
   *  This complex automaton parses sequences of word classes, recognizes
   *  word complexes (words with context) and calculates word positions.
   */
  switch (class)
    {
    case WC_IGNORED:
    case WC_COMPLEX:
      /* These classes leave the state untouched */
      break;
    case WC_NORMAL:
      if (lm_garb_cnt)
	{
	  /* Garbage preceded this word: leave a gap of at most lex_max_gap
	   * positions, but no gap as long as we are still at position 0 */
	  if (lm_pos)
	    lm_pos += MIN(lm_garb_cnt, lex_max_gap);
	  lm_garb_cnt = 0;
	}
      else if (WORD_ID_DEFINED_P(lm_context_base))
	/* Immediately follows a context word, which sits at position lm_pos-1 */
	lm_got_complex(lm_pos-1, lm_context_cat, lm_context_base, thisw, 1);
      lm_context_base = WORD_ID_NULL;
      lm_got_word(lm_pos, lm_current_cat, thisw);
      lm_last_word = thisw;
      lm_pos++;
      break;
    case WC_BREAK:
      /* A break weighs lex_max_gap more than a plain garbage word */
      lm_garb_cnt += lex_max_gap;
      /* Fall thru */
    case WC_GARBAGE:
      lm_garb_cnt++;
      /* Neither complexes nor contexts reach across garbage */
      lm_context_base = lm_last_word = WORD_ID_NULL;
      break;
    case WC_CONTEXT:
      if (lm_garb_cnt)
	{
	  /* because of "... gap gap gap complex [complex] word" */
	  if (lm_pos)
	    lm_pos += MIN(lm_garb_cnt, lex_max_gap);
	  lm_garb_cnt = 0;
	}
      else
        {
	  /* Pair this context word with the previous context word (if any)
	   * and with the previous word (if any) */
	  if (WORD_ID_DEFINED_P(lm_context_base))
	    lm_got_complex(lm_pos-1, lm_current_cat, lm_context_base, thisw, 1);
	  if (WORD_ID_DEFINED_P(lm_last_word))
	    lm_got_complex(lm_pos, lm_current_cat, thisw, lm_last_word, 0);
	}
      /* The context word takes a position, but it is not reported by lm_got_word() */
      lm_context_base = lm_last_word = thisw;
      lm_context_cat = lm_current_cat;
      lm_pos++;
      break;
    default:
      ASSERT(0);
    }
}

/*
 *  Insert an explicit sentence break by feeding an empty WC_BREAK
 *  pseudo-word into the automaton.
 */
static inline void
lm_map_break(void)
{
  lm_map_word(NULL, 0, WC_BREAK, NULL, 0);
}

/*
 *  Character categories used by lm_sequence_valid().  Only characters below
 *  ANALYSED_CHARS (that is, up to 0x90 inclusive) are categorized; the table
 *  char_analysis[] is filled in by lm_init():
 *
 *	cc_lower, cc_upper, cc_digit	as told by Clower(), Cupper(), Cdigit()
 *	cc_base		'+', '/' and '='
 *	cc_ctrl		all remaining characters between 0x21 and 0x7f
 *	cc_ox90		the character 0x90
 *	cc_ok		characters up to 0x20 and 0x80 to 0x8f
 *	cc_end		number of categories
 */
#define	ANALYSED_CHARS 0x91
enum char_category { cc_lower, cc_upper, cc_digit, cc_base, cc_ctrl, cc_ox90, cc_ok, cc_end };
static byte char_analysis[ANALYSED_CHARS];

static void
lm_init(void)
{
  alphabet_init();
  for (uns c=0; c<ANALYSED_CHARS; c++)
    {
      uns i;
      if (c<=0x20 || c>=0x80 && c!=0x90)
	i = cc_ok;
      else if (Clower(c))
	i = cc_lower;
      else if (Cupper(c))
	i = cc_upper;
      else if (Cdigit(c))
	i = cc_digit;
      else if (c == 0x90)
	i = cc_ox90;
      else if (c == '+' || c == '/' || c == '=')
	i = cc_base;
      else
	i = cc_ctrl;
      char_analysis[c] = i;
    }
}

/*
 *  Heuristically decide whether a sequence of characters looks like ordinary
 *  text or like a piece of encoded binary data (base64, uuencode, base85,
 *  base32, xxencode, binhex, btoa).
 *
 *  uni	zero-terminated sequence of characters
 *  len	its length (used for the length threshold and for the ratios)
 *
 *  Returns 1 if the sequence should be kept and 0 if it should be thrown out.
 *  Sequences shorter than lex_max_ctrl_len are always kept and so are
 *  sequences containing a character outside the analysed range.
 */
static inline int
lm_sequence_valid(word *uni, uns len)
{
  byte cnts[cc_end];			/* cnts[cat] is 1 iff a character of that category was seen */
  uns c, act_cat, i;
  uns last_cat = cc_ok;
  uns cat_changes = 0;			/* How many times the category differed from the previous one */
  const char *msg;

  memset(cnts, 0, sizeof(cnts));
  if (len >= lex_max_ctrl_len)
    for (i=0; (c = uni[i]) != 0; i++)
      {
	/* A character beyond the analysed range means that the string is
	 * surely NOT an encoded one. */
	if (c >= ANALYSED_CHARS)
	  return 1;
	act_cat = char_analysis[c];
	cnts[act_cat] = 1;
	if (act_cat != last_cat)
	  cat_changes++;
	last_cat = act_cat;
      }

  /* When the scan above was skipped, all counters are zero and none of the
   * tests below can succeed, so the sequence is kept. */
  if (cnts[cc_upper] + cnts[cc_lower] + cnts[cc_digit] >= 3 && !cnts[cc_ctrl] && !cnts[cc_ox90]
      && cat_changes >= len/4)			/* base64, base is common */
    msg = "base64";
  else if (cnts[cc_upper] + cnts[cc_digit] + (cnts[cc_base]|cnts[cc_ctrl]) >= 3 && !cnts[cc_lower] && !cnts[cc_ox90]
	   && cat_changes >= len/3)			/* uuencode */
    msg = "uuencode";
  else if (cnts[cc_upper] + cnts[cc_lower] + cnts[cc_digit] + (cnts[cc_base]|cnts[cc_ctrl]) + cnts[cc_ox90] >= 4
	   && cat_changes >= len/2)			/* binhex, btoa, base85 */
    msg = "base85";
  else
    return 1;

  DBG("Throwing out word of length %d with %d category changes containing low/upp/dig/bas/ctr/x90=%d/%d/%d/%d/%d/%d, judged as %s",
      len, cat_changes, cnts[cc_lower], cnts[cc_upper], cnts[cc_digit], cnts[cc_base], cnts[cc_ctrl], cnts[cc_ox90], msg);
  return 0;
}

/*
 *  Split one sequence of characters (containing no spaces) into words and
 *  feed them to lm_map_word().
 *
 *  uni	the characters; the buffer must have room for len+1 elements,
 *	since uni[len] is overwritten by a terminator
 *  len	number of characters; an empty sequence is ignored
 *  cpos	used only with LM_TRACK_TEXT: cpos[i] points to the start of the
 *	i-th character in the original text and cpos[len] to the end of the
 *	last one
 *
 *  Unless LM_SEARCH is defined, sequences rejected by lm_sequence_valid()
 *  are reported as a single WC_GARBAGE word.  Otherwise maximal runs of
 *  AC_ALPHA and AC_DIGIT characters form words, AC_SINGLETON characters
 *  are words by themselves and characters of all other classes separate
 *  words.  Words exceeding the length limit for their kind are reported as
 *  WC_GARBAGE.  If the last character of the sequence is of class AC_BREAK,
 *  a sentence break follows.
 */
static inline void
lm_map_sequence(word *uni, uns len, byte **cpos UNUSED)
{
  uns start, u, flags, i, wlen;
  enum word_class class;

  if (!len)
     return;

#ifndef LM_SEARCH
  uni[len] = 0;				/* lm_sequence_valid() wants a zero-terminated string */
  if (!lm_sequence_valid(uni, len))
    {
      lm_map_word(NULL, 0, WC_GARBAGE, NULL, 0);
      return;
    }
#endif

  start = flags = 0;			/* flags: set of alphabet classes seen in the current word */
  /*
   *  Append a space (expected to be classified as a separator), so that
   *  the last word gets flushed inside the loop.  The loop is bounded by
   *  the index: the callers' buffers hold only MAX_WORD_LEN+1 elements, so
   *  uni[len+1] must not be touched when len == MAX_WORD_LEN.
   */
  uni[len] = ' ';
  for (i=0; i <= len && (u = uni[i]) != 0; i++)
    {
      uns cat = alpha_class[u];
#if defined(LM_SEARCH) && defined(LM_SEARCH_WILDCARDS)
      /* Enabled wildcard characters are considered parts of words */
      if ((u == '*' && wildcard_asterisks) || (u == '?' && wildcard_qmarks))
	cat = AC_ALPHA;
#endif
      switch ((enum alphabet_class) cat)
	{
	case AC_ALPHA:
	case AC_DIGIT:
	  flags |= 1 << cat;
	  break;
	case AC_SINGLETON:
	default:
	  /* A separator or a singleton: flush the word collected so far */
	  wlen = i - start;
	  if (wlen)
	    {
	      /* The length limit depends on whether the word consists of
	       * letters, of digits, or of both */
	      uns max = lex_max_len;
	      if (flags == ((1 << AC_ALPHA) | (1 << AC_DIGIT)))
		max = lex_max_mixed_len;
	      else if (flags == (1 << AC_DIGIT))
		max = lex_max_num_len;
	      if (wlen > max)
		class = WC_GARBAGE;
	      else
		class = WC_NORMAL;
#ifdef LM_TRACK_TEXT
	      lm_map_word(uni+start, i-start, class, cpos[start], cpos[i]-cpos[start]);
#else
	      lm_map_word(uni+start, i-start, class, NULL, 0);
#endif
	    }
	  if (cat == AC_SINGLETON)
	    {
	      /* The singleton is a one-character word on its own */
#ifdef LM_TRACK_TEXT
	      lm_map_word(uni+i, 1, WC_NORMAL, cpos[i], cpos[i+1]-cpos[i]);
#else
	      lm_map_word(uni+i, 1, WC_NORMAL, NULL, 0);
#endif
	    }
	  start = i+1;
	  flags = 0;
	}
    }
  if (alpha_class[uni[len-1]] == AC_BREAK)
    lm_map_break();
}

#ifdef LM_SEARCH

/*
 *  Map a zero-terminated UTF-8 string: cut it at AC_SPACE characters and
 *  pass every piece to lm_map_sequence().
 *
 *  Returns 1 on success.  Returns 0 as soon as a piece longer than
 *  MAX_WORD_LEN characters is encountered; the rest of the text is then
 *  left unprocessed.
 */
static int
lm_map_text(byte *text)
{
  word buf[MAX_WORD_LEN+1];
  uns filled = 0;
  uns ch;

  while (*text)
    {
      GET_UTF8(text, ch);
      if (alpha_class[ch] == AC_SPACE)
	{
	  /* End of a piece (possibly an empty one) */
	  lm_map_sequence(buf, filled, NULL);
	  filled = 0;
	  continue;
	}
      if (filled >= MAX_WORD_LEN)
	return 0;
      buf[filled++] = ch;
    }

  /* Whatever remains after the last space */
  lm_map_sequence(buf, filled, NULL);
  return 1;
}

#else

/*
 *  Map a piece of document text given as a stream of tagged characters
 *  in the range [text, stop).
 *
 *  Values read by GET_TAGGED_CHAR() are handled as follows:
 *	below 0x80000000	ordinary characters, dispatched by alpha_class[]
 *	below 0x80010000	word type tags: they end the current sequence,
 *				the lowest 4 bits become the new category and
 *				bit 0x10 requests a sentence break
 *	anything else		brackets, which are ignored
 *
 *  Characters are collected to sequences delimited by AC_SPACE characters
 *  and by word type tags and every finished sequence is handed over to
 *  lm_map_sequence().  Ligatures are replaced by their expansion.
 *  A sequence which does not fit in MAX_WORD_LEN characters is reported as
 *  a single WC_GARBAGE word and the rest of it is skipped.
 *
 *  With LM_TRACK_TEXT, cpos[i] records where in the input the i-th collected
 *  character started.
 */
static void
lm_map_text(byte *text, byte *stop)
{
  word uni[MAX_WORD_LEN+1];
  byte *cpos[MAX_WORD_LEN+1];
  const word *lig;
  uns u, l;			/* Current character and number of characters collected */

  l = 0;
  while (text < stop)
    {
      TRACK_TEXT(cpos[l] = text);
      GET_TAGGED_CHAR(text, u);
    restart:			/* Entered also with the character which stopped the skipping below */
      if (u < 0x80000000)
	{
	  switch ((enum alphabet_class) alpha_class[u])
	    {
	    case AC_SPACE:
	      lm_map_sequence(uni, l, cpos);
	      l = 0;
	      break;
	    case AC_ALPHA:
	    case AC_DIGIT:
	    case AC_PUNCT:
	    case AC_BREAK:
	    case AC_SINGLETON:
	      if (l < MAX_WORD_LEN)
		uni[l++] = u;
	      else
		{
		over:
		  /* The sequence is too long: report it as garbage and skip
		   * everything up to the next space, singleton or word type
		   * tag; brackets are skipped as well. */
		  /* NOTE(review): when the loop below stops at a delimiter
		   * which is also the last character before `stop', that
		   * character does not seem to be processed -- confirm that
		   * it is intended.  Likewise, if `u' itself is a singleton,
		   * cpos[0] looks stale on the restart -- to be verified. */
		  lm_map_word(NULL, 0, WC_GARBAGE, NULL, 0);
		  l = 0;
		  while (text < stop &&
			 ((u < 0x80000000 && alpha_class[u] != AC_SPACE && alpha_class[u] != AC_SINGLETON) ||
			  (u >= 0x80010000)))
		    {
		      TRACK_TEXT(cpos[0] = text);
		      GET_TAGGED_CHAR(text, u);
		    }
		  if (text < stop)
		    goto restart;
		}
	      break;
	    case AC_LIGATURE:
	      /* Store the expansion; all its characters share the position
	       * of the ligature in the original text */
	      lig = Uexpand_lig(u);
	      TRACK_TEXT(byte *lig_start = cpos[l]);
	      while (*lig)
		{
		  if (l >= MAX_WORD_LEN)
		    goto over;
		  TRACK_TEXT(cpos[l] = lig_start);
		  uni[l++] = *lig++;
		}
	      break;
	    default: ASSERT(0);
	    }
	}
      else if (u < 0x80010000)		/* Word type tag, breaks words */
	{
	  lm_map_sequence(uni, l, cpos);
	  l = 0;
	  lm_current_cat = u & 0x0f;
	  if (u & 0x10)
	    lm_map_break();
	}
      /* else it's a bracket which we ignore */
    }
  /* Flush the last sequence; cpos[l] marks the end of its last character */
  TRACK_TEXT(cpos[l] = text);
  lm_map_sequence(uni, l, cpos);
}

#endif

#ifdef LM_CARDS

/*
 *  Map the words of a URL stored as UTF-8 in the range [text, stop).
 *  Pieces are delimited by AC_SPACE characters and by '/', '?', '&' and '.';
 *  each piece is passed to lm_map_sequence().
 *
 *  Returns 1 on success.  Returns 0 as soon as a piece longer than
 *  MAX_WORD_LEN characters is encountered; the rest of the URL is then
 *  left unprocessed.
 */
static int
lm_map_url(byte *text, byte *stop)
{
  word buf[MAX_WORD_LEN+1];
  byte *src_pos[MAX_WORD_LEN+1];	/* Starts of the characters in the original text */
  uns filled = 0;
  uns ch;

  while (text < stop)
    {
      int is_sep;

      TRACK_TEXT(src_pos[filled] = text);
      GET_UTF8(text, ch);
      is_sep = (alpha_class[ch] == AC_SPACE);
      if (!is_sep)
	is_sep = (ch == '/' || ch == '?' || ch == '&' || ch == '.');

      if (is_sep)
	{
	  lm_map_sequence(buf, filled, src_pos);
	  filled = 0;
	  continue;
	}
      if (filled >= MAX_WORD_LEN)
	return 0;
      buf[filled++] = ch;
    }

  /* Flush the last piece; src_pos[filled] marks its end */
  TRACK_TEXT(src_pos[filled] = text);
  lm_map_sequence(buf, filled, src_pos);
  return 1;
}

#endif
