/*
 *	Sherlock Search Engine -- Word Index
 *
 *	(c) 1997-2003 Martin Mares <mj@ucw.cz>
 */

#undef LOCAL_DEBUG

#include "sherlock/sherlock.h"
#include "lib/lfs.h"
#include "lib/fastbuf.h"
#include "lib/unaligned.h"
#include "lib/mempool.h"
#include "lib/wildmatch.h"
#include "lib/hashfunc.h"
#include "lib/unicode.h"
#include "charset/unicat.h"
#include "search/sherlockd.h"
#include "indexer/lexicon.h"

#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <alloca.h>

#define TRACE (current_query->debug & DEBUG_WORDS)

/*** The lexicon: all known words, order is described in doc/file-formats ***/

/*
 *  Copy the UTF-8 string `w' to `to', passing every character through
 *  Uunaccent(). Returns the number of bytes written (not counting the
 *  terminating zero) or 0 when the output reaches MAX_WORD_LEN bytes;
 *  an empty input yields 0 as well.
 *  Requires buffer of at least MAX_WORD_LEN+3 bytes.
 */
static uns
word_unaccent_utf8(byte *w, byte *to)
{
  byte *out = to;
  byte *limit = to + MAX_WORD_LEN;
  uns ch;

  for (;;)
    {
      if (!*w)
	break;
      GET_UTF8(w, ch);
      ch = Uunaccent(ch);
      PUT_UTF8(out, ch);
      if (out >= limit)
	{
	  /* Too long: the output is left unterminated and must not be used */
	  return 0;
	}
    }
  *out = 0;
  return out - to;
}

/* Fetch the lexicon entry at position i of the current database's lex_array. */
static inline struct lex_entry *
lex_get(uns i)
{
  struct lex_entry **entries = current_dbase->lex_array;

  return entries[i];
}

/* Convert ID from lexicon format to our format: drop the low three bits and shift down by one. */
static inline uns
lex_import_id(uns id)
{
  uns slot = id / 8;

  return slot - 1;
}

/* Convert ID from our format to lexicon format: 8*(position+1) plus the class of the entry. */
static inline uns
lex_export_id(uns pos)
{
  uns class = lex_get(pos)->class;

  return 8*(pos + 1) + class;
}

/*
 *  Copy the word of lexicon entry lex_id to buf and zero-terminate it.
 *  Requires buffer of >= MAX_WORD_LEN+1 bytes.
 */
static void
lex_extract(uns lex_id, byte *buf)
{
  struct lex_entry *e = lex_get(lex_id);
  uns n = e->length;

  ASSERT(n <= MAX_WORD_LEN);
  memcpy(buf, e->w, n);
  buf[n] = 0;
}

/*
 *  Like lex_extract(), but every character is passed through Uunaccent()
 *  before it is stored.
 *  Requires buffer of >= 2*MAX_WORD_LEN+1 bytes.
 */
static void
lex_extract_noacc(uns lex_id, byte *buf)
{
  struct lex_entry *e = lex_get(lex_id);
  byte *src = e->w;
  byte *src_end = src + e->length;
  uns ch;

  ASSERT(e->length <= MAX_WORD_LEN);
  for (; src < src_end; )
    {
      GET_UTF8(src, ch);
      ch = Uunaccent(ch);
      PUT_UTF8(buf, ch);
    }
  *buf = 0;
}

/*
 *  Return 1 if the zero-terminated UTF-8 string s contains a character
 *  which Uunaccent() maps to a different one, 0 otherwise.
 */
int
contains_accents(byte *s)
{
  uns ch;

  while (1)
    {
      GET_UTF8(s, ch);
      if (!ch)
	break;
      if (Uunaccent(ch) != ch)
	return 1;
    }
  return 0;
}

/*
 *  Find the first lexicon entry greater or equal to the given word
 *  in accent-less ordering (the primary order of the lexicon).
 *
 *  Only entries of the given length class `len' (in characters) are
 *  searched, i.e. positions lex_by_len[len] .. lex_by_len[len+1]-1.
 *  If no entry of the class is >= key, lex_by_len[len+1] (one past the
 *  end of the class) is returned, so that callers can use the result as
 *  an exclusive upper bound of a zone as well as a starting position.
 */

static uns
lex_find_first(uns len, byte *key)
{
  int l, r, m, res;
  byte w[2*MAX_WORD_LEN+1];

  l = current_dbase->lex_by_len[len];
  /*
   *  The right bound is exclusive: m is always < r, so the entry at r
   *  is never extracted, but r itself is a valid answer ("not found").
   *  Starting at the last entry instead would make it impossible to
   *  report that all entries of the class are smaller than the key.
   */
  r = current_dbase->lex_by_len[len+1];
  DBG("lex_find_first: len=%d, l=%d, r=%d", len, l, r);
  while (l < r)
    {
      m = l + (r-l)/2;
      lex_extract_noacc(m, w);
      res = strcmp(w, key);
      DBG("(%d,%d,%d) <%s> %c <%s>", l, r, m, w, (res ? (res < 0) ? '<' : '>' : '='), key);
      if (res < 0)
	l = m + 1;
      else
	r = m;
    }
  return l;
}

/*
 *  Look up the word w (accents included) in the lexicon. Returns its
 *  position or -1 if the word is not present or word_unaccent_utf8()
 *  rejects it.
 */
static int UNUSED
lex_find_exact(byte *w)
{
  /* word_unaccent_utf8() requires MAX_WORD_LEN+3 bytes of output space */
  byte wunacc[MAX_WORD_LEN+3], lex[2*MAX_WORD_LEN+1];
  uns id, len;

  if (!word_unaccent_utf8(w, wunacc))
    return -1;
  len = utf8_strlen(w);
  id = lex_find_first(len, wunacc);
  /* Walk all entries sharing the unaccented form and compare the accented one */
  while (id < current_dbase->lex_by_len[len+1])
    {
      lex_extract_noacc(id, lex);
      if (strcmp(lex, wunacc))
	break;
      lex_extract(id, lex);
      if (!strcmp(lex, w))
	return id;
      id++;
    }
  return -1;
}

/*** Searching for complexes (simple array lookup) ***/

/*
 *  Build the ID of a complex: root context in the upper 16 bits, context
 *  slot in the lower ones; for dir=1 the slot is shifted by lex_context_slots.
 */
static inline uns
cplx_make_id(uns root_ctxt, uns context_ctxt, uns dir)
{
  ASSERT(root_ctxt < lex_context_slots && context_ctxt < lex_context_slots && dir < 2);
  uns low = dir ? context_ctxt + lex_context_slots : context_ctxt;
  return low | (root_ctxt << 16);
}

/*
 *  Inverse of cplx_make_id(): store the root and context parts of the ID
 *  to *root and *ctxt and return the direction (0 or 1).
 */
static inline uns
cplx_dissect_id(uns i, uns *root, uns *ctxt)
{
  uns low = i & 0xffff;
  uns dir = (low >= lex_context_slots) ? 1 : 0;

  *root = i >> 16;
  *ctxt = dir ? low - lex_context_slots : low;
  return dir;
}

/* Return the lexicon entry of the complex with the given ID (see cplx_make_id()). */
static inline struct lex_entry *
cplx_get(uns i)
{
  uns root = i >> 16;
  uns context = i & 0xffff;
  ASSERT(root < lex_context_slots && context < 2*lex_context_slots && current_dbase->cplx_array[root]);
  struct lex_entry **row = current_dbase->cplx_array[root];
  return row[context];
}

/*
 *  The exception hash: we need very fast access to the list of exceptional
 *  words (those with class != WC_NORMAL) as we need them for lexical mapping
 *  of cards.
 */

/* Node of the exception hash; the hash key is the (db, word, len) triple. */
struct word_exc {
  struct database *db;			/* Database the word belongs to */
  byte len;				/* Length of word[] in bytes without the terminating zero */
  byte class;				/* Class of the word (filled in by word_exc_add()) */
  byte word[1];				/* The word itself, zero-terminated; extra space comes from wexc_extra_size() */
};

/*
 *  Parameters of the hash table template included from lib/hashtable.h
 *  below: nodes are struct word_exc, generated functions are prefixed
 *  by wexc_, and the key consists of database, word and its length.
 */
#define HASH_NODE struct word_exc
#define HASH_PREFIX(p) wexc_##p
#define HASH_KEY_COMPLEX(x) x db, x word, x len
#define HASH_KEY_DECL struct database *db, byte *word, uns len
#define HASH_WANT_FIND
#define HASH_WANT_NEW
#define HASH_CONSERVE_SPACE

#define HASH_GIVE_HASHFN
/*
 *  Hash function: database pointer XORed with hash_block() of the word bytes.
 *  NOTE(review): the pointer is cast to uns, which presumably drops the upper
 *  bits where pointers are wider than uns -- confirm this is acceptable.
 */
static inline uns
wexc_hash(struct database *db, byte *word, uns len)
{
  return (uns)db ^ hash_block(word, len);
}

#define HASH_GIVE_EQ
/* Two keys are equal iff database, length and the first l1 bytes of the words agree. */
static inline uns
wexc_eq(struct database *d1, byte *w1, uns l1, struct database *d2, byte *w2, uns l2)
{
  return (d1 == d2 && l1 == l2 && !memcmp(w1, w2, l1));
}

#define HASH_GIVE_EXTRA_SIZE
/* Extra bytes requested for a node: the word length (word[1] already holds one byte). */
static inline uns
wexc_extra_size(struct database *d UNUSED, byte *w UNUSED, uns l)
{
  return l;
}

#define HASH_GIVE_INIT_KEY
/*
 *  Initialize a freshly allocated node from the key (d, w, l). The class
 *  is reset to 0; the caller is expected to set it afterwards.
 *
 *  The node has room for exactly l+1 bytes of the word (word[1] plus l
 *  bytes reported by wexc_extra_size()), so copy exactly l bytes and
 *  terminate explicitly instead of trusting w to be zero-terminated at
 *  position l -- the hash and equality functions also use the explicit
 *  length only.
 */
static inline void
wexc_init_key(struct word_exc *e, struct database *d, byte *w, uns l)
{
  e->db = d;
  e->len = l;
  e->class = 0;
  memcpy(e->word, w, l);
  e->word[l] = 0;
}

#define HASH_AUTO_POOL 4096

#include "lib/hashtable.h"

/* Initialize the exception hash by calling wexc_init() (a wexc_-prefixed function, see HASH_PREFIX). */
static void
word_exc_init(void)
{
  wexc_init();
}

/*
 *  Record the class of a word in the exception hash of database db.
 *  The word is given by pointer and length in bytes (it need not be
 *  zero-terminated) and it is stored in its unaccented form. Words
 *  rejected by word_unaccent_utf8() are silently skipped. If the
 *  unaccented form is already present with a different class, an error
 *  is logged and the original class is kept.
 */
static void
word_exc_add(struct database *db, byte *word, uns len, enum word_class class)
{
  byte buf[MAX_WORD_LEN+1], w[MAX_WORD_LEN+3];
  uns l;
  struct word_exc *e;

  /* Same limit as in lex_extract(); protects buf against corrupted entries */
  ASSERT(len <= MAX_WORD_LEN);
  memcpy(buf, word, len);
  buf[len] = 0;
  if (!(l = word_unaccent_utf8(buf, w)))
    return;
  if ((e = wexc_find(db, w, l)))
    {
      if (e->class != class)
	log(L_ERROR, "Inconsistent lexicon: when adding <%s> <%s>, class %d != %d", buf, w, e->class, class);
    }
  else
    {
      e = wexc_new(db, w, l);
      e->class = class;
    }
}

/*
 *  Look up the class of a word in the exception hash of database db.
 *  Returns WC_GARBAGE if word_unaccent_utf8() rejects the word, the
 *  recorded class if the unaccented word is in the hash and -1 otherwise.
 */
int
word_classify(struct database *db, byte *word)
{
  byte unacc[MAX_WORD_LEN+3];
  uns ulen = word_unaccent_utf8(word, unacc);

  if (!ulen)
    return WC_GARBAGE;

  struct word_exc *exc = wexc_find(db, unacc, ulen);
  if (!exc)
    return -1;
  return exc->class;
}

/*** Loading of lexicon: words and complexes ***/

/*
 *  Map the lexicon file of database db and build the in-memory indices:
 *
 *    - db->lex_array[i] points to the i-th word entry; HARD_MAX_WORDS
 *      extra slots after the last real word are reserved (and set to
 *      NULL) for temporary words,
 *    - db->lex_by_len[n] is the position of the first word whose length
 *      in characters is >= n,
 *    - the exception hash gets all words with class other than WC_NORMAL
 *      and also WC_NORMAL words shorter than lex_min_len,
 *    - with CONFIG_CONTEXTS, db->cplx_array[ct] is an array of
 *      2*lex_context_slots complex entries for every context ct which
 *      occurred in a WC_CONTEXT word and NULL for all other contexts.
 *
 *  Dies if the file is shorter than its 8-byte header.
 */
static void
lex_load(struct database *db)
{
  uns i, ecount=0, last_len;
  byte *lex, *lex_end;

  lex = mmap_file(db->fn_lexicon, &db->lexicon_file_size, 0);
  if (db->lexicon_file_size < 8)
    die("Corrupted lexicon %s", db->fn_lexicon);
  lex_end = lex + db->lexicon_file_size;
  /* Header: number of words followed by number of complexes */
  db->lexicon_words = ((u32*)lex)[0];
  db->lexicon_complexes = ((u32*)lex)[1];

  db->lex_array = xmalloc(sizeof(struct lex_entry *) * (db->lexicon_words + HARD_MAX_WORDS));
#ifdef CONFIG_CONTEXTS
  byte ct_flags[lex_context_slots];
  bzero(ct_flags, sizeof(ct_flags));
#endif
  lex += 8;
  last_len = 0;
  for (i=0; i<db->lexicon_words; i++)
    {
      struct lex_entry *l = (struct lex_entry *) lex;
      uns len = utf8_strnlen(l->w, l->length);
      while (last_len <= len)
	db->lex_by_len[last_len++] = i;
      db->lex_array[i] = l;
      switch (l->class)
	{
	case WC_NORMAL:
	  if (len < lex_min_len)
	    {
	      word_exc_add(db, l->w, l->length, l->class);
	      ecount++;
	    }
	  break;
	case WC_COMPLEX:
	  /* Complexes are stored after the words, never among them */
	  ASSERT(0);
	case WC_CONTEXT:
#ifdef CONFIG_CONTEXTS
	  {
	    uns ctxt = GET_CONTEXT(&l->ctxt);
	    ASSERT(ctxt < lex_context_slots);
	    ct_flags[ctxt] = 1;
	  }
#else
	  ASSERT(0);
#endif
	  /* fall-thru */
	default:
	  word_exc_add(db, l->w, l->length, l->class);
	  ecount++;
	}
      lex += sizeof(struct lex_entry) + l->length;
    }
  while (last_len <= MAX_WORD_LEN+1)
    db->lex_by_len[last_len++] = i;
  while (i < db->lexicon_words + HARD_MAX_WORDS)
    {
      /* We use HARD_MAX_WORDS positions after the last real word for temporary words in no particular order */
      db->lex_array[i++] = NULL;
    }

#ifdef CONFIG_CONTEXTS
  if (db->lexicon_complexes)
    {
      db->cplx_array = xmalloc(sizeof(struct lex_entry **) * lex_context_slots);
      i = 0;
      for (uns ct=0; ct<lex_context_slots; ct++)
	if (ct_flags[ct])
	  {
	    struct lex_entry **ca = db->cplx_array[ct] = xmalloc(sizeof(struct lex_entry *) * 2*lex_context_slots);
	    for (uns j=0; j < 2*lex_context_slots; j++)
	      {
		struct lex_entry *l = (struct lex_entry *) lex;
		ASSERT(l->class == WC_COMPLEX && !l->length);
		ASSERT(GET_CONTEXT(&l->ctxt) == ct);
		ca[j] = l;
		lex += sizeof(struct lex_entry);
		i++;
	      }
	  }
	else
	  {
	    /* cplx_get() asserts that the row is non-NULL, so unused rows must not be left uninitialized */
	    db->cplx_array[ct] = NULL;
	  }
      ASSERT(i == db->lexicon_complexes);
    }
#endif
  log(L_INFO, "Loaded word index %s: %d words, %d complexes, %d exceptions",
      db->name, db->lexicon_words, db->lexicon_complexes, ecount);
  ASSERT(lex == lex_end);
}

/*** The stem array ***/

#ifdef CONFIG_LANG

#include "lang/lang.h"

/* One stemmer block of the stems file; kept in db->stem_block_list by stems_load(). */
struct stem_block {
  cnode n;
  struct stemmer *stemmer;		/* Stemmer whose id and lang_mask matched the block header */
  u32 *array;				/* Block data (points into the mapped file) */
  uns array_items;			/* Number of u32 items before the ~0U terminator */
};

/*
 *  One synonymic block; assembled by stems_load() from a block with
 *  id 0x80000000 (direct array) followed by a block with id 0x80000001
 *  (inverse array) and kept in db->syn_block_list.
 */
struct syn_block {
  cnode n;
  u32 lang_mask;			/* Language mask taken from the header of the direct block */
  u32 *direct_array, *inverse_array;	/* Both point into the mapped file */
  uns direct_items, inverse_items;	/* Numbers of u32 items before the ~0U terminators */
};

static struct mempool *stemmer_pool;	/* Scratch pool passed to lang_stem(); created by stems_load() */

/*
 *  Map the stems file of database db (if there is any) and split it to
 *  blocks. Each block consists of a two-word header (id, language mask)
 *  followed by data terminated by a ~0U word:
 *
 *    id < 0x80000000	stemmer block; accepted if a stemmer with the same
 *			id and language mask is registered in stemmer_list
 *    id == 0x80000000	direct array of a synonymic block
 *    id == 0x80000001	inverse array of the synonymic block opened by
 *			the preceding 0x80000000 block
 *
 *  Anything else is reported as unrecognized and skipped. The block lists
 *  of the database are always initialized, even when no file is configured.
 */
static void
stems_load(struct database *db)
{
  clist_init(&db->stem_block_list);
  clist_init(&db->syn_block_list);
  if (!db->fn_stems)
    return;
  u32 *ary = mmap_file(db->fn_stems, &db->stems_file_size, 0);
  u32 *ary_end = ary + db->stems_file_size/4;
  uns stem_block_count = 0, syn_block_count = 0;
  struct syn_block *open_syn_block = NULL;
  while (ary < ary_end)
    {
      /* The header must fit in the file */
      ASSERT(ary_end - ary >= 2);
      u32 id = *ary++;
      u32 lmask = *ary++;
      u32 *astart = ary;
      /* Find the terminator; check the bounds BEFORE looking at the word */
      for (;;)
	{
	  ASSERT(ary < ary_end);
	  if (*ary == ~0U)
	    break;
	  ary++;
	}
      uns aitems = ary - astart;
      ary++;

      if (id < 0x80000000)
	{
	  struct stemmer *st;
	  struct stem_block *sb = NULL;
	  WALK_LIST(st, stemmer_list)
	    if (st->lang_mask == lmask && st->id == id)
	      {
		sb = xmalloc(sizeof(struct stem_block));
		sb->stemmer = st;
		sb->array = astart;
		sb->array_items = aitems;
		clist_add_tail(&db->stem_block_list, &sb->n);
		break;
	      }
	  if (sb)
	    {
	      stem_block_count++;
	      continue;
	    }
	}
      else if (id == 0x80000000 && !open_syn_block)
	{
	  open_syn_block = xmalloc(sizeof(struct syn_block));
	  open_syn_block->lang_mask = lmask;
	  open_syn_block->direct_array = astart;
	  open_syn_block->direct_items = aitems;
	  continue;
	}
      else if (id == 0x80000001 && open_syn_block)
	{
	  syn_block_count++;
	  ASSERT(open_syn_block->lang_mask == lmask);
	  open_syn_block->inverse_array = astart;
	  open_syn_block->inverse_items = aitems;
	  clist_add_tail(&db->syn_block_list, &open_syn_block->n);
	  open_syn_block = NULL;
	  continue;
	}
      log(L_ERROR, "Stemmer block (id=%08x,lm=%08x) unrecognized", id, lmask);
      open_syn_block = NULL;
    }
  if (open_syn_block)
    log(L_ERROR, "Incomplete synonymic block encountered");
  log(L_INFO, "Loaded word mappings: %d stemmer blocks, %d synonymic blocks", stem_block_count, syn_block_count);
  stemmer_pool = mp_new(4096);
}

/*
 *  Binary search in an expansion array. The array consists of records,
 *  each started by a head word with bit 31 set (the remaining bits are
 *  the key) and followed by words with bit 31 clear. Returns pointer to
 *  the head of the record whose key is stem_id or NULL if there is none.
 *  The word at ary[nitems] is examined as well, so it has to have
 *  bit 31 set.
 */
static u32 *
stem_lookup_expansion(u32 *ary, uns nitems, u32 stem_id)
{
  int lo = 0;
  int hi = nitems;

  while (lo < hi)
    {
      int head = (lo+hi)/2;
      int next = head + 1;

      /* Extend the probe to the whole record it has hit */
      while (!(ary[head] & 0x80000000))
	head--;
      while (!(ary[next] & 0x80000000))
	next++;

      u32 key = ary[head] & 0x7fffffff;
      if (key == stem_id)
	return &ary[head];
      if (key < stem_id)
	lo = next;
      else
	hi = head;
    }
  return NULL;
}

#endif

/*
 *  We represent each phrase by a list of ph_word's (corresponding to words
 *  written in the phrase), each of them pointing to a single struct word
 *  (we cannot use struct word directly since they can occur multiple times
 *  in a single phrase).
 *
 *  When we expand a word, its variants are represented by a list of ph_variant's
 *  connected to struct word.
 */

/* One word of a phrase; allocated from the query pool with both word forms stored behind the structure. */
struct ph_word {
  cnode n;
  struct simple *simple;		/* Simple search component this came from */
  struct word *word;
  uns pos;				/* Relative position inside phrase; ~0U until the word is linked to a phrase list */
  byte *unacc;				/* Unaccented version of the word (NULL for complexes built by do_make_complex()) */
  byte idx;				/* Word index corresponding to ph_word->word */
  byte magic;				/* Has been generated by magic transformations of simples */
  byte prox_after;			/* Followed by "*" */
  byte w[0];				/* Accented version of the word */
};

/* One variant of a word; variants live in word->variants and are created by word_add_variant(). */
struct ph_variant {
  cnode n;
  uns lex_id;				/* Lexicon position, or a cplx_make_id() value for WC_COMPLEX words */
  u32 lang_mask;
  byte noaccent_only;
  byte penalty;
  byte flags;				/* VF_xxx */
};

/*
 *  Flags of word variants (ph_variant->flags). word_dump_phrase() prints
 *  bits 0 to 7 as the letters "QSMAYLam" in this order.
 */
enum var_flags {
  VF_SOURCE_MASK = 0x1f,		/* These bits indicate source of the variant */
  VF_QUERY = 1,				/* Query word and its accent/wildcard expansions */
  VF_SYNTHETIC = 2,			/* Query word which isn't present in the lexicon, so we've synthesized a lex_entry for it */
  VF_MORPH = 4,				/* Generated by morphological expansion */
  VF_ACCENTS = 8,			/* Generated by non-initial accent expansion */
  VF_SYNONYMUM = 16,			/* Generated by synonymic expansion */
  VF_LEMMA = 32,			/* This variant is a lemma */
  VF_ACCENTIFIED = 64,			/* Searching for accent variants already applied to this word */
  VF_MORPHED = 128,			/* Morphological expansion already applied */
};

typedef struct ph_word *word_id_t;	/* Word handle type used by the lm_* callbacks below */
static struct simple *ph_current_simple;	/* Simple being mapped; read by lm_lookup() */
static clist *ph_current_list;		/* Phrase list the lm_got_* callbacks append to */
static struct ph_word *ph_word_for_current_star;	/* Last ordinary word seen by lm_lookup(); a following "*" sets its prox_after */

/*
 *  Debugging dump of a phrase: when TRACE is on, emit one ".X" line per
 *  phrase word and one per each of its variants. Does nothing otherwise.
 */
static void
word_dump_phrase(clist *phrase, char *comment)
{
  static const char flag_names[] = "QSMAYLam";
  struct ph_word *pw;
  struct ph_variant *var;
  byte text[MAX_WORD_LEN+1];

  if (TRACE)
    {
      add_cr(".X %s phrase:", comment);
      CLIST_WALK(pw, *phrase)
	{
	  struct word *wd = pw->word;
	  add_cr(".X\tpos=%d idx=%d class=%d stat=%d wild=%d magic=%d prox=%d '%s' '%s'",
		 pw->pos, pw->idx, wd->word_class, wd->status, wd->is_wild, pw->magic,
		 pw->prox_after, pw->w, pw->unacc ? : (byte*)"");
	  CLIST_WALK(var, wd->variants)
	    {
	      byte flg[9];
	      uns root, ctxt;

	      /* Complexes have no text in the lexicon, show their contexts instead */
	      if (wd->word_class != WC_COMPLEX)
		lex_extract(var->lex_id, text);
	      else if (cplx_dissect_id(var->lex_id, &root, &ctxt))
		sprintf(text, "%04x (%04x)", root, ctxt);
	      else
		sprintf(text, "(%04x) %04x", ctxt, root);

	      /* One letter per flag bit, '-' for bits which are clear */
	      for (uns bit=0; bit<8; bit++)
		flg[bit] = (var->flags & (1 << bit)) ? flag_names[bit] : '-';
	      flg[8] = 0;
	      add_cr(".X\t\t<%s> noacc=%d lmask=%08x pen=%d flags=%s", text, var->noaccent_only, var->lang_mask, var->penalty, flg);
	    }
	}
    }
}

/*
 *  Return non-zero if variant a is implied by variant b: all languages
 *  of a are among those of b and neither noaccent_only nor penalty of a
 *  is lower than that of b.
 */
static inline int
word_variant_subset(struct ph_variant *a, struct ph_variant *b)
{
  if ((a->lang_mask & b->lang_mask) != a->lang_mask)
    return 0;
  if (a->noaccent_only < b->noaccent_only)
    return 0;
  return a->penalty >= b->penalty;
}

/*
 *  Add a variant to word w. The variant list is walked in order of
 *  lex_id and the new variant is inserted before the first variant
 *  with a greater lex_id.
 *
 *  Returns 0 if the word already has max_word_matches variants (the
 *  status of the word is set to 105 then), 1 in all other cases, i.e.
 *  also when the variant has been ignored because of a class clash or
 *  merged with an existing one.
 */
static int
word_add_variant(struct word *w, uns lex_id, uns noaccent_only, u32 lang_mask, uns penalty, uns flags)
{
  struct ph_variant *v, *x;

  if (w->var_count >= max_word_matches)
    {
      w->status = 105;
      return 0;
    }
  /* IDs of complexes are not lexicon positions, so the class check applies to ordinary words only */
  if (w->word_class != WC_COMPLEX && w->word_class != lex_get(lex_id)->class)
    {
      DBG("word_add_variant: class clash, ignoring");
      return 1;
    }

  v = mp_alloc(current_query->pool, sizeof(struct ph_variant));
  v->lex_id = lex_id;
  v->noaccent_only = noaccent_only;
  v->lang_mask = lang_mask;
  v->penalty = penalty;
  v->flags = flags;

  CLIST_WALK(x, w->variants)
    if (x->lex_id == v->lex_id)
      {
	/* If one of the variants implies the other, merge them */
	if (word_variant_subset(v, x))
	  {
	    x->flags |= v->flags;
	    return 1;
	  }
	else if (word_variant_subset(x, v))
	  {
	    /* The new variant replaces the old one, var_count stays the same */
	    v->flags |= x->flags;
	    clist_insert_before(&v->n, &x->n);
	    clist_remove(&x->n);
	    return 1;
	  }
      }
    else if (x->lex_id > v->lex_id)
      break;

  /* NOTE(review): when the walk runs to its end, x presumably refers to the list head, making this an append -- confirm against CLIST_WALK */
  w->var_count++;
  clist_insert_before(&v->n, &x->n);
  return 1;
}

/*
 *  Create a temporary lexicon entry for the word wd with the given class.
 *  The entry is allocated from the query pool and stored in lex_array at
 *  position lexicon_words + idx, which is also the returned ID.
 */
static uns
word_synthesize(uns idx, byte *wd, uns class)
{
  uns id = current_dbase->lexicon_words + idx;
  uns len = strlen(wd);
  struct lex_entry *e = mp_alloc_zero(current_query->pool, sizeof(struct lex_entry) + len);

  memcpy(e->w, wd, len);
  e->length = len;
  e->class = class;
  current_dbase->lex_array[id] = e;
  return id;
}

/*** Lexical mapping of phrases ***/

/*
 *  Translate the accent mode requested for phrase word p to the mode
 *  used for the lookup: the AUTO modes become ACCENT_AUTO or
 *  ACCENT_STRIP depending on presence of accents (in the whole query
 *  for ACCENT_AUTO, in this word for ACCENT_AUTO_LOCAL), the other
 *  modes are kept.
 */
static uns
translate_accent_mode(struct ph_word *p, uns acc)
{
  if (acc == ACCENT_STRIP || acc == ACCENT_STRICT)
    return acc;
  if (acc == ACCENT_AUTO)
    return (current_query->contains_accents ? ACCENT_AUTO : ACCENT_STRIP);
  if (acc == ACCENT_AUTO_LOCAL)
    return (strcmp(p->w, p->unacc) ? ACCENT_AUTO : ACCENT_STRIP);
  ASSERT(0);
}

/*
 *  Lookup callback of the lexical mapper. Given a word as an array of
 *  ulen Unicode characters, create a ph_word for it (lower-cased and
 *  both in accented and unaccented form), register it as a query word
 *  by lookup_word() and determine its class. The new ph_word is stored
 *  to *thisw and the class of the word is returned.
 *
 *  A call with uni == NULL creates nothing, it only forgets the word
 *  a subsequent "*" would be attached to, and returns orig_class.
 */
static enum word_class
lm_lookup(enum word_class orig_class, word *uni, uns ulen, word_id_t *thisw)
{
  struct ph_word *p;
  /*
   *  Both buffers are filled from the same characters, so the unaccented
   *  one must not be smaller than the accented one; unaccented forms get
   *  2*MAX_WORD_LEN+1 bytes elsewhere in this file as well.
   */
  byte wbuf[2*MAX_WORD_LEN+1], *wp=wbuf;
  byte ubuf[2*MAX_WORD_LEN+1], *up=ubuf;
  uns u, wl, ul, i;
  struct word *w;
  struct expr *expr = ph_current_simple->raw;

  if (!uni)
    {
      ph_word_for_current_star = NULL;
      return orig_class;
    }
  for (i=0; i<ulen; i++)
    {
      u = uni[i];
      u = Utolower(u);
      PUT_UTF8(wp, u);
      u = Uunaccent(u);
      PUT_UTF8(up, u);
    }
  *wp = *up = 0;
  /* Lengths including the terminating zeroes */
  wl = wp - wbuf + 1;
  ul = up - ubuf + 1;
  p = mp_alloc_zero(current_query->pool, sizeof(struct ph_word) + wl + ul);
  memcpy(p->w, wbuf, wl);
  p->unacc = p->w + wl;
  memcpy(p->unacc, ubuf, ul);
  /* Look the word up with the translated accent mode, but leave the expression untouched */
  int old_acc = expr->u.match.o.accent_mode;
  expr->u.match.o.accent_mode = translate_accent_mode(p, old_acc);
  p->idx = lookup_word(current_query, expr, p->w);
  expr->u.match.o.accent_mode = old_acc;
  p->word = w = &current_query->words[p->idx];
  p->simple = ph_current_simple;
  if (orig_class != WC_NORMAL)
    w->word_class = orig_class;
  else if (ulen == 1 && uni[0] == '*')
    {
      /* A stand-alone star is not a word, it only marks the previous word */
      w->word_class = WC_IGNORED;
      w->use_count--;
      if (ph_word_for_current_star)
	ph_word_for_current_star->prox_after = 1;
    }
  else
    {
      int c = word_classify(current_dbase, p->w);
      if (c < 0)
	{
	  /* Not an exception: classify by length */
	  if (ulen < lex_min_len_ign)
	    c = WC_IGNORED;
	  else if (ulen < lex_min_len)
	    c = WC_GARBAGE;
	  else
	    c = WC_NORMAL;
	}
      w->word_class = c;
      if (strchr(p->w, '*') || strchr(p->w, '?'))
	w->is_wild = 1;
      ph_word_for_current_star = p;
    }
  if (w->word_class == WC_NORMAL)
    w->cover_count++;
  else
    w->status = 116;
  p->pos = ~0U;				/* Not linked to any phrase list yet */
  add_hilited_word(current_query, p->w); /* record it even if it isn't indexed */
  *thisw = p;
  return w->word_class;
}

/*
 *  Callback of the lexical mapper: word p has been found at position pos.
 *  Move it (or add it, if it hasn't been linked yet) to the end of the
 *  current phrase list.
 */
static void
lm_got_word(uns pos, uns cat UNUSED, word_id_t p)
{
  uns linked = (p->pos != ~0U);

  if (linked)
    clist_remove(&p->n);
  p->pos = pos;
  clist_add_tail(ph_current_list, &p->n);
}

/* Set position of word w to pos and put the word to the end of the current phrase list. */
static inline void
lm_fixup_pos(word_id_t w, uns pos)
{
  cnode *node = &w->n;

  if (w->pos != ~0U)
    {
      /* Already linked somewhere, unlink first */
      clist_remove(node);
    }
  w->pos = pos;
  clist_add_tail(ph_current_list, node);
}

/*
 *  Callback of the lexical mapper: a complex has been found. Both its
 *  words are put to the phrase list at consecutive positions: root at
 *  pos and context at pos+1 for dir != 0, context at pos-1 and root at
 *  pos otherwise.
 */
static void
lm_got_complex(uns pos, uns cat UNUSED, word_id_t root, word_id_t context, uns dir)
{
  word_id_t first = dir ? root : context;
  word_id_t second = dir ? context : root;

  if (!dir)
    pos--;
  lm_fixup_pos(first, pos);
  lm_fixup_pos(second, pos+1);
}

#define LM_SEARCH
#include "indexer/lexmap.h"

#ifdef CONFIG_LANG

#define QUERY_LANGS current_query->lang_set

/*** Second pass of accent expansion (after morphological or synonymic expansion) ***/

/*
 *  For every variant of p->word which carries one of the require_flags
 *  and hasn't been marked VF_ACCENTIFIED yet, add all other lexicon
 *  entries with the same unaccented form as further variants.
 *  Does nothing for words in ACCENT_STRICT mode. Stops as soon as
 *  word_add_variant() reports that the variant limit has been reached.
 */
static void
word_expand_post_acc(struct ph_word *p, uns require_flags)
{
  struct ph_variant *v;
  struct word *w = p->word;

  if (p->word->options.accent_mode == ACCENT_STRICT)
    return;

  /* Variants added inside the loop carry VF_ACCENTIFIED, so they are skipped when the walk reaches them */
  CLIST_WALK(v, w->variants)
    if (!(v->flags & VF_ACCENTIFIED) && (v->flags & require_flags))
      {
	byte wa[MAX_WORD_LEN+1], wu[2*MAX_WORD_LEN+1], la[MAX_WORD_LEN+1], lu[2*MAX_WORD_LEN+1];
	v->flags |= VF_ACCENTIFIED;
	/* wa is extracted here, but not read anywhere below */
	lex_extract(v->lex_id, wa);
	lex_extract_noacc(v->lex_id, wu);
	uns chars = utf8_strlen(wu);
	for (uns idx = lex_find_first(chars, wu); idx < current_dbase->lex_by_len[chars+1]; idx++)
	  {
	    if (idx == v->lex_id)
	      continue;
	    lex_extract(idx, la);
	    lex_extract_noacc(idx, lu);
	    /* Entries with the same unaccented form are adjacent, the first different one ends the scan */
	    if (strcmp(lu, wu))
	      break;
	    uns nonacc_only = 0;
	    uns penalty = v->penalty;
	    switch (p->simple->raw->u.match.o.accent_mode)	/* Use the non-translated accent mode */
	      {
	      case ACCENT_STRIP:
		break;
	      case ACCENT_AUTO:
	      case ACCENT_AUTO_LOCAL:
		/* An entry without any accents gets penalized and marked noaccent_only */
		if (!strcmp(la, lu))
		  {
		    nonacc_only = 1;
		    penalty += misaccent_penalty;
		  }
		break;
	      default:
		ASSERT(0);
	      }
	    ASSERT(lex_get(idx)->class == w->word_class);
	    if (!word_add_variant(w, idx, nonacc_only, v->lang_mask, penalty,
				  v->flags | VF_ACCENTS | VF_ACCENTIFIED))
	      return;
	  }
      }
}

/*** Morphologic expansion ***/

/*
 *  Morphological expansion of a single variant v of phrase word p.
 *  For every stemmer block whose language mask intersects QUERY_LANGS,
 *  the word is stemmed by lang_stem() and each stem found in the lexicon
 *  is added as a lemma variant (stem_penalty), followed by all words
 *  listed in the expansion of that stem in the block (morph_penalty).
 *  Only lexicon entries of the same class as the word are added.
 *  An "l<stem> <word>" line is emitted for every known stem; ".M" lines
 *  are emitted only when TRACE is on.
 */
static void
word_expand_morph_variant(struct ph_word *p, struct ph_variant *v)
{
  struct word *w = p->word;
  struct stem_block *sb;
  byte wa[MAX_WORD_LEN+1];
  struct word_node *stem;
  list *stems;
  clist *sb_list = &current_dbase->stem_block_list;
  uns debug = TRACE;
  int stem_id;
  u32 *exp;

  lex_extract(v->lex_id, wa);
  CLIST_WALK(sb, *sb_list)
    {
      u32 sb_lang_mask = sb->stemmer->lang_mask;
      if (!(sb_lang_mask & QUERY_LANGS))
	continue;
      /* The pool holds only the results of the current lang_stem() call */
      mp_flush(stemmer_pool);
      if (stems = lang_stem(sb->stemmer, wa, stemmer_pool))
	WALK_LIST(stem, *stems)
	  {
	    if ((stem_id = lex_find_exact(stem->w)) >= 0)
	      {
		if (debug)
		  add_cr(".M <%s> -> <%s> (langs %x)", wa, stem->w, sb_lang_mask);
		add_cr("l%s %s", stem->w, wa);
		if (stem_id != (int) v->lex_id && lex_get(stem_id)->class == w->word_class)
		  word_add_variant(p->word, stem_id, 0, v->lang_mask & sb_lang_mask,
				   v->penalty + stem_penalty, VF_MORPHED | VF_MORPH | VF_LEMMA);
		if (exp = stem_lookup_expansion(sb->array, sb->array_items, lex_export_id(stem_id)))
		  {
		    /* Walk the record until the head of the next one (bit 31 set) */
		    while (!(*++exp & 0x80000000))
		      {
			uns idp = lex_import_id(*exp);
			if (debug)
			  {
			    byte ww[MAX_WORD_LEN+1];
			    lex_extract(idp, ww);
			    add_cr(".M\t-> <%s>", ww);
			  }
			if (idp != v->lex_id && lex_get(idp)->class == w->word_class)
			  word_add_variant(w, idp, 0, v->lang_mask & sb_lang_mask,
					   v->penalty + morph_penalty, VF_MORPHED | VF_MORPH);
		      }
		  }
	      }
	    else if (debug)
	      add_cr(".M <%s> -> <%s> (langs %x) unknown", wa, stem->w, sb_lang_mask);
	  }
    }
}

static void
word_expand_morph(struct ph_word *p)
{
  struct ph_variant *v, *tmp;

  if (!p->word->options.morphing)
    return;

  CLIST_WALK_DELSAFE(v, p->word->variants, tmp)
    if (!(v->flags & VF_MORPHED) && !v->noaccent_only)
      {
	v->flags |= VF_MORPHED;
	word_expand_morph_variant(p, v);
      }
  if (p->word->options.morphing > 1)
    word_expand_post_acc(p, VF_MORPH);
}

/*** Synonymic expansion ***/

/* A synonymum collected by word_expand_syn_variant() */
struct syn_var {
  uns id;				/* Lexicon position of the synonymum */
  u32 lmask;				/* Union of language masks it has been found under */
  struct ph_variant *orig;		/* Variant which first led to this synonymum */
};

static struct syn_var *syn_vars;	/* Array of HARD_MAX_SYNONYMA items set up by word_expand_synonyma() */
static uns syn_n;			/* Number of items used in syn_vars */

/*
 *  Collect synonyma of variant v to syn_vars[]. For every synonymic
 *  block sharing a language with the variant, the inverse array maps
 *  the word to its synonymic classes and the direct array maps each
 *  class to the words it contains. Words already collected only get
 *  their language mask extended; at most HARD_MAX_SYNONYMA words are
 *  collected, the rest is silently dropped.
 */
static void
word_expand_syn_variant(struct ph_variant *v)
{
  struct syn_block *s;

  CLIST_WALK(s, current_dbase->syn_block_list)
    {
      if (!(s->lang_mask & v->lang_mask))
	continue;
      u32 *invexp = stem_lookup_expansion(s->inverse_array, s->inverse_items, lex_export_id(v->lex_id));
      if (!invexp)
	continue;
      /* Skip the head of the record, the items follow until the next head (bit 31 set) */
      invexp++;
      while (!(*invexp & 0x80000000))
	{
	  uns class = *invexp++;
	  if (TRACE)
	    add_cr(".Y Found synonymic class %x", class);
	  u32 *exp = stem_lookup_expansion(s->direct_array, s->direct_items, class);
	  ASSERT(exp);
	  exp++;
	  while (!(*exp & 0x80000000))
	    {
	      uns syn = lex_import_id(*exp++);
	      if (syn != v->lex_id && syn_n < HARD_MAX_SYNONYMA)
		{
		  uns i;
		  /* Linear search for a previous occurrence of the same word */
		  for (i=0; i<syn_n && syn_vars[i].id != syn; i++)
		    ;
		  if (i < syn_n)
		    syn_vars[i].lmask |= s->lang_mask & v->lang_mask;
		  else
		    syn_vars[syn_n++] = (struct syn_var){
		      .id = syn,
		      .lmask = s->lang_mask & v->lang_mask,
		      .orig = v
		    };
		}
	    }
	}
    }
}

static void
word_expand_synonyma(struct ph_word *p)
{
  struct ph_variant *v, *tmp;

  if (!p->word->options.synonyming)
    return;

  syn_n = 0;
  syn_vars = alloca(HARD_MAX_SYNONYMA * sizeof(struct syn_var));
  CLIST_WALK_DELSAFE(v, p->word->variants, tmp)
    if (!(v->flags & VF_SYNONYMUM) &&
	((v->flags & VF_LEMMA) || !(v->flags & VF_MORPH)) &&
	!v->noaccent_only)
      word_expand_syn_variant(v);
  if (!syn_n)
    return;

  for (uns i=0; i<syn_n; i++)
    {
      struct syn_var *r = &syn_vars[i];
      byte worig[MAX_WORD_LEN+1], wsyn[MAX_WORD_LEN+1];
      lex_extract(r->orig->lex_id, worig);
      lex_extract(r->id, wsyn);
      add_cr("Y%s %s %s %d", wsyn, worig, p->word->word, i);
      if (p->word->options.synonyming >= 2 && (p->word->options.syn_expand & (1ULL << i)))
	word_add_variant(p->word, r->id, 0, r->lmask, r->orig->penalty + synonymum_penalty, VF_SYNONYMUM);
    }
  if (p->word->options.synonyming >= 3)
    word_expand_post_acc(p, VF_SYNONYMUM);
  if (p->word->options.synonyming >= 4)
    word_expand_morph(p);
}

#else
/* Without CONFIG_LANG, both expansions are empty functions and variants match all languages */
static inline void word_expand_morph(struct ph_word *p UNUSED) { }
static inline void word_expand_synonyma(struct ph_word *p UNUSED) { }
#define QUERY_LANGS ~0U
#endif

/*** Expansion of words to variants ***/

/*
 *  Basic expansion of a non-wildcard phrase word: add all lexicon
 *  entries with the same unaccented form which are acceptable in the
 *  accent mode of the word as variants. If no variant is present
 *  afterwards, a temporary lexicon entry is synthesized for the word,
 *  so that it always has a variant.
 */
static void
word_expand(struct ph_word *p)
{
  struct word *w = p->word;
  byte lex[MAX_WORD_LEN+1], lexu[2*MAX_WORD_LEN+1];
  uns chars, idx, nonacc_only, penalty;

  chars = utf8_strlen(p->unacc);
  for (idx = lex_find_first(chars, p->unacc); idx < current_dbase->lex_by_len[chars+1]; idx++)
    {
      lex_extract(idx, lex);
      lex_extract_noacc(idx, lexu);
      /* Entries with the same unaccented form are adjacent, the first different one ends the scan */
      if (strcmp(lexu, p->unacc))
	break;
      nonacc_only = penalty = 0;
      switch (w->options.accent_mode)
	{
	case ACCENT_STRIP:
	  break;
	case ACCENT_STRICT:
	  /* Only the exact accented form is accepted */
	  if (strcmp(lex, p->w))
	    {
	      DBG("= %d <%s> <%s> !strict", idx, lex, lexu);
	      continue;
	    }
	  break;
	case ACCENT_AUTO:
	  /* Besides the exact form, the form without any accents is accepted with a penalty */
	  if (strcmp(lex, p->w))
	    {
	      if (strcmp(lex, lexu))
		{
		  DBG("= %d <%s> <%s> !misaccented", idx, lex, lexu);
		  continue;
		}
	      nonacc_only = 1;
	      penalty = misaccent_penalty;
	    }
	  break;
	default:
	  ASSERT(0);
	}
      DBG("= %d <%s> <%s> %d %s", idx, lex, lexu, lex_get(idx)->class, nonacc_only ? "[nonacc]" : "");
      ASSERT(lex_get(idx)->class == w->word_class);
      /* A zero return means that the variant limit has been reached */
      if (!word_add_variant(w, idx, nonacc_only, QUERY_LANGS, penalty, VF_QUERY | VF_ACCENTIFIED))
	return;
    }
  if (clist_empty(&w->variants))
    word_add_variant(w, word_synthesize(p->idx, p->w, w->word_class), 0, QUERY_LANGS, 0, VF_QUERY | VF_SYNTHETIC | VF_ACCENTIFIED);
}

/*
 *  Expansion of a phrase word containing wildcards ('*' or '?').
 *  The part of the unaccented word before the first wildcard is used
 *  to delimit a zone of the lexicon for each word length; every entry
 *  of the zone is then matched against the compiled pattern according
 *  to the accent mode of the word and matching WC_NORMAL entries are
 *  added as variants.
 *
 *  Status codes set on the word:
 *	111	compilation of the pattern failed
 *	113	total size of the zones exceeded max_wildcard_zone
 *	114	prefix shorter than min_wildcard_prefix_len
 *	116	only entries of other classes than WC_NORMAL have matched
 */
static void
word_expand_wild(struct ph_word *p)
{
  struct word *w = p->word;
  int first, last, idx;
  uns accented, old, nonacc_only, pxlen;
  static struct mempool *wild_pool;
  struct wildpatt *patt, *pattu;
  byte *px;
  uns ignore_count = 0;

  /* Find non-wildcard prefix */
  px = p->unacc;
  pxlen = 0;
  while (*px != '*' && *px != '?')
    {
      uns u;
      GET_UTF8(px, u);
      pxlen++;
    }
  if (pxlen < min_wildcard_prefix_len)
    {
      w->status = 114;
      return;
    }

  /* Compile both accented and unaccented pattern */
  accented = strcmp(p->w, p->unacc);
  /* The pool is created on the first call and recycled on the later ones */
  if (!wild_pool)
    wild_pool = mp_new(16384);
  else
    mp_flush(wild_pool);
  patt = wp_compile(p->w, wild_pool);
  if (accented)
    pattu = wp_compile(p->unacc, wild_pool);
  else
    pattu = patt;
  if (!patt || !pattu)
    {
      w->status = 111;
      return;
    }

  uns total_zone_len = 0;
  for (uns ilen=pxlen; ilen<=MAX_WORD_LEN; ilen++)
    {
      /* For each length, find the lexicon zone */
      if (!pxlen)
	{
	  first = current_dbase->lex_by_len[ilen];
	  last = current_dbase->lex_by_len[ilen+1];
	}
      else
	{
	  /*
	   *  Temporarily cut p->unacc after the prefix and search for the
	   *  prefix itself and for the prefix with its last byte incremented;
	   *  the string is restored afterwards.
	   */
	  old = *px;
	  *px = 0;
	  first = lex_find_first(ilen, p->unacc);
	  (*(px-1))++;			/* We know 0xff is not a valid character code */
	  last = lex_find_first(ilen, p->unacc);
	  (*(px-1))--;			/* We know 0xff is not a valid character code */
	  *px = old;
	}
      DBG("Zone for length %d: [%d,%d)", ilen, first, last);
      total_zone_len += last - first;
      if (total_zone_len > max_wildcard_zone)
	{
	  w->status = 113;
	  return;
	}

      /* Scan the zone */
      for (idx=first; idx < last; idx++)
	{
	  byte lex[MAX_WORD_LEN+1], lexu[2*MAX_WORD_LEN+1];

	  nonacc_only = 0;
	  lex_extract(idx, lex);
	  switch (w->options.accent_mode)
	    {
	    case ACCENT_STRIP:
	      lex_extract_noacc(idx, lexu);
	      if (!wp_match(pattu, lexu))
		{
		  DBG("* %d <%s> !strip", idx, lexu);
		  continue;
		}
	      break;
	    case ACCENT_STRICT:
	      if (!wp_match(patt, lex))
		{
		  DBG("* %d <%s> !strict", idx, lex);
		  continue;
		}
	      break;
	    case ACCENT_AUTO:
	      /* Either the accented form matches, or the entry has no accents and matches the unaccented pattern */
	      if (!wp_match(patt, lex))
		{
		  lex_extract_noacc(idx, lexu);
		  if (!wp_match(pattu, lexu))
		    {
		      DBG("* %d <%s> !auto", idx, lex);
		      continue;
		    }
		  if (strcmp(lex, lexu))
		    {
		      DBG("* %d <%s> <%s> !misaccented", idx, lex, lexu);
		      continue;
		    }
		  nonacc_only = 1;
		}
	      break;
	    default:
	      ASSERT(0);
	    }
#ifdef LOCAL_DEBUG
	  lex_extract_noacc(idx, lexu);
	  DBG("* %d <%s> <%s> %d %s", idx, lex, lexu, lex_get(idx)->class, nonacc_only ? "[nonacc]" : "");
#endif
	  if (lex_get(idx)->class == WC_NORMAL)
	    {
	      /* A zero return means that the variant limit has been reached */
	      if (!word_add_variant(w, idx, nonacc_only, QUERY_LANGS, 0, VF_QUERY | VF_ACCENTIFIED))
		return;
	    }
	  else
	    ignore_count++;
	}
    }
  if (ignore_count && clist_empty(&w->variants))
    w->status = 116;
}

/*
 *  Expand all words of a phrase which haven't been expanded yet:
 *  wildcard words by word_expand_wild(), the other ones by basic,
 *  morphological and synonymic expansion (in this order).
 */
static void
word_expand_std(clist *phrase)
{
  struct ph_word *p;

  CLIST_WALK(p, *phrase)
    {
      struct word *w = p->word;

      if (w->expanded)
	continue;
      w->expanded = 1;
      if (w->is_wild)
	{
	  word_expand_wild(p);
	  continue;
	}
      word_expand(p);
      word_expand_morph(p);
      word_expand_synonyma(p);
    }
}

/*** Processing of complexes ***/

#ifdef CONFIG_CONTEXTS

/*
 *  Return the context of variant v of word w: the root part of the ID
 *  for complexes, the context stored in the lexicon entry otherwise.
 */
static inline uns
word_get_context(struct word *w, struct ph_variant *v)
{
  uns root, ctxt;

  if (w->word_class != WC_COMPLEX)
    {
      struct lex_entry *e = lex_get(v->lex_id);
      return GET_CONTEXT(&e->ctxt);
    }
  cplx_dissect_id(v->lex_id, &root, &ctxt);
  return root;
}

/*
 *  Fill in variants of complex w: one for every pair of a root variant
 *  and a context variant which have a language in common. The penalty
 *  is inherited from the root variant.
 */
static void
word_expand_complex(struct word *w, struct ph_word *root, struct ph_word *ctxt, uns dir)
{
  struct word *root_word = root->word;
  struct word *ctxt_word = ctxt->word;
  struct ph_variant *rv, *cv;

  CLIST_WALK(rv, root->word->variants)
    {
      CLIST_WALK(cv, ctxt->word->variants)
	{
	  uns noacc = rv->noaccent_only | cv->noaccent_only;
	  uns lmask = rv->lang_mask & cv->lang_mask;

	  if (lmask)
	    {
	      uns root_ctxt = word_get_context(root_word, rv);
	      uns context_ctxt = word_get_context(ctxt_word, cv);
	      word_add_variant(w, cplx_make_id(root_ctxt, context_ctxt, dir), noacc, lmask, rv->penalty, 0);
	    }
	}
    }
}

/*
 *  Create a phrase word for the complex of root and ctxt. Its text is
 *  "root (context)" for dir != 0 and "(context) root" otherwise, where
 *  the context text is taken from the root of the context word if it
 *  has one. The text is registered by lookup_word() and the resulting
 *  word is expanded unless it has been expanded before.
 */
static struct ph_word *
do_make_complex(struct ph_word *root, struct ph_word *ctxt, uns dir)
{
  byte *root_text = root->w;
  byte *ctxt_text = (ctxt->word->root ? : ctxt->word)->word;
  uns root_len = strlen(root_text);
  uns ctxt_len = strlen(ctxt_text);
  /* Two parentheses, a space and the terminating zero */
  struct ph_word *p = mp_alloc_zero(current_query->pool, sizeof(struct ph_word) + root_len + ctxt_len + 4);
  byte *out = p->w;

  if (dir)
    {
      memcpy(out, root_text, root_len);
      out += root_len;
      *out++ = ' ';
      *out++ = '(';
      memcpy(out, ctxt_text, ctxt_len);
      out += ctxt_len;
      *out++ = ')';
    }
  else
    {
      *out++ = '(';
      memcpy(out, ctxt_text, ctxt_len);
      out += ctxt_len;
      *out++ = ')';
      *out++ = ' ';
      memcpy(out, root_text, root_len);
      out += root_len;
    }
  *out = 0;

  p->pos = root->pos;
  p->simple = root->simple;
  p->unacc = NULL;
  p->prox_after = (dir ? ctxt : root)->prox_after;
  DBG("Searching for complex <%s>", p->w);
  p->idx = lookup_word(current_query, p->simple->raw, p->w);

  struct word *w = &current_query->words[p->idx];
  p->word = w;
  if (!w->expanded)
    {
      w->expanded = 1;
      w->word_class = WC_COMPLEX;
      w->root = root->word;
      word_expand_complex(w, root, ctxt, dir);
    }
  return p;
}

/*
 * Try to condense @root and @ctxt to a complex. Returns the new phrase word
 * or NULL if the complex should not be created.
 *
 * Outside magic mode, the complex is always created and the root word is
 * counted as covered. In magic mode, words belonging to the same simple
 * are condensed as usual (needed by the near matcher), while words of
 * different simples produce a complex marked as magic, which is later added
 * to the simple by word_add_magic_complexes(), and only if magic_complexes
 * is turned on.
 */
static struct ph_word *
word_make_complex(struct ph_word *root, struct ph_word *ctxt, uns dir, uns do_magic)
{
  uns magic_flag = 0;

  if (!do_magic)
    root->word->cover_count++;
  else if (root->simple != ctxt->simple)
    {
      if (!magic_complexes)
	return NULL;

      /* Combine matching senses. Rules:
       * (1) Words have incompatible senses (+ vs. -) => throw away.
       * (2) Use sense of the root [by setting p->simple]
       */
      int root_sense = root->simple->raw->u.match.sense;
      int ctxt_sense = ctxt->simple->raw->u.match.sense;
      if ((root_sense > 0 && ctxt_sense < 0) || (root_sense < 0 && ctxt_sense > 0))
	return NULL;

      /* A root which has been already covered needs no magic complex.
       * Otherwise decrease its use_count, so that it won't be reported
       * as non-indexed.
       */
      if (root->word->cover_count)
	return NULL;
      root->word->use_count--;
      magic_flag = 1;
    }

  /* Look it up as a phrase and set the magic flag. Such a fragment can
   * arise multiple times in a single simple query, but we don't take
   * care of them, they will be merged properly by the boolean optimizer.
   */
  struct ph_word *cplx = do_make_complex(root, ctxt, dir);
  cplx->magic = magic_flag;
  cplx->simple = root->simple;
  return cplx;
}

/*
 * Walk the phrase and replace each context word by a complex built with
 * one of its neighbours. The following word is tried first, provided it is
 * a normal word or there is no preceding word; the preceding word is tried
 * when that did not produce a complex.
 */
static void
word_find_complexes(clist *phrase, uns do_magic)
{
  struct ph_word *cur, *tmp;

  CLIST_WALK_DELSAFE(cur, *phrase, tmp)
    {
      if (cur->word->word_class != WC_CONTEXT)
	{
	  ASSERT(cur->word->word_class == WC_NORMAL);
	}
      else
	{
	  struct ph_word *before = clist_prev(phrase, &cur->n);
	  struct ph_word *after = clist_next(phrase, &cur->n);
	  struct ph_word *cplx = NULL;

	  if (after && (after->word->word_class == WC_NORMAL || !before))
	    cplx = word_make_complex(cur, after, 1, do_magic);
	  if (!cplx && before)
	    cplx = word_make_complex(cur, before, 0, do_magic);
	  if (cplx)
	    {
	      /* The complex takes the place of the context word */
	      clist_insert_before(&cplx->n, &cur->n);
	      clist_remove(&cur->n);
	      cur->word->use_count--;
	    }
	}
    }
}

#endif

/*** Transformation of query expressions for phrases ***/

/*
 * Build the cooked expression of simple query @s from its analysed phrase:
 *
 *   - an empty phrase yields EX_IGNORE,
 *   - a single word yields EX_REF_WORD; the word is marked as outer and
 *     its weight is raised to the weight of the match,
 *   - a longer phrase takes a new slot in q->phrases, fills in word indices
 *     and relative positions and yields EX_REF_PHRASE and'ed with
 *     EX_REF_WORD nodes of all words of the phrase.
 *
 * Running out of phrase slots is reported as error 103, a phrase longer
 * than MAX_PHRASE_LEN as error 110.
 */
static void
word_xform_expr(struct query *q, struct simple *s)
{
  struct expr *e = s->raw;

  if (clist_empty(&s->phrase))		/* The phrase is empty */
    e = new_node(EX_IGNORE);
  else if (!clist_next(&s->phrase, clist_head(&s->phrase))) /* Just a single word */
    {
      struct ph_word *p = clist_head(&s->phrase);
      struct word *w = p->word;
      w->is_outer = 1;
      w->weight = MAX(w->weight, e->u.match.o.weight);
      e = new_node(EX_REF_WORD);
      e->u.ref.index = p->idx;
    }
  else
    {
      struct ph_word *p;
      uns lastpos = 0;			/* Position of the previous word, set by the first iteration */
      uns is_prox = 0;			/* prox_after of the previous word */
      if (q->nphrases >= max_phrases)
	{
	  add_cerr("-103 Too many phrases");
	  eval_err(103);
	}
      struct phrase *phrase = &q->phrases[q->nphrases++];
      bzero(phrase, sizeof(*phrase));
      phrase->weight = e->u.match.o.weight;
      e = new_node(EX_REF_PHRASE);
      e->u.ref.index = q->nphrases-1;
      CLIST_WALK(p, s->phrase)
	{
	  if (phrase->length >= MAX_PHRASE_LEN)
	    {
	      add_cerr("-110 Phrase too complex");
	      eval_err(110);
	    }
	  phrase->word[phrase->length] = p->idx;
	  if (!phrase->length)
	    lastpos = p->pos;		/* The first word has relative position 0 */
	  /* Distance from the previous word, clamped to 4 */
	  phrase->relpos[phrase->length] = MIN(p->pos - lastpos, 4);
	  /* Chain together all occurrences of the same word (indices are stored +1, 0 = none) */
	  phrase->next_same_word[phrase->length] = phrase->word_to_idx[p->idx];
	  phrase->word_to_idx[p->idx] = phrase->length+1;
	  if (is_prox)
	    phrase->prox_map |= 1 << phrase->length;
	  lastpos = p->pos;
	  phrase->length++;
	  struct expr *f = new_node(EX_REF_WORD);
	  f->u.ref.index = p->idx;
	  e = new_op(EX_AND, e, f);
	  is_prox = p->prox_after;
	}
    }
  s->cooked = e;
}

/*** Association of reference chains to words ***/

/*
 * For all words of the phrase which have not been processed yet, append
 * reference chains of all their variants to the chain array of the current
 * query. Variants with an empty chain are skipped.
 */
static void
word_find_refs(clist *phrase)
{
  struct ph_word *pw;

  CLIST_WALK(pw, *phrase)
    {
      struct word *w = pw->word;
      struct ph_variant *var;

      if (w->expanded < 2)
	{
	  w->expanded = 2;		/* Remember that the chains of this word are already collected */
	  CLIST_WALK(var, w->variants)
	    {
	      struct ref_chain *chain = current_query->last_ref;
	      struct lex_entry *entry;

#ifdef CONFIG_CONTEXTS
	      if (w->word_class == WC_COMPLEX)
		entry = cplx_get(var->lex_id);
	      else
#endif
		entry = lex_get(var->lex_id);

	      chain->u.file.start = GET_O(entry->ref_pos);
	      chain->u.file.size = GET_U16(entry->ch_len) << 12;
	      if (chain->u.file.size)
		{
		  chain->word_index = pw->idx;
		  chain->noaccent_only = var->noaccent_only;
		  chain->penalty = var->penalty;
		  chain->lang_mask = var->lang_mask;
		  w->ref_count++;
		  w->ref_total_len += chain->u.file.size;
		  current_query->last_ref++;
		}
	    }
	}
    }
}

/*** Analysis of a single word/phrase match ***/

/*
 * Analyse a single word/phrase match of simple query @s: map its text to
 * words collected in s->phrase, expand them, condense complexes (if contexts
 * are compiled in), build s->cooked and collect reference chains.
 * A text rejected by lm_map_text() is reported as error 115.
 */
static void
word_analyse(struct query *q, struct simple *s)
{
  /* Global state of the mapping pass -- presumably consumed by the lexmap
   * callbacks defined earlier in this file (TODO confirm). */
  ph_current_simple = s;
  ph_current_list = &s->phrase;
  ph_word_for_current_star = NULL;

  clist_init(&s->phrase);
  lm_doc_start();
  if (!lm_map_text(s->raw->u.match.word))
    {
      add_cerr("-115 Word too long");
      eval_err(115);
    }
  word_dump_phrase(&s->phrase, "Initial");
  word_expand_std(&s->phrase);
  word_dump_phrase(&s->phrase, "Expanded");
#ifdef CONFIG_CONTEXTS
  word_find_complexes(&s->phrase, 0);	/* 0 = normal (non-magic) mode */
  word_dump_phrase(&s->phrase, "Complexified");
#endif
  word_xform_expr(q, s);
  word_find_refs(&s->phrase);
}

/*** Magic transformations of simple queries ***/

#ifdef CONFIG_CONTEXTS
/*
 * For each complex marked as magic in the magic phrase, append a new simple
 * referring directly to the complex to the list @simp. The new simple shares
 * the raw expression of the simple the complex belongs to.
 */
static void
word_add_magic_complexes(clist *magic, clist *simp)
{
  struct ph_word *pw;

  CLIST_WALK(pw, *magic)
    {
      if (!pw->magic)
	continue;

      struct simple *extra = mp_alloc_zero(current_query->pool, sizeof(struct simple));
      struct word *cplx = pw->word;

      clist_add_tail(simp, &extra->n);
      extra->raw = pw->simple->raw;
      extra->cooked = new_node(EX_REF_WORD);
      extra->cooked->u.ref.index = pw->idx;

      cplx->is_outer = 1;
      cplx->weight = MAX(cplx->weight, extra->raw->u.match.o.weight);
      cplx->use_count++;
    }
}
#endif

/*
 * Add a near match over all non-negative words of the magic phrase.
 * Nothing is added if there is no free slot in q->nears or if the phrase
 * contains fewer than two countable words: consecutive words of the same
 * simple count once and magic complexes are not counted.
 */
static void
word_add_magic_near(clist *magic)
{
  struct query *q = current_query;
  struct ph_word *cur;
  struct ph_word *last_positive = NULL;
  uns distinct = 0;

  if (q->nnears >= max_nears)
    return;

  CLIST_WALK(cur, *magic)
    {
      if (cur->simple->raw->u.match.sense < 0)
	continue;
      if (!cur->magic && (!last_positive || last_positive->simple != cur->simple))
	distinct++;
      last_positive = cur;
    }
  if (distinct < 2)
    return;

  DBG("Adding near match for %d words", distinct);
  struct phrase *nr = &q->nears[q->nnears++];
  bzero(nr, sizeof(*nr));

  uns prev_pos = 0;
  CLIST_WALK(cur, *magic)
    {
      if (cur->simple->raw->u.match.sense < 0 || nr->length >= MAX_PHRASE_LEN)
	continue;

      uns slot = nr->length;
      nr->word[slot] = cur->idx;
      nr->relpos[slot] = MIN(cur->pos - prev_pos, 4);
      prev_pos = cur->pos;
      /* Chain together occurrences of the same word (indices are stored +1) */
      nr->next_same_word[slot] = nr->word_to_idx[cur->idx];
      nr->word_to_idx[cur->idx] = slot+1;
      nr->length++;
      cur->word->use_count++;
    }
}

/*
 * Try to merge all words of the magic phrase to a single word: concatenate
 * their unaccented forms and add the result as a new simple to @simp.
 *
 * The merge is abandoned if any word is negative, contains wildcards or
 * demands strict accents while having some, if the result would exceed
 * MAX_WORD_LEN, if there are fewer than two words, if the summed weight is
 * not positive or if no word class remains after intersecting the classmaps
 * with magic_merge_classes.
 */
static void
word_add_magic_merges(clist *magic, clist *simp)
{
  struct ph_word *p;
  byte merged[MAX_WORD_LEN+1];
  uns merged_len = 0;
  uns nwords = 0;
  int weight_sum = 0;
  uns classes = magic_merge_classes;

  CLIST_WALK(p, *magic)
    {
      struct expr *raw = p->simple->raw;
      uns part_len = strlen(p->unacc);

      if (raw->u.match.sense < 0 || p->word->is_wild)
	return;
      if (raw->u.match.o.accent_mode == ACCENT_STRICT && strcmp(p->w, p->unacc))
	return;
      if (merged_len + part_len > MAX_WORD_LEN)
	return;

      memcpy(merged + merged_len, p->unacc, part_len);
      merged_len += part_len;
      weight_sum = weight_sum + raw->u.match.o.weight;
      classes &= raw->u.match.classmap;
      nwords++;
    }
  if (nwords < 2 || weight_sum <= 0 || !classes)
    return;
  merged[merged_len] = 0;

  if (TRACE)
    add_cr(".U <%s>", merged);

  /* Construct a match expression for the merged word... */
  struct expr *e = new_node(EX_MATCH);
  e->u.match.o.weight = weight_sum + magic_merge_bonus;
  e->u.match.o.accent_mode = ACCENT_STRICT;
  e->u.match.word = mp_alloc(current_query->pool, merged_len+1);
  memcpy(e->u.match.word, merged, merged_len+1);
  e->u.match.classmap = classes;

  /* ...and analyse it as if it were an ordinary simple */
  struct simple *s = mp_alloc_zero(current_query->pool, sizeof(*s));
  s->raw = e;
  word_analyse(current_query, s);

  p = clist_head(&s->phrase);
  if (p)
    {
      p->word->hide_count++;
      clist_add_tail(simp, &s->n);
    }
}

/*
 * Perform the magic transformations of a simple query: map the texts of all
 * non-string simples of @simp to a single temporary phrase and, depending on
 * the magic_* switches, add merged words, magic complexes and a near match.
 */
static void
word_apply_magic(clist *simp)
{
  struct simple *s;
  clist phrase;
  struct ph_word *p;

  /* Collect the words of all simples in one phrase */
  clist_init(&phrase);
  ph_current_list = &phrase;
  lm_doc_start();
  CLIST_WALK(s, *simp)
    if (!s->raw->u.match.is_string)
      {
	ph_current_simple = s;
	ph_word_for_current_star = NULL;
	/* The result is ignored here; word_analyse() checks it when it maps the same text */
	lm_map_text(s->raw->u.match.word);
      }
  word_dump_phrase(&phrase, "Initial magic");
  /* Merging works on the unexpanded phrase, so it goes first */
  if (magic_merge_words)
    word_add_magic_merges(&phrase, simp);
  if (magic_complexes || magic_near)
    {
      word_expand_std(&phrase);
      word_dump_phrase(&phrase, "Expanded");
#ifdef CONFIG_CONTEXTS
      word_find_complexes(&phrase, 1);	/* 1 = magic mode */
      word_dump_phrase(&phrase, "Complexified");
#endif
      word_find_refs(&phrase);
#ifdef CONFIG_CONTEXTS
      if (magic_complexes)
	word_add_magic_complexes(&phrase, simp);
#endif
      if (magic_near)
	word_add_magic_near(&phrase);
    }
  /* Presumably drops the use the temporary phrase itself holds on each word (TODO confirm) */
  CLIST_WALK(p, phrase)
    p->word->use_count--;
}

/*** Main entry point: analysis of simple queries ***/

/*
 * Analyse all word/phrase matches of the simple query given by list @l.
 * If the database has no lexicon, every non-string match is cooked to
 * EX_NONE. Otherwise each match is analysed separately and if there are
 * at least two of them and any magic is enabled, the magic transformations
 * are applied to the whole list.
 */
void
word_analyse_simple(struct query *q, clist *l)
{
  struct simple *s;
  uns analysed = 0;

  if (q->dbase->lex_array)
    {
      /* Reset per-word flag for the words already known */
      for (uns k=0; k<q->nwords; k++)
	q->words[k].cover_count = 0;

      /* Analyse individual words/phrases */
      CLIST_WALK(s, *l)
	{
	  if (s->raw->u.match.is_string)
	    continue;
	  word_analyse(q, s);
	  analysed++;
	}

      /* Scan the whole simple query as a phrase and perform all the magic tricks */
      if (analysed >= 2 && (magic_complexes || magic_near || magic_merge_words))
	word_apply_magic(l);
    }
  else
    {
      CLIST_WALK(s, *l)
	{
	  if (!s->raw->u.match.is_string)
	    s->cooked = new_node(EX_NONE);
	}
    }
}

/*** Highlighting of all known word variants ***/

void
word_add_hilites(struct query *q, struct word *w)
{
  struct ph_variant *v;

  if (w->is_string || w->word_class == WC_COMPLEX)
    return;
  CLIST_WALK(v, w->variants)
    {
      byte buf[MAX_WORD_LEN+1];
      lex_extract(v->lex_id, buf);
      add_hilited_word(q, buf);
    }
}

/*** Spelling checker ***/

#ifdef CONFIG_SPELL

/* Size of spell_best[]; spell_check() clamps the requested number of suggestions to it */
#define HARD_MAX_SPELLS 16

/* How a lexicon word found by the checker relates to the word being spelled */
enum spell_found {
  SPELL_FOUND_ADD,			/* Lexicon word is one character longer */
  SPELL_FOUND_DEL,			/* Lexicon word is one character shorter */
  SPELL_FOUND_MOD,			/* Same length, one character differs */
  SPELL_FOUND_XPOS			/* Two adjacent characters are swapped */
};

/* A single suggestion */
struct spell_best {
  uns id, unacc_id;			/* Lexicon ID of the word and of the first word with the same unaccented suffix match */
  int pts;				/* Similarity points, higher is better */
};

/* State of the checker, valid while a single word is being spelled */
static struct spell_best spell_best[HARD_MAX_SPELLS];	/* Suggestions ordered by decreasing pts */
static uns spell_max, spell_n, spell_threshold, spell_accent_mode; /* Limit and current count of suggestions, minimum frequency, accent mode of the word */
static uns *spell_known_vars, spell_n_known_vars;	/* Lexicon IDs of all variants already present in the query */
static byte *spell_wa;					/* The word being spelled, including accents */

/*
 * Penalty for replacing character @x by @y: zero for identical characters,
 * spell_accent_penalty if they differ only in accents, spell_common_penalty
 * if their unaccented forms are listed in spell_common_pairs (in either
 * order) and spell_mod_penalty otherwise.
 */
static inline int
spell_diff_char(uns x, uns y)
{
  struct spell_pair *pair;

  if (x == y)
    return 0;

  uns base_x = Uunaccent(x);
  uns base_y = Uunaccent(y);
  if (base_x == base_y)
    return spell_accent_penalty;

  pair = spell_common_pairs;
  while (pair)
    {
      if ((base_x == pair->x && base_y == pair->y) || (base_x == pair->y && base_y == pair->x))
	return spell_common_penalty;
      pair = pair->next;
    }
  return spell_mod_penalty;
}

/*
 * Sum of character penalties between a fragment of lexicon word @l starting
 * at character @li and a fragment of the spelled word (spell_wa) starting at
 * character @oi. At most @len characters are compared and the comparison
 * stops at the end of the lexicon word.
 */
static int PURE
spell_differs(struct lex_entry *l, uns li, uns oi, uns len)
{
  byte *lex_pos = l->w;
  byte *lex_end = lex_pos + l->length;
  byte *orig_pos = spell_wa;
  int total = 0;

  for (; li; li--)
    UTF8_SKIP(lex_pos);
  for (; oi; oi--)
    UTF8_SKIP(orig_pos);
  for (; len && lex_pos < lex_end; len--)
    {
      uns lex_ch, orig_ch;
      GET_UTF8(lex_pos, lex_ch);
      GET_UTF8(orig_pos, orig_ch);
      total += spell_diff_char(lex_ch, orig_ch);
    }
  return total;
}

/*
 * Record lexicon word @id, which has been found by the spelling checker
 * without accents, as a candidate suggestion.
 *
 * @unacc_id identifies the group of words sharing the same unaccented form,
 * @pos is the prefix length passed to spell_check_rest() and @found_what
 * tells which kind of edit has led to the word:
 *
 *   ADD   the extra character of the lexicon word is at index pos-1
 *   DEL   the character missing in the lexicon word was at index pos
 *   MOD   the whole words are compared
 *   XPOS  the swapped characters are at indices pos and pos+1
 *
 * Words with frequency below spell_threshold, words clearly worse than the
 * worst suggestion of a full list and words already present in the query
 * are dropped. Otherwise the word is inserted to spell_best[], which is
 * kept ordered by decreasing points and limited to spell_max entries.
 */
static void
spell_found(uns id, uns unacc_id, uns pos, enum spell_found found_what)
{
  struct lex_entry *l = lex_get(id);
  uns i;
  int pts;
  byte *msg;

  if (l->freq < spell_threshold)
    {
      msg = "freq < threshold";
      pts = 0;
      goto done;
    }

  /* Calculate similarity points: the parts of the words which correspond to
   * each other are charged for accent and character differences.
   */
  pts = l->freq * 100;
  switch (found_what)
    {
    case SPELL_FOUND_ADD:
      pts -= spell_add_penalty + spell_differs(l, 0, 0, pos-1) + spell_differs(l, pos, pos-1, ~0U);
      break;
    case SPELL_FOUND_DEL:
      pts -= spell_del_penalty + spell_differs(l, 0, 0, pos) + spell_differs(l, pos, pos+1, ~0U);
      break;
    case SPELL_FOUND_MOD:
      pts -= spell_differs(l, 0, 0, ~0U);
      break;
    case SPELL_FOUND_XPOS:
      /* The swapped pair occupies [pos,pos+1], so compare the prefix before it
       * and the rest after it. (Using pos-2 and pos here would wrap around for
       * pos<2 and charge the swapped characters as modifications.)
       */
      pts -= spell_xpos_penalty + spell_differs(l, 0, 0, pos) + spell_differs(l, pos+2, pos+2, ~0U);
      break;
    }

  if (spell_n == spell_max && spell_best[spell_n-1].pts >= pts)
    msg = "clearly pessimal";
  else
    {
      for (i=0; i<spell_n_known_vars; i++)
	if (spell_known_vars[i] == id)
	  {
	    msg = "found in expansions";
	    goto done;
	  }
      msg = "recorded";

      /* At most one suggestion per word (and per unaccented form unless accents are strict) */
      for (i=0; i<spell_n; i++)
	if (spell_best[i].id == id ||
	    spell_accent_mode != 2 && spell_best[i].unacc_id == unacc_id)
	  {
	    if (pts < spell_best[i].pts)
	      {
		msg = "kept old match";
		goto done;
	      }
	    memmove(&spell_best[i], &spell_best[i+1], sizeof(struct spell_best) * (spell_n - i - 1));
	    spell_n--;
	    msg = "replaced old match";
	    break;
	  }

      /* Find the position to insert to */
      for (i=0; i<spell_n && spell_best[i].pts > pts; i++)
	;

      /* Make room: shift the entries ranked below the new one. If the list
       * is full, the last entry falls off, so one entry less is moved --
       * never touch spell_best[spell_max]. A full list gets here only when
       * its last entry is worse than the new one, hence i < spell_n.
       */
      uns shift = spell_n - i;
      if (spell_n < spell_max)
	spell_n++;
      else
	shift--;
      memmove(&spell_best[i+1], &spell_best[i], sizeof(struct spell_best) * shift);
      spell_best[i].id = id;
      spell_best[i].unacc_id = unacc_id;
      spell_best[i].pts = pts;
    }
 done:
  if (TRACE)
    {
      byte buf[MAX_WORD_LEN+1];
      lex_extract(id, buf);
      add_cr(".Z <%s> freq=%d pts=%d: %s", buf, l->freq, pts, msg);
    }
#ifdef LOCAL_DEBUG
  byte buf[MAX_WORD_LEN+1];
  lex_extract(id, buf);
  DBG("Found <%s> freq=%d pts=%d what=%d@%d: %s", buf, l->freq, pts, found_what, pos, msg);
#endif
}

/*
 * Store the unaccented form of lexicon word @id without its first
 * @prefix_len characters to @buf as a zero-terminated string.
 */
static void
spell_extract_suffix(int id, uns prefix_len, byte *buf)
{
  struct lex_entry *entry = lex_get(id);
  byte *src = entry->w;
  byte *src_end = entry->w + entry->length;

  for (; prefix_len; prefix_len--)
    UTF8_SKIP(src);
  while (src < src_end)
    {
      uns ch;
      GET_UTF8(src, ch);
      ch = Uunaccent(ch);
      PUT_UTF8(buf, ch);
    }
  *buf = 0;
}

/*
 * Given an interval [l,r] of words with a common prefix, report all words
 * whose unaccented form after removing the prefix equals @suffix. The first
 * such word is located by binary search, the rest follows it immediately.
 * All of them are reported with the ID of the first one as unacc_id.
 */
static void
spell_check_rest(int l, int r, uns prefix_len, byte *suffix, enum spell_found found_what)
{
  byte cand[2*MAX_WORD_LEN+1];
  int lo = l, hi = r;

  DBG("\t\t\tSearching [%d,%d] for <%s>, prefix_len=%d", l, r, suffix, prefix_len);

  /* Invariant: the first matching word, if any, lies in [lo,hi] */
  while (lo < hi)
    {
      int mid = (lo+hi)/2;
      spell_extract_suffix(mid, prefix_len, cand);
      if (strcmp(cand, suffix) >= 0)
	hi = mid;
      else
	lo = mid+1;
    }

  for (int cur = lo; cur <= r; cur++)
    {
      spell_extract_suffix(cur, prefix_len, cand);
      if (strcmp(cand, suffix))
	break;
      spell_found(cur, lo, prefix_len, found_what);
    }
}

/* Return the unaccented form of character number @n (counted from zero) of lexicon word @id */
static inline uns
spell_nth_char(int id, uns n)
{
  byte *pos = lex_get(id)->w;
  uns ch;

  while (n--)
    UTF8_SKIP(pos);
  GET_UTF8(pos, ch);
  return Uunaccent(ch);
}

/*
 * Given an interval [l,r] of words with a common prefix, return the first
 * word whose character following the prefix is >= c, or r+1 if there is no
 * such word. The search first doubles its step and then halves it again,
 * which works well when the result is close to l.
 */
static int
spell_find_char(int l, int r, uns prefix_len, uns c)
{
  int step = 1;

  /* Grow the step until l+step leaves the interval or reaches a char >= c */
  while (l+step <= r && spell_nth_char(l+step, prefix_len) < c)
    step *= 2;

  /* Now the result lies in [l,l+step] (l+step might be out of range), narrow it down */
  while (step > 1)
    {
      step /= 2;
      if (l+step <= r && spell_nth_char(l+step, prefix_len) < c)
	l += step;
    }

  /* A single candidate has remained, step over it if it is still too small */
  if (l <= r && spell_nth_char(l, prefix_len) < c)
    l++;
  return l;
}

/*
 * Shrink interval [*ll,*rr] of words with a common prefix to the words
 * which have the prefix followed by character @nextc. The interval becomes
 * empty (*rr < *ll) if there are no such words.
 */
static inline void
spell_restrict(int *ll, int *rr, uns prefix_len, uns nextc)
{
  int first = spell_find_char(*ll, *rr, prefix_len, nextc);
  int beyond = spell_find_char(first, *rr, prefix_len, nextc+1);

  *ll = first;
  *rr = beyond - 1;
}

/*
 * Given an interval [l,r] of words with a common prefix, return the first
 * word whose character just after the prefix is greater than that of word l;
 * r+1 if there is no such word.
 */
static inline int
spell_skip_char(int l, uns prefix_len, int r)
{
  return spell_find_char(l, r, prefix_len, spell_nth_char(l, prefix_len) + 1);
}

/*
 * Find all lexicon words which arise from the unaccented word @wu of @len
 * characters by deleting a single character.
 *
 * [l,r] is the interval of words of length len-1 sharing the first i
 * characters with @wu. In step i, the words which continue with the rest
 * of @wu after character i are reported, and then the interval is
 * restricted to the words which have character i in its place.
 */
static void
spell_check_del(byte *wu, uns len)
{
  int l = current_dbase->lex_by_len[len-1];
  int r = current_dbase->lex_by_len[len]-1;
  DBG("check_del: length=%d, interval=[%d,%d]", len-1, l, r);
  for (uns i=0; i<len && l<=r; i++)
    {
      uns u;
      GET_UTF8(wu, u);			/* wu now points after the deleted character */
      DBG("\ti=%d (char %x) [%d,%d]", i, u, l, r);
      spell_check_rest(l, r, i, wu, SPELL_FOUND_DEL);
      /* The candidates have only len-1 characters, so there is no character
       * number len-1 to look at and the interval is not needed after the last
       * step anyway. Restricting there would read behind the end of the words.
       */
      if (i+1 < len)
	spell_restrict(&l, &r, i, u);
    }
}

/*
 * Find all lexicon words which arise from the unaccented word @wu of @len
 * characters by inserting a single character at any position, including
 * the end of the word.
 *
 * [l,r] is the interval of words of length len+1 sharing the first i
 * characters with @wu. In step i, the interval is split to groups by
 * the character at position i and in each group, the words which continue
 * with the rest of @wu are reported.
 */
static void
spell_check_add(byte *wu, uns len)
{
  int l = current_dbase->lex_by_len[len+1];
  int r = current_dbase->lex_by_len[len+2]-1;
  uns i = 0;

  DBG("check_add: length=%d, interval=[%d,%d]", len+1, l, r);
  while (i <= len && l <= r)
    {
      byte *rest = wu;			/* The part of the word following the inserted character */
      uns u;
      GET_UTF8(wu, u);
      DBG("\ti=%d (char %x) [%d,%d]", i, u, l, r);
      for (int grp = l; grp <= r; )
	{
	  int grp_end = spell_skip_char(grp, i, r);
	  spell_check_rest(grp, grp_end-1, i+1, rest, SPELL_FOUND_ADD);
	  grp = grp_end;
	}
      if (u)				/* Not at the end of the word yet */
	spell_restrict(&l, &r, i, u);
      i++;
    }
}

/*
 * Find all lexicon words of the same length as the unaccented word @wu
 * (@len characters) which match it with an arbitrary character at a single
 * position.
 *
 * [l,r] is the interval of words of length len sharing the first i
 * characters with @wu. In step i, the interval is split to groups by
 * the character at position i and in each group, the words which continue
 * with the rest of @wu after character i are reported.
 */
static void
spell_check_mod(byte *wu, uns len)
{
  int l = current_dbase->lex_by_len[len];
  int r = current_dbase->lex_by_len[len+1]-1;
  uns i = 0;

  DBG("check_mod: length=%d, interval=[%d,%d]", len, l, r);
  while (i < len)
    {
      uns u;
      GET_UTF8(wu, u);			/* wu now points after the modified character */
      DBG("\ti=%d (char %x) [%d,%d]", i, u, l, r);
      for (int grp = l; grp <= r; )
	{
	  int grp_end = spell_skip_char(grp, i, r);
	  spell_check_rest(grp, grp_end-1, i+1, wu, SPELL_FOUND_MOD);
	  grp = grp_end;
	}
      spell_restrict(&l, &r, i, u);
      i++;
    }
}

/*
 * Find all lexicon words of the same length as the unaccented word @wu
 * (@len characters) which match it after swapping two adjacent characters.
 *
 * [l,r] is the interval of words of length len sharing the first i
 * characters with @wu. In step i, characters i and i+1 are swapped in place
 * in the buffer, the words continuing with the modified rest are reported
 * and the buffer is restored. The buffer must therefore be writable.
 */
static void
spell_check_xpos(byte *wu, uns len)
{
  int l = current_dbase->lex_by_len[len];
  int r = current_dbase->lex_by_len[len+1]-1;
  DBG("check_xpos: length=%d, interval=[%d,%d]", len, l, r);
  for (uns i=0; i<len-1; i++)
    {
      byte *t;
      uns u1, u2;
      /* Read the pair of characters at the current position... */
      t = wu;
      GET_UTF8(t, u1);
      GET_UTF8(t, u2);
      /* ...and write it back in reverse order */
      t = wu;
      PUT_UTF8(t, u2);
      PUT_UTF8(t, u1);
      spell_check_rest(l, r, i, wu, SPELL_FOUND_XPOS);
      /* Restore the original order and advance wu behind the first character */
      t = wu;
      PUT_UTF8(t, u1);
      wu = t;
      PUT_UTF8(t, u2);
      spell_restrict(&l, &r, i, u1);
    }
}

/*
 * Run the spelling checker on word number @idx of the current query.
 * The suggestions are left in spell_best[0..spell_n-1]; the caller is
 * expected to set spell_max and to reset spell_n beforehand.
 */
static void
spell_word(uns idx)
{
  struct word *w = &current_query->words[idx];
  struct ph_variant *var;
  uns best_freq = 0;

  /* Frequency of the word is the maximum over all its variants */
  CLIST_WALK(var, w->variants)
    best_freq = MAX(best_freq, lex_get(var->lex_id)->freq);
  if (best_freq >= spell_good_freq || best_freq + spell_margin > 256)
    {
      if (TRACE)
	add_cr(".Z Spelling <%s> (freq %d): too frequent to consider", w->word, best_freq);
      return;
    }

  /* Dwarves use a fixed threshold, other words must be beaten by the margin */
  if (best_freq < spell_dwarf)
    spell_threshold = spell_dwarf_margin;
  else
    spell_threshold = best_freq + spell_margin;

  /* Prepare unaccented word and check length */
  byte wu[MAX_WORD_LEN+3];
  spell_wa = w->word;
  if (!word_unaccent_utf8(spell_wa, wu))
    return;
  uns len = utf8_strlen(wu);
  if (len < spell_min_len)
    return;
  spell_accent_mode = w->options.accent_mode;

  if (TRACE)
    add_cr(".Z Spelling <%s> (freq %d): with freq_threshold=%d and accent_mode=%d", wu, best_freq, spell_threshold, spell_accent_mode);

  /* Check all possible variants with edit distance 1 and then transpositions */
  spell_check_del(wu, len);
  spell_check_add(wu, len);
  spell_check_mod(wu, len);
  spell_check_xpos(wu, len);

  /* If the word is a dwarf, we should consider restarting the search with a lower threshold
   * if there are no matches, but we'll better do it in a single pass by clever filtering:
   * keep only the matches passing the regular threshold, unless none of them does.
   */
  if (best_freq < spell_dwarf && spell_n > 0)
    {
      uns kept = 0;
      for (uns src = 0; src < spell_n; src++)
	if (lex_get(spell_best[src].id)->freq >= best_freq + spell_margin)
	  spell_best[kept++] = spell_best[src];
      if (kept > 0)
	spell_n = kept;
    }
}

void
spell_check(struct query *q)
{
  uns i, j, spell_requested=0;

  /* Prepare list of IDs of all known variants */
  spell_n_known_vars = 0;
  for (i=0; i<q->nwords; i++)
    {
      struct word *w = &q->words[i];
      if (!w->is_string && w->word_class != WC_COMPLEX)
	{
	  spell_n_known_vars += w->var_count;
	  spell_requested += w->options.spelling;
	}
    }
  if (!spell_requested)
    return;
  spell_known_vars = alloca(sizeof(uns) * spell_n_known_vars);
  j = 0;
  for (i=0; i<q->nwords; i++)
    {
      struct word *w = &q->words[i];
      struct ph_variant *v;
      if (!w->is_string && w->word_class != WC_COMPLEX)
	CLIST_WALK(v, w->variants)
	  spell_known_vars[j++] = v->lex_id;
    }
  ASSERT(j == spell_n_known_vars);

  /* Check all words from the query */
  for (i=0; i<q->nwords; i++)
    {
      struct word *w = &q->words[i];
      if (!w->is_string &&
	  w->use_count > w->hide_count && /* FIXME: Beware of WC_CONTEXT which can have use_count==0 after complexification */
	  w->word_class != WC_COMPLEX &&
	  !w->is_wild &&
	  w->options.spelling)
	{
	  spell_max = MIN(w->options.spelling, HARD_MAX_SPELLS);
	  spell_n = 0;
	  spell_word(i);
	  for (j=0; j<spell_n; j++)
	    {
	      byte wbuf[MAX_WORD_LEN+1];
	      lex_extract(spell_best[j].id, wbuf);
	      add_cr("S%s %s %d", wbuf, q->words[i].word, spell_best[j].pts);
	    }
	}
    }
}

#else

/* The spelling checker is not compiled in (CONFIG_SPELL undefined), so there is nothing to do */
void spell_check(struct query *q UNUSED)
{
}

#endif

/*** Initialization ***/

static uns words_inited;		/* Number of databases with a lexicon initialized so far */

/*
 * Initialize word matching for database @db. Databases without a lexicon
 * file are left alone. The global parts (lexmap, word exceptions and
 * optionally stemmers) are set up only on the first call, the lexicon and
 * optionally stems are loaded for each database.
 */
void
words_init(struct database *db)
{
  if (db->fn_lexicon)
    {
      uns first_call = !words_inited;
      words_inited++;
      if (first_call)
	{
	  lm_init();
	  word_exc_init();
#ifdef CONFIG_LANG
	  lang_init_stemmers();
#endif
	}
      lex_load(db);
#ifdef CONFIG_LANG
      stems_load(db);
#endif
    }
}
