/*
 *	Sherlock Indexer -- Character Class Table for Lexical Mapping
 *
 *	(c) 2003 Martin Mares <mj@ucw.cz>
 */

#include "sherlock/sherlock.h"
#include "lib/conf.h"
#include "lib/chartype.h"
#include "lib/unicode.h"
#include "charset/unicat.h"
#include "indexer/alphabet.h"

#include <string.h>

/*
 *  Character class of every 16-bit character code, indexed by the code.
 *  Entries are written by set_chars() while the configuration is parsed.
 */
byte alpha_class[65536];

/*
 *  Parse exactly four hexadecimal digits starting at c.
 *
 *  Returns the numeric value of the digits, or -1 as soon as one of the
 *  four characters turns out not to be a hexadecimal digit. Characters
 *  behind the first offending one are never examined.
 */
static int
scan_hex(byte *c)
{
  int value = 0;

  for (int left = 4; left > 0; left--, c++)
    {
      if (!Cxdigit(*c))
	return -1;
      value = value * 16 + Cxvalue(*c);
    }
  return value;
}

/*
 *  Store a character class into alpha_class[] for every code in the
 *  inclusive range min..max. Nothing is written when min > max.
 *
 *  When class is AC_INHERIT, the class is not stored as is; instead each
 *  code gets a class chosen according to its Ucategory() flags.
 */
static void
set_chars(int min, int max, uns class)
{
  for (int code = min; code <= max; code++)
    {
      uns resolved = class;

      if (class == AC_INHERIT)
	{
	  uns cat = Ucategory(code);

	  /* The flags are tested in this order and the first match wins. */
	  resolved = (cat & _U_LETTER)   ? AC_ALPHA
		   : (cat & _U_DIGIT)    ? AC_DIGIT
		   : (cat & _U_SPACE)    ? AC_SPACE
		   : (cat & _U_LIGATURE) ? AC_LIGATURE
		   : AC_PUNCT;
	}
      alpha_class[code] = resolved;
    }
}

/*
 *  Configuration callback shared by all character class items.
 *
 *  arg is split into words by wordsplit(); each word describes one
 *  character or one inclusive range in any of these forms:
 *
 *	X		a single character, taken as its byte value
 *	X-Y		a range of byte values
 *	hhhh		a single code given as four hex digits
 *	hhhh-hhhh	a range of codes given as four hex digits each
 *
 *  The class to assign is chosen by the fifth character of the name of
 *  the item being set (c->name[4]).
 *
 *  Returns NULL on success or an error message. Words preceding the
 *  faulty one have already been applied when an error is returned.
 */
static byte *
alphabet_conf_class(struct cfitem *c, byte *arg)
{
  byte *words[64];
  uns class;

  int nwords = wordsplit(arg, words, ARRAY_SIZE(words));
  if (nwords < 0)
    return "Too many classes specified in a single line";

  /* 'r' = Inherit, 'e' = Space, 'a' = Alpha, 't' = Punct, 'k' = Break, 'l' = Singleton */
  switch (c->name[4])
    {
    case 'r': class = AC_INHERIT; break;
    case 'e': class = AC_SPACE; break;
    case 'a': class = AC_ALPHA; break;
    case 't': class = AC_PUNCT; break;
    case 'k': class = AC_BREAK; break;
    case 'l': class = AC_SINGLETON; break;
    default: ASSERT(0);
    }

  for (int k = 0; k < nwords; k++)
    {
      byte *tok = words[k];
      int len = strlen(tok);
      int lo, hi;

      /* The length of the word decides which of the four forms it can be. */
      switch (len)
	{
	case 1:
	  lo = hi = tok[0];
	  break;
	case 3:
	  if (tok[1] != '-')
	    return "Invalid character range";
	  lo = tok[0];
	  hi = tok[2];
	  break;
	case 4:
	  lo = hi = scan_hex(tok);
	  break;
	case 9:
	  if (tok[4] != '-')
	    return "Invalid character range";
	  lo = scan_hex(tok);
	  hi = scan_hex(tok + 5);
	  break;
	default:
	  return "Invalid character range";
	}

      /* scan_hex() signals malformed digits by a negative value. */
      if (lo < 0 || hi < 0 || lo > hi)
	return "Invalid character code or range";
      set_chars(lo, hi, class);
    }
  return NULL;
}

/*
 *  Configuration items of the "Alphabet" section.
 *
 *  All class items share the alphabet_conf_class() handler, which tells
 *  them apart by the fifth character of the item name. When adding an
 *  item, make sure that character stays unique and extend the switch in
 *  the handler accordingly.
 */
static struct cfitem alphabet_config[] = {
  { "Alphabet",		CT_SECTION,	NULL },
  { "Inherit",		CT_FUNCTION,	alphabet_conf_class },
  { "Space",		CT_FUNCTION,	alphabet_conf_class },
  { "Alpha",		CT_FUNCTION,	alphabet_conf_class },
  { "Punct",		CT_FUNCTION,	alphabet_conf_class },
  { "Break",		CT_FUNCTION,	alphabet_conf_class },
  { "Singleton",	CT_FUNCTION,	alphabet_conf_class },
  { NULL,		CT_STOP,	NULL }
};

/*
 *  Hand the alphabet_config table over to cf_register().
 *  NOTE(review): CONSTRUCTOR presumably makes this run automatically
 *  before main() -- confirm against the macro's definition.
 */
static void CONSTRUCTOR
alphabet_init(void)
{
  cf_register(alphabet_config);
}
