/*
 *	Sherlock Language Processing Library -- Basic functions
 *
 *	(c) 2003 Martin Mares <mj@ucw.cz>
 */

#include "sherlock/sherlock.h"
#include "lib/conf.h"
#include "lib/chartype.h"
#include "lang/lang.h"

#include <string.h>

/* Configuration */

/*
 * One name of a configured language. Every word of a `Language'
 * declaration gets its own record; all records created by the same
 * declaration share one id.
 */
struct lang_name {
  node n;			/* Link in the lang_names list */
  uns id;			/* Code of the language this name belongs to */
  byte name[1];			/* NUL-terminated name; the record is allocated with strlen(name) extra bytes */
};

/* All language names (struct lang_name) appended by the `Language' handler */
static list lang_names;
/* Records appended by the `Stemmer' and `SynDict' handlers */
list stemmer_list, syndict_list;
/* For each language code, the record of the first name given in its declaration */
static struct lang_name *lang_canon_names[MAX_LANGUAGES];
/*
 * Code the next `Language' declaration will receive. It starts at 1, so
 * code 0 is never assigned and lang_code_to_name() reports it as unknown.
 */
uns lang_count = 1;

/*
 * Configuration handler for `Language <name> [<alias> ...]'.
 *
 * The argument is split into at most 32 words. All of them become names
 * of a newly assigned language code (the current value of lang_count);
 * the first word is remembered as the canonical name of that code.
 *
 * Returns NULL on success or an error message.
 */
static byte *
lang_uage_cf(struct cfitem *c UNUSED, byte *arg)
{
  byte *names[32];
  int cnt = wordsplit(arg, names, ARRAY_SIZE(names));

  if (cnt <= 0)
    return "1 to 32 fields expected";
  if (lang_count >= MAX_LANGUAGES)
    return "Too many languages defined";

  for (int k = 0; k < cnt; k++)
    {
      /* name[1] in the struct already accounts for the terminating NUL */
      struct lang_name *entry = cfg_malloc(sizeof(struct lang_name) + strlen(names[k]));

      entry->id = lang_count;
      strcpy(entry->name, names[k]);
      if (k == 0)
	lang_canon_names[lang_count] = entry;
      add_tail(&lang_names, &entry->n);
    }

  /* The code is consumed only after all its names have been recorded */
  lang_count++;
  return NULL;
}

/*
 * Names of the stemming algorithms, produced by stringizing the argument
 * of every STEMMER() invocation in lang/stemmers.h (that header is
 * expected to consist of such invocations -- it is not visible here).
 * The index of a name in this table is the value lang_stemmer_cf()
 * stores as the `id' of a stemmer.
 */
static byte *stemmer_names[] = {
#define STEMMER(name) #name,
#include "lang/stemmers.h"
#undef STEMMER
};

/*
 * Configuration handler for `Stemmer <language> <algorithm> [<params>]'.
 *
 * <language> must be a name known to lang_name_to_code(), <algorithm> is
 * matched case-insensitively against stemmer_names[]. When <params> is
 * missing, an empty string is used.
 *
 * If a stemmer with the same algorithm and identical parameters is
 * already on stemmer_list, only the bit of the language is added to its
 * lang_mask. Otherwise a new record is appended to the list.
 *
 * Returns NULL on success or an error message.
 */
static byte *
lang_stemmer_cf(struct cfitem *c UNUSED, byte *arg)
{
  byte *w[3];
  struct stemmer *st, *t;
  int n, lang;
  unsigned int id;

  if ((n = wordsplit(arg, w, 3)) < 2)
    return "Malformed stemmer declaration";
  if ((lang = lang_name_to_code(w[0])) < 0)
    return "Unknown language";

  /* Validate everything before touching the configuration pool */
  for (id = 0; id < ARRAY_SIZE(stemmer_names); id++)
    if (!strcasecmp(w[1], stemmer_names[id]))
      break;
  if (id >= ARRAY_SIZE(stemmer_names))
    return "Unknown stemmer algorithm";

  byte *params = (n > 2) ? w[2] : (byte *)"";

  WALK_LIST(t, stemmer_list)
    if (t->id == id && !strcmp(t->params, params))
      {
	/* Unsigned shift: a signed `1 << 31' would be undefined behaviour */
	t->lang_mask |= 1U << lang;
	return NULL;
      }

  /* No matching stemmer yet, so only now is a new record needed */
  st = cfg_malloc(sizeof(struct stemmer));
  st->lang_mask = 1U << lang;
  /* NOTE(review): name and params point at the words produced by wordsplit(); confirm they outlive the handler */
  st->name = w[1];
  st->id = id;
  st->params = params;
  add_tail(&stemmer_list, &st->n);
  return NULL;
}

/*
 * Configuration handler for `SynDict <language> <name>'.
 *
 * Exactly two words are required. <language> must be a name known to
 * lang_name_to_code(). If an entry called <name> is already on
 * syndict_list, only the bit of the language is added to its lang_mask.
 * Otherwise a new record is appended to the list.
 *
 * Returns NULL on success or an error message.
 */
static byte *
lang_syndict_cf(struct cfitem *c UNUSED, byte *arg)
{
  byte *w[2];
  struct syndict *st, *t;
  int lang;

  if (wordsplit(arg, w, 2) != 2)
    return "Malformed SynDict declaration";
  if ((lang = lang_name_to_code(w[0])) < 0)
    return "Unknown language";

  WALK_LIST(t, syndict_list)
    if (!strcmp(t->name, w[1]))
      {
	/* Unsigned shift: a signed `1 << 31' would be undefined behaviour */
	t->lang_mask |= 1U << lang;
	return NULL;
      }

  /* Allocate only once we know a new record is really needed */
  st = cfg_malloc(sizeof(struct syndict));
  st->lang_mask = 1U << lang;
  /* NOTE(review): name points at a word produced by wordsplit(); confirm it outlives the handler */
  st->name = w[1];
  add_tail(&syndict_list, &st->n);
  return NULL;
}

/*
 * Items of the `Lang' configuration section. Each CT_FUNCTION item names
 * the handler in this file that receives its argument; the table is
 * closed by a CT_STOP entry and is registered by lang_init().
 */
static struct cfitem lang_config[] = {
  { "Lang",		CT_SECTION,	NULL },
  { "Language",		CT_FUNCTION,	lang_uage_cf },
  { "Stemmer",		CT_FUNCTION,	lang_stemmer_cf },
  { "SynDict",		CT_FUNCTION,	lang_syndict_cf },
  { NULL,		CT_STOP,	NULL }
};

/*
 * Module set-up: registers the `Lang' configuration section and
 * initializes the three lists filled by its handlers.
 */
static void CONSTRUCTOR
lang_init(void)
{
  cf_register(lang_config);

  /* The lists are independent of each other */
  init_list(&syndict_list);
  init_list(&stemmer_list);
  init_list(&lang_names);
}

/* Language names and codes */

/*
 * Translate a language name to its code. Names are compared
 * case-insensitively against all entries of lang_names, and the code of
 * the first matching entry is returned. Returns -1 if nothing matches.
 */
int
lang_name_to_code(byte *name)
{
  struct lang_name *entry;

  WALK_LIST(entry, lang_names)
    {
      if (strcasecmp(entry->name, name) == 0)
	return entry->id;
    }

  return -1;
}

/*
 * Translate a language code to the canonical name of the language.
 * Code 0 and codes which have not been assigned yet yield "??".
 */
byte *
lang_code_to_name(uns code)
{
  if (code && code < lang_count)
    return lang_canon_names[code]->name;

  return "??";
}

/*
 * Look up the code of the first language tag found in a string.
 *
 * Leading characters accepted by Cspace() are skipped, then the longest
 * run of characters accepted by Calpha() is taken as the tag. The tag
 * must be 1 to 8 characters long, otherwise -1 is returned. A valid tag
 * is passed to lang_name_to_code() and its result is returned.
 */
int
lang_list_to_code(byte *langs)
{
  byte tag[9];
  uns len = 0;

  while (Cspace(*langs))
    langs++;

  /* Measure the alphabetic prefix without moving the start pointer */
  while (Calpha(langs[len]))
    len++;

  if (len < 1 || len > 8)
    return -1;

  /* Make a NUL-terminated copy, since the tag may be followed by other text */
  memcpy(tag, langs, len);
  tag[len] = 0;
  return lang_name_to_code(tag);
}
