/*
 *	Sherlock Gatherer -- Character Set Guessing
 *
 *	(c) 1997--2001 Martin Mares <mj@ucw.cz>
 *	(c) 2004 Robert Spalek <robert@ucw.cz>
 */

#include "sherlock/sherlock.h"
#include "lib/lists.h"
#include "lib/fastbuf.h"
#include "lib/conf.h"
#include "lib/chartype.h"
#include "lib/unicode.h"
#include "lib/ff-utf8.h"
#include "gather/gather.h"
#include "charset/charconv.h"

#include <limits.h>
#include <string.h>

/* Configuration */

/* Filled in from the "Charset" configuration section (see charset_config); initializers are the defaults. */
static int trace_charsets;		/* 0=silent, >0 enables TRACE, >1 also enables XTRACE */
static int log_charset_errors;		/* Non-zero: log unrecognized or mismatched server/META charsets */
static int believe_server_charset;	/* 1=always, -1=never, 0=check */
static int believe_meta_charset;	/* 1=always, -1=never, 0=check */
static byte *fallback_charset = "iso-8859-1";	/* Last-resort charset used by convert_charset() */
static int improbable_penalty = 5;	/* Weight of an improbable character in grade_histogram() */
static int forbidden_penalty = 50;	/* Weight of a forbidden character in grade_histogram() */
static int utf8_penalty = 20;		/* Weight of a malformed byte/sequence in grade_utf8() */
static int believe_min_grade[2] = { 700, 0};	/* Minimum grade for believing a declared charset, indexed by pass (0, 1) */

/* Debug logging conditional on the configured trace level: TRACE needs level >= 1, XTRACE level >= 2. */
#define TRACE(x,y...) do { if (trace_charsets) log(L_DEBUG, x,##y); } while (0)
#define XTRACE(x,y...) do { if (trace_charsets > 1) log(L_DEBUG, x,##y); } while (0)

/* Charset IDs are used as bit numbers in u32 masks, so the conversion library must not define more than 32. */
#define MAX_CHARSETS 32
COMPILE_ASSERT(num_charset_check, CONV_NUM_CHARSETS <= MAX_CHARSETS);

static u32 permitted_charsets;		/* Bit i set = charset ID i was enabled by a "Charset" item */
/*
 * Character class maps, allocated lazily by allocate_charmaps() and indexed
 * by the codes returned from conv_ucs_to_x(). Each entry is a bitmask of
 * internal language numbers for which the character belongs to that class.
 */
static u32 *typical_chars, *forbidden_chars, *improbable_chars;

/* Internal languages are bits of a u32 as well; the last slot is reserved for the catch-all language. */
#define MAX_LANGS 32
#define DEFAULT_LANG 31

/* One "DefLang" rule: languages to assume for hosts matching a pattern. */
struct def_lang {
  node n;			/* Link in def_lang_list */
  byte *host_patt;		/* Host name pattern, matched case-insensitively by guess_langset() */
  byte *lang_list;		/* Language list used when the pattern matches */
};

/* One external language name mapped to an internal language; entries form a singly linked list per internal language. */
struct ilang_map {
  struct ilang_map *next;	/* Next name mapped to the same internal language, or NULL */
  byte *lang;			/* External language name, compared case-insensitively */
};

static list def_lang_list;				/* All struct def_lang rules in configuration order */
static byte *ilang_names[MAX_LANGS];			/* Name of each internal language; slot DEFAULT_LANG is "*" */
static struct ilang_map *ilang_langs[MAX_LANGS];	/* External names mapped to each internal language */
static byte ilang_charsets[MAX_LANGS][MAX_CHARSETS];	/* Charset IDs to try for each internal language ("AutoSets") */
static int ilang_charset_count[MAX_LANGS];		/* Number of valid entries in each ilang_charsets[] row */
static int num_ilangs;					/* Number of languages defined by "Language" items */

/*
 * Configuration hook for the "Charset" item: enable one charset for use
 * by the guesser.
 *
 * value is the charset name. Returns NULL on success or an error message
 * when the conversion library does not know the name or the charset has
 * been enabled already.
 */
static byte *
cc_add_charset(struct cfitem *item UNUSED, byte *value)
{
  int cs = find_charset_by_name(value);

  if (cs < 0)
    return "Charset unknown to the conversion library";

  /* Charset IDs go up to 31, so the shift must be unsigned (1 << 31 on an int is undefined). */
  if (permitted_charsets & (1U << cs))
    return "Charset already defined";
  permitted_charsets |= 1U << cs;
  return NULL;
}

/*
 * Translate a charset name to its ID, accepting only charsets which were
 * enabled in the configuration (see permitted_charsets).
 *
 * Returns the charset ID, or -1 if the name is unknown or not permitted.
 */
static int
lookup_charset(byte *name)
{
  int i = find_charset_by_name(name);

  /* Unsigned shift: ID 31 would otherwise shift into the sign bit of an int. */
  if (i < 0 || !(permitted_charsets & (1U << i)))
    return -1;
  return i;
}

/*
 * Configuration hook for the "DefLang" item. The value must consist of
 * exactly two words, a host pattern and a language list, which are appended
 * to def_lang_list as a new rule.
 *
 * Returns NULL on success or an error message on malformed input.
 */
static byte *
cc_add_deflang(struct cfitem *item UNUSED, byte *value)
{
  struct def_lang *rule = cfg_malloc(sizeof(struct def_lang));
  byte *fields[2];
  int count = wordsplit(value, fields, 2);

  if (count != 2)
    return "DefLang syntax error";

  rule->host_patt = fields[0];
  rule->lang_list = fields[1];
  add_tail(&def_lang_list, &rule->n);
  return NULL;
}

/*
 * Configuration hook for the "Language" item. The first word names a new
 * internal language, every following word is an external language name
 * mapped to it.
 *
 * Returns NULL on success or an error message (too few words, duplicate
 * name, or no free language slot below DEFAULT_LANG).
 */
static byte *
cc_add_ilang(struct cfitem *item UNUSED, byte *value)
{
  byte *words[34];
  int nwords = wordsplit(value, words, 34);
  int slot, k;

  if (nwords < 2)
    return "Language syntax error";

  for (slot = 0; slot < num_ilangs; slot++)
    {
      if (strcasecmp(ilang_names[slot], words[0]) == 0)
        return "Language already defined";
    }
  if (num_ilangs == DEFAULT_LANG)
    return "Too many languages defined";

  slot = num_ilangs;
  ilang_names[slot] = words[0];

  /* Each alias is pushed on the head of the per-language list. */
  k = 1;
  while (k < nwords)
    {
      struct ilang_map *alias = cfg_malloc(sizeof(struct ilang_map));
      alias->next = ilang_langs[slot];
      ilang_langs[slot] = alias;
      alias->lang = words[k];
      k++;
    }

  XTRACE("Language %d is %s", num_ilangs, words[0]);
  num_ilangs++;
  return NULL;
}

/*
 * Find an internal language by name (case-insensitive). All MAX_LANGS
 * slots are scanned, so the "*" entry stored at DEFAULT_LANG is found too.
 *
 * Returns the language number or -1 if there is no such language.
 */
static int
lookup_ilang(byte *name)
{
  for (int slot = 0; slot < MAX_LANGS; slot++)
    {
      byte *known = ilang_names[slot];

      if (!known)
        continue;
      if (strcasecmp(known, name) == 0)
        return slot;
    }
  return -1;
}

/*
 * Configuration hook for the "AutoSets" item. The first word is the name
 * of an internal language, the remaining words are the charsets to try
 * for documents in that language, in the given order.
 *
 * The whole list is validated before anything is stored, so a failing
 * item leaves the language's charset list untouched.
 *
 * Returns NULL on success or an error message.
 */
static byte *
cc_add_autosets(struct cfitem *item UNUSED, byte *value)
{
  byte *w[34];
  byte ids[MAX_CHARSETS];
  int c, i, il, cs;
  int n = 0;

  c = wordsplit(value, w, 34);
  if (c < 2)
    return "AutoSets syntax error";
  il = lookup_ilang(w[0]);
  if (il < 0)
    return "Unknown language";
  if (ilang_charset_count[il])
    return "AutoSets for this language already defined";

  /*
   * The word buffer is larger than a row of ilang_charsets[] and repeated
   * charset names are not rejected, so the count has to be checked
   * explicitly to avoid writing past the end of the row.
   */
  if (c - 1 > MAX_CHARSETS)
    return "Too many charsets in AutoSets";

  for (i=1; i<c; i++)
    {
      cs = lookup_charset(w[i]);
      if (cs < 0)
        return "Unknown charset";
      ids[n++] = cs;
    }

  for (i=0; i<n; i++)
    ilang_charsets[il][i] = ids[i];
  ilang_charset_count[il] = n;
  return NULL;
}

/*
 * Allocate the three character class maps on first use; later calls are
 * no-ops. The maps get one zeroed u32 per code reported by conv_x_count(),
 * except that code 0 starts out forbidden for every language.
 */
static void
allocate_charmaps(void)
{
  int count, bytes;

  if (typical_chars)
    return;

  count = conv_x_count();
  XTRACE("Allocating character class maps for %d characters", count);
  bytes = count * sizeof(u32);

  typical_chars = xmalloc_zero(bytes);
  forbidden_chars = xmalloc_zero(bytes);
  forbidden_chars[0] = ~0;
  improbable_chars = xmalloc_zero(bytes);
}

/*
 * Shared configuration hook for "CForbid", "CTypical" and "CImprobable".
 * The second letter of the item name ('F', 'T' or 'I') selects the class.
 *
 * The value is a language name followed by UCS-2 codes written either as
 * XXXX or as a range XXXX-YYYY (hexadecimal). Every listed character is
 * removed from all three classes for the given language and then put into
 * the selected one. The language "*" (DEFAULT_LANG) affects all languages.
 *
 * Returns NULL on success or an error message.
 */
static byte *
cc_add_chars(struct cfitem *item, byte *value)
{
  int kind = item->name[1];
  byte *words[64];
  int nwords, lang, k;
  u32 bits;

  nwords = wordsplit(value, words, 64);
  if (nwords < 2)
    return "Syntax error";

  lang = lookup_ilang(words[0]);
  if (lang < 0)
    return "Unknown language";
  if (lang == DEFAULT_LANG)
    bits = ~0;
  else
    bits = (1 << lang);

  allocate_charmaps();
  for (k = 1; k < nwords; k++)
    {
      uns first, last, ucs;

      /* Only "XXXX" (4 chars) and "XXXX-YYYY" (9 chars) are accepted. */
      switch (strlen(words[k]))
        {
        case 4:
          if (sscanf(words[k], "%x", &first) != 1)
            return "Invalid UCS-2 character code";
          last = first;
          break;
        case 9:
          if (sscanf(words[k], "%x-%x", &first, &last) != 2)
            return "Invalid UCS-2 code range";
          break;
        default:
          return "UCS-2 code/range syntax error";
        }

      if (first > last)
        return "Invalid code range";
      if (last >= 0x10000)
        return "UCS-2 Character code out of range";
      if (first == 0)
        {
          /* Code 0 stays forbidden; a forbidden range may start at it and is simply shortened. */
          if (kind != 'F')
            return "Null character is always illegal and cannot be redefined";
          first = 1;
        }

      for (ucs = first; ucs <= last; ucs++)
        {
          uns x = conv_ucs_to_x(ucs);

          if (x == 256)		/* Internal code of replacement char */
            continue;

          /* Drop the character from every class first, then add it to the requested one. */
          typical_chars[x] &= ~bits;
          improbable_chars[x] &= ~bits;
          forbidden_chars[x] &= ~bits;
          switch (kind)
            {
            case 'T':
              typical_chars[x] |= bits;
              break;
            case 'I':
              improbable_chars[x] |= bits;
              break;
            case 'F':
              forbidden_chars[x] |= bits;
              break;
            default:
              ASSERT(0);
            }
        }
    }
  return NULL;
}

/*
 * Description of the "Charset" configuration section. Plain CT_INT and
 * CT_STRING items are stored directly into the variables listed,
 * CT_FUNCTION items are passed to the cc_add_*() hooks above. The three
 * character class items share cc_add_chars(), which tells them apart by
 * the second letter of the item name.
 */
static struct cfitem charset_config[] = {
  { "Charset",		CT_SECTION,	NULL },
  { "Trace",		CT_INT,		&trace_charsets },
  { "LogErrors",	CT_INT,		&log_charset_errors },
  { "BelieveServer",	CT_INT,		&believe_server_charset },
  { "BelieveMETA",	CT_INT,		&believe_meta_charset },
  { "FallbackCharset",	CT_STRING,	&fallback_charset },
  { "Charset",		CT_FUNCTION,	cc_add_charset },
  { "DefLang",		CT_FUNCTION,	cc_add_deflang },
  { "Language",		CT_FUNCTION,	cc_add_ilang },
  { "AutoSets",		CT_FUNCTION,	cc_add_autosets },
  { "CForbid",		CT_FUNCTION,	cc_add_chars },
  { "CTypical",		CT_FUNCTION,	cc_add_chars },
  { "CImprobable",	CT_FUNCTION,	cc_add_chars },
  { "ImprobablePenalty",CT_INT,		&improbable_penalty },
  { "ForbiddenPenalty",	CT_INT,		&forbidden_penalty },
  { "UTF8Penalty",	CT_INT,		&utf8_penalty },
  { "BelieveMinGrade",	CT_INT,		believe_min_grade },
  { "BelieveMinGrade2",	CT_INT,		believe_min_grade + 1 },
  { NULL,		CT_STOP,	NULL }
};

/*
 * Constructor: registers the configuration section, initializes the
 * (still empty) DefLang rule list and names the catch-all language "*"
 * so that lookup_ilang() can find it.
 */
static void CONSTRUCTOR charset_init_config(void)
{
  cf_register(charset_config);
  init_list(&def_lang_list);
  ilang_names[DEFAULT_LANG] = "*";
}

/* Languages */

/*
 * Work out which internal languages the current document (gthis) may be
 * written in.
 *
 * The language list comes from gthis->language; if that is not set, the
 * first DefLang rule whose pattern matches the host of the URL is used.
 * Each word of the list is looked up among the external names of all
 * internal languages. Words which cannot be assigned select DEFAULT_LANG,
 * and so does a missing or empty list.
 *
 * langarray receives the selected language numbers in order of appearance
 * (DEFAULT_LANG last, if selected), terminated by -1; it must have room
 * for MAX_LANGS+1 entries. Returns the same set as a bitmask.
 */
static u32
guess_langset(int *langarray)
{
  byte *lang, *serv_lang, *w[32], buf[256];
  u32 mask;
  int c, i, il;
  int nlangs = 0;
  struct ilang_map *map;

  if (!(lang = serv_lang = gthis->language))
    {
      struct def_lang *d;
      WALK_LIST(d, def_lang_list)
        if (match_pattern_nocase(d->host_patt, gthis->url_s.host))
          {
            lang = d->lang_list;
            break;
          }
    }
  mask = 0;
  if (lang)
    {
      /* wordsplit() modifies its input, so work on a (possibly truncated) private copy. */
      strncpy(buf, lang, 255);
      buf[255] = 0;
      c = wordsplit(buf, w, 32);
      if (c < 0)		/* Too many words: use those which fit */
        c = 32;
      for (i=0; i<c; i++)
        {
          /*
           * NOTE(review): a word whose internal language is already in the
           * mask does not stop the search, so a repeated word may end up
           * selecting DEFAULT_LANG below -- confirm this is intended.
           */
          for(il=0; il<num_ilangs; il++)
            for(map=ilang_langs[il]; map; map=map->next)
              if (!strcasecmp(map->lang, w[i]))
                {
                  if (!(mask & (1U << il)))
                    {
                      mask |= 1U << il;
                      langarray[nlangs++] = il;
                      goto break2;
                    }
                }
          /* DEFAULT_LANG is bit 31, which must not be produced by shifting a signed 1. */
          mask |= 1U << DEFAULT_LANG;
        break2:
          ;
        }
    }
  if (!mask)
    mask = 1U << DEFAULT_LANG;

  if (mask & (1U << DEFAULT_LANG))
    langarray[nlangs++] = DEFAULT_LANG;
  langarray[nlangs] = -1;

  TRACE("Languages: orig=<%s> guess=<%s> mask=%08x",
        serv_lang ? : (byte *) "?",
        lang ? : (byte *) "?",
        mask);

  return mask;
}

/* Character set vaticination */

/*
 * Count how many times each byte value occurs in the contents of the
 * current document. hist must have room for 256 counters; it is cleared
 * first.
 */
static void
calc_histogram(uns *hist)
{
  struct fastbuf *in;
  int ch;

  memset(hist, 0, 256 * sizeof(*hist));

  in = fbmem_clone_read(gthis->contents);
  for (;;)
    {
      ch = bgetc(in);
      if (ch < 0)
        break;
      hist[ch]++;
    }
  bclose(in);
}

/*
 * Get the table used to translate bytes of charset id to internal
 * character codes: a temporary conversion context from id to UTF-8 is set
 * up and its in_to_x table is returned.
 * NOTE(review): presumably the table is owned by the conversion library
 * and outlives the local context -- confirm.
 */
static unsigned short int *get_charset_table(uns id)
{
  struct conv_context conv;

  conv_init(&conv);
  conv_set_charset(&conv, id, CONV_CHARSET_UTF8);

  unsigned short int *to_internal = conv.in_to_x;
  return to_internal;
}

/* Scale of the grades: grade_histogram() and grade_utf8() return this value when they find nothing to judge. */
#define	MAX_GRADE	1000

/*
 * Grade how plausible an 8-bit charset is for the current document.
 *
 * chs_name is used for tracing only, table maps bytes of the charset to
 * internal character codes, langset is the mask of candidate languages
 * and histogram holds the number of occurrences of each byte value.
 *
 * Every byte value present in the document is classified as forbidden,
 * typical or improbable for the given languages (in this order of
 * precedence). The grade is
 *
 *   (typical - improbable*improbable_penalty - forbidden*forbidden_penalty)
 *     * MAX_GRADE / non_ascii
 *
 * where non_ascii counts characters with internal code >= 0x80 and
 * non-blank control characters. A document without any such characters
 * gets MAX_GRADE.
 *
 * The formula is evaluated in 64 bits and the result is clamped to the
 * range of int: with the default penalties, a few tens of thousands of
 * forbidden characters would already overflow a 32-bit int.
 */
static int
grade_histogram(byte *chs_name, unsigned short int *table, u32 langset, uns *histogram)
{
  int forbidden = 0, typical = 0, improbable = 0, non_ascii = 0;

  for (uns i=0; i<256; i++)
    {
      if (!histogram[i])
        continue;

      uns x = table[i];
      if (x >= 0x80 || (x < 0x20 && !Cblank(x)))
        non_ascii += histogram[i];
      if (forbidden_chars[x] & langset)
        forbidden += histogram[i];
      else if (typical_chars[x] & langset)
        typical += histogram[i];
      else if (improbable_chars[x] & langset)
        improbable += histogram[i];
    }

  int grade;
  if (non_ascii)
    {
      long long score = (long long) typical
        - (long long) improbable * improbable_penalty
        - (long long) forbidden * forbidden_penalty;

      score = score * MAX_GRADE / non_ascii;
      if (score > INT_MAX)
        grade = INT_MAX;
      else if (score < INT_MIN)
        grade = INT_MIN;
      else
        grade = (int) score;
    }
  else
    grade = MAX_GRADE;
  XTRACE("%s: forbidden=%d, typical=%d, improbable=%d, non-ascii=%d ==> grade=%d", chs_name, forbidden, typical, improbable, non_ascii, grade);
  return grade;
}

/*
 * Grade how plausible it is that the current document is encoded in UTF-8.
 *
 * The contents are scanned byte by byte. ASCII bytes are ignored. A stray
 * continuation byte (0x80-0xbf), a byte 0xfe/0xff or a multi-byte sequence
 * which is cut short counts as incorrect. A complete sequence with code
 * below 0x10000 counts as correct if conv_ucs_to_x() knows the character
 * and as neutral if it maps to the replacement character (256); longer
 * codes are not counted at all.
 *
 * The grade is (correct - incorrect*utf8_penalty) * MAX_GRADE divided by
 * the number of counted items, or MAX_GRADE if nothing was counted. It is
 * computed in 64 bits and clamped to the range of int, because a large
 * 8-bit document could otherwise overflow a 32-bit int.
 */
static int
grade_utf8(void)
{
  struct fastbuf *b = fbmem_clone_read(gthis->contents);
  int c;
  uns code;
  int correct = 0, incorrect = 0, neutral = 0;

  while ((c = bgetc(b)) >= 0)
    {
      if (c < 0x80)
        ;
      else if (c < 0xc0)
        incorrect++;
      else if (c >= 0xfe)
        incorrect++;
      else
        {
          /* The number of leading one bits after the first two gives the number of extra continuation bytes. */
          int cnt = 1;
          while (c & (0x40 >> cnt))
            cnt++;
          code = c & (0x3f >> cnt);
          while (cnt--)
            {
              c = bgetc(b);
              if (c < 0x80 || c >= 0xc0)
                {
                  incorrect++;
                  /* A new lead byte is pushed back so that it starts the next sequence. */
                  if (c >= 0xc0)
                    bungetc(b);
                  goto on_err;
                }
              code = (code << 6) | (c & 0x3f);
            }
          if (code < 0x10000)
            {
              uns x = conv_ucs_to_x(code);
              if (x != 256)
                correct++;
              else
                neutral++;
            }
on_err:
          ;
        }
    }
  bclose(b);

  int grade;
  int total = correct + incorrect + neutral;
  if (total)
    {
      long long score = (long long) correct - (long long) incorrect * utf8_penalty;

      score = score * MAX_GRADE / total;
      if (score > INT_MAX)
        grade = INT_MAX;
      else if (score < INT_MIN)
        grade = INT_MIN;
      else
        grade = (int) score;
    }
  else
    grade = MAX_GRADE;
  XTRACE("utf-8: correct=%d, incorrect=%d, neutral=%d ==> grade=%d", correct, incorrect, neutral, grade);
  return grade;
}

/*
 * Grade a single charset for the current document: UTF-8 is judged by
 * grade_utf8(), everything else by grade_histogram() with the charset's
 * translation table.
 *
 * threshold is used for the trace message only; comparing the returned
 * grade with it is up to the caller.
 */
static int
check_charset(uns id, u32 langset, uns *histogram, int threshold)
{
  byte *name = charset_name(id);
  int grade;

  if (id != CONV_CHARSET_UTF8)
    grade = grade_histogram(name, get_charset_table(id), langset, histogram);
  else
    grade = grade_utf8();

  XTRACE("Charset %s %s (grade %d, threshold %d)", name, (grade >= threshold) ? "accepted" : "is improbable", grade, threshold);
  return grade;
}

/*
 * Collect the charsets to be tried for a list of languages.
 *
 * languages is a list of internal language numbers terminated by -1.
 * list receives the charset IDs configured for these languages (AutoSets)
 * in order of appearance with duplicates removed, terminated by -1. Since
 * every charset is stored at most once, MAX_CHARSETS+1 entries suffice.
 */
static void
build_charset_list(int *list, int *languages)
{
  int i, j, k;
  u32 mask = 0;

  while ((i = *languages++) >= 0)
    {
      for (j=0; j<ilang_charset_count[i]; j++)
        {
          k = ilang_charsets[i][j];
          /* Unsigned shift: charset ID 31 would overflow a signed 1 << k. */
          if (!(mask & (1U << k)))
            {
              mask |= 1U << k;
              *list++ = k;
            }
        }
    }
  *list = -1;
}

/*
 * Auto-detect the charset of the current document: grade every charset
 * configured for the given languages and pick the best one.
 *
 * A candidate wins only if its grade is strictly greater than both
 * threshold and -1; among such candidates the first one with the highest
 * grade is chosen. Returns the charset ID or -1 if no candidate qualified.
 */
static int
guess_charset(u32 langset, int *languages, uns *histogram, int threshold)
{
  int candidates[MAX_CHARSETS+1];
  int best_id = -1;
  int best_grade = MAX(threshold, -1);
  int *p;

  build_charset_list(candidates, languages);
  for (p = candidates; *p >= 0; p++)
    {
      int cand = *p;
      int grade;

      if (cand != CONV_CHARSET_UTF8)
        {
          unsigned short int *table = get_charset_table(cand);
          grade = grade_histogram(charset_name(cand), table, langset, histogram);
        }
      else
        grade = grade_utf8();

      if (grade <= best_grade)
        continue;
      best_grade = grade;
      best_id = cand;
    }
  return best_id;
}

/*
 * Pick the charset of last resort for the guesser: the first charset
 * configured (AutoSets) for the first language in the list.
 *
 * Returns the charset ID, or -1 (after logging an error) if the language
 * has no charsets configured. The count can never be negative, so the
 * check has to cover zero; otherwise the unset first entry of the row
 * (charset ID 0) would be returned as if it had been configured.
 */
static int
guess_fallback(int *languages)
{
  int il = languages[0];
  if (ilang_charset_count[il] <= 0)
    {
      log(L_ERROR, "No charsets defined for language %s", ilang_names[il]);
      return -1;
    }
  return ilang_charsets[il][0];
}

/* Conversion */

/*
 * Replace the contents of the current document by their UTF-8 form.
 *
 * id is the charset the contents are currently in. For UTF-8 input the
 * contents are kept as they are unless grade_utf8() reports a grade below
 * MAX_GRADE; in that case the text is re-read and re-written character by
 * character. For any other charset the contents are recoded by the
 * conversion library. Whenever a new buffer is produced, the old
 * gthis->contents is closed and replaced by it.
 */
static void
convert_stream_charset(uns id)
{
  struct fastbuf *src, *dest;
  struct conv_context ct;
  uns flags, len;
  byte destbuf[4096];

  if (id == CONV_CHARSET_UTF8)
  {
    if (grade_utf8() < MAX_GRADE)
    {
      /* NOTE(review): presumably bget_utf8() substitutes something valid for malformed sequences -- confirm. */
      XTRACE("Repairing broken UTF-8");
      src = fbmem_clone_read(gthis->contents);
      dest = fbmem_create(16384);
      int c;
      while ((c = bget_utf8(src)) != EOF)
	bput_utf8(dest, c);
      bclose(src);
      bclose(gthis->contents);
      gthis->contents = dest;
    }
    return;
  }

  XTRACE("Recoding to canonical charset");
  src = fbmem_clone_read(gthis->contents);
  dest = fbmem_create(16384);
  conv_init(&ct);
  conv_set_charset(&ct, id, CONV_CHARSET_UTF8);
  /*
   * Neither a source nor a destination buffer is supplied up front; the
   * loop reacts to what conv_run() asks for until the source runs dry.
   * NOTE(review): ct.dest_start is never assigned in this function, it is
   * presumably maintained by conv_init()/conv_run() -- confirm.
   */
  do
    {
      flags = conv_run(&ct);
      if (flags & CONV_DEST_END)
	{
	  /* Flush whatever has been converted and hand destbuf over (again) as the output window. */
	  if (ct.dest != ct.dest_start)
	    bwrite(dest, destbuf, ct.dest - destbuf);
	  ct.dest = destbuf;
	  ct.dest_end = destbuf + sizeof(destbuf);
	}
      else if (flags & CONV_SOURCE_END)
	{
	  /* Feed the next chunk directly from the buffer of the source stream; a zero length ends the loop. */
	  byte *s;
	  len = bdirect_read_prepare(src, &s);
	  if (!len)
	    break;
	  ct.source = s;
	  ct.source_end = s + len;
	  bdirect_read_commit(src, s+len);
	}
      else
	ASSERT(0);
    }
  while (1);
  /* Write out the rest of the output window. */
  if (ct.dest != ct.dest_start)
    bwrite(dest, destbuf, ct.dest - destbuf);

  bclose(src);
  bclose(gthis->contents);
  gthis->contents = dest;
}

/* Main decision routine */

/*
 * Decide which charset the current document (gthis) is written in, store
 * its name to gthis->charset and convert the contents by
 * convert_stream_charset().
 *
 * meta_charset is the charset name found in META tags or NULL.
 *
 * Candidates are examined in this order:
 *   1. the charset sent by the server (gthis->charset),
 *   2. the charset from META tags,
 *   3. auto-detection among the charsets configured for the guessed languages,
 *   4. the first charset configured for the first guessed language,
 *   5. the server or META charset even though it has been disproved,
 *   6. fallback_charset, or ASCII if even that is not recognized.
 * BelieveServer / BelieveMETA control whether a declared charset is taken
 * for granted (>0), ignored (<0) or verified by grading (0). A verified
 * charset is accepted if its grade reaches believe_min_grade[pass]. If
 * the first pass finds nothing and at least one declared charset was
 * recognized, the whole decision is repeated once with the second
 * threshold.
 */
void
convert_charset(byte *meta_charset)
{
  uns histogram[256];
  int languages[MAX_LANGS+1];
  u32 langset;
  byte *cs;
  int s_id = -1, m_id = -1, g_id = -1, id = -1;	/* Server, META, guessed and final charset ID */
  byte *s_rem = "";				/* Remarks on the server/META charset for the trace message */
  byte *m_rem = "";
  byte *source = "?";				/* Where the final charset came from (trace only) */

  allocate_charmaps();
  langset = guess_langset(languages);
  ASSERT(languages[0] >= 0);
  calc_histogram(histogram);

  int pass = 0;
  int doc_grade;
start_pass:
  /* Best grade among the disproved declared charsets; auto-detection must beat it. */
  doc_grade = -1000000;

  /* Charset supplied by the server */
  if (cs = gthis->charset)
    {
      source = "server";
      if ((s_id = lookup_charset(cs)) < 0)
	{
	  if (log_charset_errors)
	    log(L_ERROR_R, "Unrecognized server charset %s", cs);
	  s_rem = " [unknown]";
	}
      else if (believe_server_charset > 0)
	{
	  s_rem = " [forced-ok]";
	  id = s_id;
	  goto okay;
	}
      else if (believe_server_charset < 0)
	s_rem = " [forced-no]";
      else 
      {
	int grade = check_charset(s_id, langset, histogram, believe_min_grade[pass]);
	if (grade >= believe_min_grade[pass])
	{
	  s_rem = " [confirmed]";
	  id = s_id;
	  goto okay;
	}
	else
	{
	  if (log_charset_errors)
	    log(L_ERROR_R, "Mismatched server charset %s", cs);
	  s_rem = " [disproved]";
	  doc_grade = grade;
	}
      }
    }
  /* Reached with id still unset, so the code at okay picks the fallback charset. */
  if (believe_server_charset > 0)	/* Fallback */
    goto okay;

  /* Charset supplied by META tags */
  if (cs = meta_charset)
    {
      source = "META";
      if ((m_id = lookup_charset(cs)) < 0)
	{
	  if (log_charset_errors)
	    log(L_ERROR_R, "Unrecognized META charset %s", cs);
	  m_rem = " [unknown]";
	}
      else if (believe_meta_charset > 0)
	{
	  m_rem = " [forced-ok]";
	  id = m_id;
	  goto okay;
	}
      else if (believe_meta_charset < 0)
	m_rem = " [forced-no]";
      else
      {
	int grade = check_charset(m_id, langset, histogram, believe_min_grade[pass]);
	if (grade >= believe_min_grade[pass])
	{
	  m_rem = " [confirmed]";
	  id = m_id;
	  goto okay;
	}
	else
	{
	  if (log_charset_errors)
	    log(L_ERROR_R, "Mismatched META charset %s", cs);
	  m_rem = " [disproved]";
	  if (grade >= doc_grade)
	    doc_grade = grade;
	}
      }
    }
  /* As above: id is unset here, so the fallback charset is used. */
  if (believe_meta_charset > 0)		/* Fallback */
    goto okay;

  /* Try auto-detection */
  if ((g_id = guess_charset(langset, languages, histogram, doc_grade)) >= 0)
    {
      source = "guessed";
      id = g_id;
      goto okay;
    }
  else if (!pass && (s_id >= 0 || m_id >= 0))
    {
      /* A declared charset was recognized, but not believed: retry everything with believe_min_grade[1]. */
      TRACE("Entering 2nd pass with smaller threshold");
      pass = 1;
      goto start_pass;
    }
  else if ((g_id = guess_fallback(languages)) >= 0)
  {
    /* The first charset of the first language is taken only if it beats the disproved declared charsets. */
    if (check_charset(g_id, langset, histogram, doc_grade) > doc_grade)
    {
      source = "guessed-fallback";
      id = g_id;
      goto okay;
    }
  }

  /* Everything failed, try server/meta charset or fall back */
  if (s_id >= 0)
    {
      source = "fallback-server";
      id = s_id;
    }
  else if (m_id >= 0)
    {
      source = "fallback-meta";
      id = m_id;
    }

 okay:
  if (id < 0)
    {
      source = "fallback";
      id = lookup_charset(fallback_charset);
      if (id < 0)
	{
	  log(L_ERROR, "Unrecognized fallback charset: %s", fallback_charset);
	  id = CONV_CHARSET_ASCII;
	}
    }
  cs = charset_name(id);

  /* The trace still shows the charset declared by the server; it is overwritten by the decision afterwards. */
  TRACE("Charsets: server=<%s>%s meta=<%s>%s -> %s [%s]",
	gthis->charset ? : (byte *) "?", s_rem,
	meta_charset ? : (byte *) "?", m_rem,
	cs, source);
  gthis->charset = cs;

  convert_stream_charset(id);
}
