/*
 *	Language detector
 *
 * 	(c) 2003, Robert Spalek <robert@ucw.cz>
 *
 * 	Inspired by open-source program `text_cat'
 * 		http://odur.let.rug.nl/~vannoord/TextCat/
 * 	based on the text categorization algorithm presented in
 * 		Cavnar, W. B.  and J. M. Trenkle,
 * 		``N-Gram-Based Text Categorization''
 * 	In Proceedings of Third Annual Symposium on Document Analysis and
 * 	Information Retrieval, Las Vegas, NV, UNLV Publications/Reprographics,
 * 	pp. 161-175, 11-13 April 1994.
 */

#include "sherlock/sherlock.h"
#include "lib/conf.h"
#include "lib/lists.h"
#include "lib/mempool.h"
#include "lib/math.h"
#include "lib/unicode.h"
#include "lang/lang.h"
#include "lang/detect.h"
#include "lang/kmp.h"

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/***** Configuration *****/

/* Number of language variants tied so far, i.e. the used entries of lang_detect_lang_flags[].  */
uns lang_detect_nr_langs;
/* Per-variant settings (language id, accent flag, sequence count, threshold) filled by add_language().  */
struct lang_detect_lang_flag lang_detect_lang_flags[MAX_DETECTED];

/*
 * Registered sequences: how many are stored, the capacity requested by the
 * Reset item, and the sum of their lengths as reported by utf8_strlen()
 * (used to size the search automaton).
 */
uns lang_detect_nr_sequences, lang_detect_max_sequences, lang_detect_total_seq_len;
/* Array of lang_detect_max_sequences pointers allocated by reset().  */
struct lang_detect_sequence **lang_detect_sequences;

/* LangDetect.MinDocumentLength: minimum total number of sequence occurrences needed to attempt detection.  */
uns lang_detect_min_doc_length;
/* LangDetect.IncludeTables: configuration file read by lang_detect_build_automaton().  */
byte *lang_detect_tables_file;

/*
 * Configuration callback for the Reset item: forgets all tied languages and
 * sequences and allocates room for a new set of sequences.
 *
 * arg is the upper bound of the number of sequences which will follow.
 * Returns NULL on success or a static error message; on error the current
 * tables are left untouched.
 */
static byte *
reset(struct cfitem *c UNUSED, byte *arg)
{
	if (!arg || !*arg)
		return "Expecting an upper-bound of the number of sequences";
	/*
	 * Parse the bound the same way the other callbacks parse their numbers.
	 * A negative value must be refused: stored into an unsigned variable it
	 * would wrap around and request an enormous array.
	 */
	int bound;
	if (sscanf(arg, "%d", &bound) != 1)
		return "Cannot parse the upper-bound of the number of sequences";
	if (bound < 0)
		return "The upper-bound of the number of sequences cannot be negative";

	/* The argument is valid, so it is safe to throw the old tables away.  */
	lang_detect_nr_langs = 0;
	bzero(lang_detect_lang_flags, sizeof(lang_detect_lang_flags));
	lang_detect_nr_sequences = 0;
	lang_detect_max_sequences = bound;
	lang_detect_total_seq_len = 0;
	lang_detect_sequences = cfg_malloc(lang_detect_max_sequences * sizeof(struct lang_detect_sequence *));
	return NULL;
}

/*
 * Configuration callback shared by AddNormalLanguage and AddCutAccentLanguage:
 * ties one more language variant.
 *
 * arg consists of 3 fields: the language name, the number of sequences of
 * the language and the threshold (stored as the minimal ratio required by
 * lang_detect_compute() for this variant).  Languages can be tied only
 * before the first sequence is defined.
 *
 * Returns NULL on success or a static error message.
 */
static byte *
add_language(struct cfitem *c, byte *arg)
{
	if (lang_detect_nr_sequences)
		return "Too late to tie a language";
	if (lang_detect_nr_langs >= MAX_DETECTED)
		return "Too many languages";
	byte *w[3];
	if (wordsplit(arg, w, ARRAY_SIZE(w)) != 3)
		return "3 fields expected";
	int lang = lang_name_to_code(w[0]);
	if (lang < 0)
		return "Unknown language";
	ASSERT(lang >= 0 && lang < MAX_DETECTED);
	/* The 4th letter of the item name tells Add[N]ormalLanguage from Add[C]utAccentLanguage.  */
	byte is_accented = (c->name[3] == 'N');
	for (uns i=0; i<lang_detect_nr_langs; i++)
		if (lang_detect_lang_flags[i].id == lang
		&& lang_detect_lang_flags[i].is_accented == is_accented)
			return "The language variant has already been tied";
	/*
	 * Parse into signed integers so that the conversion matches "%d" and
	 * negative values can be refused instead of wrapping around to huge
	 * unsigned numbers.
	 */
	int s, t;
	if (sscanf(w[1], "%d", &s) != 1)
		return "Cannot parse the number of sequences";
	if (s < 0)
		return "The number of sequences cannot be negative";
	if (sscanf(w[2], "%d", &t) != 1)
		return "Cannot parse the threshold";
	if (t < 0)
		return "The threshold cannot be negative";
	struct lang_detect_lang_flag *r = lang_detect_lang_flags + lang_detect_nr_langs++;
	r->id = lang;
	r->is_accented = is_accented;
	r->nr_seq = s;
	r->threshold = t;
	return NULL;
}

/*
 * Configuration callback for SequenceFrequencies: defines one sequence and
 * its expected order in every tied language.
 *
 * arg consists of the sequence text followed by one order per tied language
 * (in the order the languages were tied).  An order of 0 is what
 * probability_of_language() treats as "not expected in this language".
 *
 * Returns NULL on success or a static error message.  The global tables are
 * modified only when the whole line has been accepted.
 */
static byte *
add_sequence(struct cfitem *c UNUSED, byte *arg)
{
	if (!lang_detect_nr_langs)
		return "No language has been tied";
	byte *w[lang_detect_nr_langs+1];
	int n = wordsplit(arg, w, ARRAY_SIZE(w));
	if (n != (int) lang_detect_nr_langs+1)
		return "TIED_LANG_COUNT+1 fields expected";
	n = utf8_strlen(w[0]);
	if (n<1 || n>MAX_SEQ_LENGTH)
		return "The sequence must be from 1 to MAX_SEQ_LENGTH characters long";
	for (uns i=0; i<lang_detect_nr_sequences; i++)
		if (!strcasecmp(lang_detect_sequences[i]->text, w[0]))
			return "The sequence has already been defined";
	if (lang_detect_nr_sequences >= lang_detect_max_sequences)
		return "Too many sequences in comparison with the upper-bound";

	/*
	 * Size the record by the length of the text in BYTES plus the
	 * terminating zero.  n counts UTF-8 characters, which is smaller than
	 * the byte length for any non-ASCII sequence, so it must not be used
	 * for the allocation.  NOTE(review): the layout of struct
	 * lang_detect_sequence is not visible here; this size is sufficient
	 * however the trailing text[] member is declared.
	 */
	uns text_bytes = strlen(w[0]) + 1;
	uns seq_size = sizeof(struct lang_detect_sequence) + text_bytes;
	struct lang_detect_sequence *seq = cfg_malloc(seq_size);
	bzero(seq, seq_size);
	memcpy(seq->text, w[0], text_bytes);
	seq->len = n;
	for (uns i=0; i<lang_detect_nr_langs; i++)
	{
		int f;
		if (sscanf(w[i+1], "%d", &f) != 1)
			return "Cannot parse the order";
		if (f < 0)
			return "Order cannot be a negative number";
		if (f > (int) lang_detect_lang_flags[i].nr_seq)
			return "Order cannot be higher than the number of sequences of the language";
		seq->order[i] = f;
	}

	/* Everything has been validated, register the sequence.  */
	lang_detect_total_seq_len += n;
	lang_detect_sequences[lang_detect_nr_sequences++] = seq;
	return NULL;
}

/*
 * Items of the LangDetect configuration section.  Both Add*Language items
 * are served by add_language(), which tells them apart by the item name.
 */
static struct cfitem lang_detect_config[] = {
  { "LangDetect",		CT_SECTION,	NULL },
  { "Reset",			CT_FUNCTION,	reset },
  { "AddNormalLanguage",	CT_FUNCTION,	add_language },
  { "AddCutAccentLanguage",	CT_FUNCTION,	add_language },
  { "SequenceFrequencies",	CT_FUNCTION,	add_sequence },
  { "MinDocumentLength",	CT_INT,		&lang_detect_min_doc_length },
  { "IncludeTables",		CT_STRING,	&lang_detect_tables_file },
  { NULL,			CT_STOP,	NULL }
};

/*
 * Registers the LangDetect configuration section and puts the tables into
 * the empty state with room for zero sequences.
 * NOTE(review): CONSTRUCTOR is presumably a constructor attribute, so this
 * runs automatically at program start-up -- confirm in the library headers.
 */
static void CONSTRUCTOR
lang_detect_init(void)
{
	cf_register(lang_detect_config);
	reset(NULL, "0");
}

static struct kmp *aut;		/* the automaton searching the sequences */
/* Working buffers and the outcome of the detection; lang_detect_compute() returns a pointer to it.  */
static struct lang_detect_results results;

/*
 * Reads the recognition tables named by LangDetect.IncludeTables and builds
 * the search automaton together with the result buffers.  All of them live
 * in one private memory pool, which is flushed and reused when the function
 * is called again.  Dies if no tables file has been configured.
 */
void
lang_detect_build_automaton(void)
{
	static struct mempool *mp_aut;		/* reusable memory pool for temporary data-structures */

	if (!lang_detect_tables_file)
		die("Set LangDetect.IncludeTables to the file with language recognition tables");
	cf_read(lang_detect_tables_file);

	/* First call creates the pool, later calls recycle it.  */
	if (mp_aut)
		mp_flush(mp_aut);
	else
		mp_aut = mp_new(1 << 12);

	/* Feed every registered sequence to a fresh automaton; the sequence index serves as its id.  */
	aut = kmp_new(mp_aut, lang_detect_total_seq_len+1, MF_TOLOWER | MF_ONLYALPHA);
	uns seq_idx = 0;
	while (seq_idx < lang_detect_nr_sequences)
	{
		kmp_enter_string(aut, lang_detect_sequences[seq_idx]->text, seq_idx);
		seq_idx++;
	}
	kmp_build(aut);

	/* Result buffers; an empty table gets NULL instead of a zero-sized block.  */
	init_list(&results.nonzeroes);
	if (lang_detect_nr_sequences)
		results.occurences = mp_alloc_zero(mp_aut, lang_detect_nr_sequences * sizeof(struct kmp_result));
	else
		results.occurences = NULL;
	if (lang_detect_nr_langs)
		results.variances = mp_alloc(mp_aut, lang_detect_nr_langs * sizeof(uns));
	else
		results.variances = NULL;
	if (lang_detect_nr_sequences)
		results.sf = mp_alloc(mp_aut, lang_detect_nr_sequences * sizeof(struct sequence_freq));
	else
		results.sf = NULL;
}

void
lang_detect_start(void)
{
	struct kmp_result *n;
	WALK_LIST(n, results.nonzeroes)
		n->occur = 0;
	init_list(&results.nonzeroes);
}

/*
 * Runs the automaton over str, passing the list of non-zero results and the
 * per-sequence occurrence array to kmp_search().  May be called repeatedly
 * between lang_detect_start() and lang_detect_compute().
 */
void
lang_detect_add_string(byte *str)
{
	kmp_search(aut, str, &results.nonzeroes, results.occurences);
}

/*
 * Instantiate the array sorter as seq_freq_sort(count, array) over
 * struct sequence_freq.  The key is the negated occurrence count.
 * NOTE(review): presumably lib/arraysort.h sorts by ascending key, which
 * would put the most frequent sequences first -- confirm in the header.
 */
#define	ASORT_PREFIX(x)	seq_freq_##x
#define	ASORT_KEY_TYPE	uns
#define	ASORT_ELT(i)	-array[i].occur
#define	ASORT_SWAP(i,j)	do { struct sequence_freq tmp=array[j]; array[j]=array[i]; array[i]=tmp; } while(0)
#define	ASORT_EXTRA_ARGS	, struct sequence_freq *array
#include "lib/arraysort.h"

/*
 * Debugging trace to stderr.  It is always compiled (so the arguments stay
 * type-checked) but disabled by the constant-false condition; change the 0
 * to 1 to enable it.  The do { } while (0) wrapper makes the macro a single
 * statement, so it cannot capture an `else' following it.
 */
#define	TRACE(mask,par...)	do { if (0) fprintf(stderr, mask "\n",##par); } while (0)

/*
 * Measures how badly the observed ranking of sequences fits a language.
 * Despite the name, the result is a distance: the caller picks the language
 * with the LOWEST value.
 *
 * id is the index of the language variant in lang_detect_lang_flags[],
 * sorted_freqs the found sequences in sorted order and count their number.
 * At most nr_seq leading sequences of the language are examined.  Each one
 * adds the distance between its observed and expected position, or a
 * penalty of nr_seq when its expected order in the language is 0.
 */
static uns
probability_of_language(uns id, struct sequence_freq *sorted_freqs, uns count)
{
	uns limit = lang_detect_lang_flags[id].nr_seq;
	uns result = 0;

	TRACE("Computing frequence of language %d, I found %d/%d sequences:", id, count, lang_detect_nr_sequences);
	if (count > limit)
	{
		count = limit;
		TRACE("The count cut down to %d", limit);
	}
	for (uns rank=0; rank<count; rank++)
	{
		uns expected_pos = lang_detect_sequences[ sorted_freqs[rank].id ]->order[id];
		if (!expected_pos)
		{
			/* The sequence is not listed for this language.  */
			result += limit;
			TRACE("\tSeq #%d with %d occ. should not be present, += penalty %d",
				rank, sorted_freqs[rank].occur, limit);
			continue;
		}
		uns distance = abs(expected_pos - rank);
		result += distance;
		TRACE("\tSeq #%d with %d occ. should be at #%d, += difference %d",
			rank, sorted_freqs[rank].occur, expected_pos, distance);
	}
	TRACE("Total variance is %d", result);
	return result;
}

/*
 * Evaluates the occurrences collected since lang_detect_start() and returns
 * a pointer to the (static) results:
 *
 *   total_occur, nonzero_seq	sum of occurrences / number of distinct sequences found
 *   sf				the found sequences in sorted order
 *   variances[i]		distance of the document from the i-th tied language
 *   lang1			index of the language with the lowest variance
 *   lang2			best language with a different id than lang1 (lang1 if none)
 *   ratio			1000 * variance(lang2) / variance(lang1)
 *   min_ratio			the threshold of lang1
 *
 * When the document has fewer than MinDocumentLength occurrences or no
 * language is tied, lang1 = lang2 = -1, all variances and the ratio are 0
 * and min_ratio is 1.
 */
struct lang_detect_results *
lang_detect_compute(void)
{
	struct kmp_result *n;
	results.total_occur = 0;
	results.nonzero_seq = 0;
	WALK_LIST(n, results.nonzeroes)
	{
		ASSERT(n->occur);
		results.total_occur += n->occur;
		/* The position in the occurrence array is the index of the sequence.  */
		results.sf[results.nonzero_seq].id = n - results.occurences;
		results.sf[results.nonzero_seq++].occur = n->occur;
	}
	ASSERT(results.nonzero_seq <= lang_detect_nr_sequences);
	seq_freq_sort(results.nonzero_seq, results.sf);

	if (results.total_occur >= lang_detect_min_doc_length
	&& lang_detect_nr_langs >= 1)
	{
		results.lang1 = 0;
		for (uns i=0; i<lang_detect_nr_langs; i++)
		{
			results.variances[i] = probability_of_language(i, results.sf, results.nonzero_seq);
			if (i > 0 && results.variances[i] < results.variances[ results.lang1 ])
				results.lang1 = i;
		}
		/* lang_detect_nr_langs serves as the "not found yet" mark for lang2.  */
		results.lang2 = lang_detect_nr_langs;
		for (uns i=0; i<lang_detect_nr_langs; i++)
		{
			/* Do not compare it with the other accented variant of the same language.  */
			if (lang_detect_lang_flags[i].id != lang_detect_lang_flags[ results.lang1 ].id
			&& (results.lang2 == (int) lang_detect_nr_langs
			|| results.variances[i] < results.variances[ results.lang2 ]))
				results.lang2 = i;
		}
		if (results.lang2 == (int) lang_detect_nr_langs)
			results.lang2 = results.lang1;
		/*
		 * The best variance is 0 when nothing has been compared, e.g. no
		 * sequence was found while MinDocumentLength is 0, or the language
		 * was tied with 0 sequences.  Dividing by it would crash, so such a
		 * result is reported with ratio 0, i.e. as carrying no evidence.
		 */
		uns best = results.variances[ results.lang1 ];
		if (best)
			results.ratio = 1000 * results.variances[ results.lang2 ]  / best;
		else
			results.ratio = 0;
		results.min_ratio = lang_detect_lang_flags[ results.lang1 ].threshold;
	}
	else
	{
		results.lang1 = results.lang2 = -1;
		for (uns i=0; i<lang_detect_nr_langs; i++)
			results.variances[i] = 0;
		results.ratio = 0;
		results.min_ratio = 1;
	}
	return &results;
}

/*
 * Computes the results and turns them into a single verdict: the id of the
 * best language when one was found and its ratio reaches the required
 * minimum, LANG_UNKNOWN otherwise.
 */
uns
lang_detect_choose_best(void)
{
	struct lang_detect_results *verdict = lang_detect_compute();

	if (verdict->lang1 >= 0 && verdict->ratio >= verdict->min_ratio)
		return lang_detect_lang_flags[ verdict->lang1 ].id;
	return LANG_UNKNOWN;
}
