/*
 *	Sherlock Indexer -- Merging Documents According to Signatures
 *
 *	(c) 2002--2004, Robert Spalek <robert@ucw.cz>
 */

#include "sherlock/sherlock.h"
#include "lib/conf.h"
#include "lib/fastbuf.h"
#include "sherlock/object.h"
#include "lib/bitarray.h"
#include "indexer/indexer.h"
#include "indexer/attrs.h"
#include "indexer/merges.h"
#include "indexer/matcher.h"

#include <stdlib.h>
#include <time.h>
#include <alloca.h>

/* Number of consecutive hashes that make up the key the signatures are sorted on.  */
#define	COMPARED	8
/*
 * Sorting key of one signature record: COMPARED consecutive hashes taken from
 * the record, beginning at index starting_key (see sign_fetch_key()).
 */
struct sign_key 
{
	u32 sign[COMPARED];	/* hashes compared one by one in sign_compare() */
	area_t area;		/* NOTE(review): neither filled by sign_fetch_key() nor
				 * compared by sign_compare() in the code visible here;
				 * confirm whether it is still needed */
};

/*
 * View of one record of the signatures file as it is held in memory by
 * process_block(): the card ID followed directly by the hashes.  The record
 * length is computed in find_similarities() as
 * sizeof(uns) + matcher_signatures * sizeof(u32), so sign[] is indexed
 * from 0 to matcher_signatures-1.
 */
struct signature
{
	uns cardid;		/* ID of the card the hashes belong to */
	u32 sign[0];		/* hashes; zero-length array serving as a variable-length tail */
};

/* Opened files, computed constants, and counters.  */
/* One bit per card; set in main() for empty, image and frameset cards, which are skipped when sorting.  */
static bitarray_t untouchable;
/* Optional output of the matched pairs (opened in main() only if fn_matches is set, otherwise NULL).  */
static struct fastbuf *matches;
/*
 * distinct_limit: maximum number of differing hashes still treated as similar
 *                 (matcher_signatures - matcher_threshold),
 * record_size:    length of one record of the signatures file in bytes,
 * starting_key:   index of the first hash of the sorting key in the current pass.
 */
static uns distinct_limit, record_size, starting_key;
/*
 * Per-pass counters: records skipped by sign_fetch_key() and pairs merged by
 * process_block().
 */
static uns skipped, similar;

/*** Sorting the Signatures file.  ***/

/*
 * Parameters of the sorter template that is pulled in by the #include of
 * lib/sorter.h below.  The sign_* functions defined in between are the
 * callbacks named through SORT_PREFIX, and the generated entry point is
 * sign_sort(), which find_similarities() calls with a file name and whose
 * result it uses as a struct fastbuf *.
 * NOTE(review): the precise meaning of the individual switches is defined by
 * lib/sorter.h, which is not visible here.
 */
#define SORT_KEY struct sign_key
#define SORT_PREFIX(x) sign_##x
#define SORT_PRESORT
#define SORT_INPUT_FILE
#define SORT_OUTPUT_FB

/*
 * Sorter callback: order two keys by their hashes, the hash at index 0 being
 * the most significant one.  Falls through to 0 when all COMPARED hashes pass
 * COMPARE() without it leaving the function.
 * NOTE(review): COMPARE() is a project macro; presumably it returns a
 * negative/positive value from this function on the first difference.
 */
static inline int
sign_compare(struct sign_key *a, struct sign_key *b)
{
	int pos = 0;

	while (pos < COMPARED)
	{
		COMPARE(a->sign[pos], b->sign[pos]);
		pos++;
	}
	return 0;
}

static inline int
sign_fetch_key(struct fastbuf *f, struct sign_key *k)
{
	while (1)
	{
		sh_off_t curr_pos, new_pos;
		u32 cardid;
		int res;

		curr_pos = btell(f);
		if ((cardid = bgetl(f)) == ~0U)
			return 0;
		if (bit_array_isset(untouchable, cardid)
		|| (int) merges[cardid] >= 0)
		{
			/* Untouchable or already marked as a duplicate.  */
			bsetpos(f, curr_pos + record_size);
			skipped++;
			continue;
		}
		new_pos = curr_pos + sizeof(uns) + starting_key * sizeof(u32);
		bsetpos(f, new_pos);
		res = breadb(f, k, COMPARED * sizeof(u32));
		bsetpos(f, curr_pos);
		return res;
	}
}

/*
 * Sorter callback: transfer one whole record (record_size bytes) from src to
 * dest by means of bbcopy().  The key is not needed for that, hence UNUSED.
 */
static inline void
sign_copy_data(struct fastbuf *src, struct fastbuf *dest, struct sign_key *k UNUSED)
{
	bbcopy(src, dest, record_size);
}

/*
 * Sorter callback: read one whole record into the memory that immediately
 * follows the key k.
 *
 * Returns NULL, without reading anything, if record_size bytes do not fit
 * below `limit'; otherwise returns the address just past the stored record.
 */
static inline byte *
sign_fetch_item(struct fastbuf *f, struct sign_key *k, byte *limit)
{
	byte *payload = (byte *) (k + 1);
	byte *payload_end = payload + record_size;

	if (payload_end > limit)
		return NULL;
	breadb(f, payload, record_size);
	return payload_end;
}

/*
 * Sorter callback: write out the record kept in memory right behind the key k
 * (the place where sign_fetch_item() puts it), record_size bytes in total.
 */
static inline void
sign_store_item(struct fastbuf *f, struct sign_key *k)
{
	bwrite(f, k + 1, record_size);
}

#include "lib/sorter.h"

/*
 * Choose a random index of the first hash of the sorting key.
 *
 * The result lies between 0 and matcher_signatures - COMPARED inclusive, so
 * that COMPARED consecutive hashes starting there exist, and it differs from
 * all of forbidden_keys[0 .. count-1].
 *
 * The caller must guarantee that matcher_signatures >= COMPARED and that at
 * least one index is not forbidden, otherwise the loop never terminates.
 */
static uns
find_random_key(uns *forbidden_keys, uns count)
{
	while (1)
	{
		uns key, i;
		/*
		 * Use rand(), because the generator is seeded by srand() in
		 * find_similarities().  random() is a separate generator in
		 * ISO C/POSIX and srand() is not guaranteed to seed it, which
		 * would yield the same keys in every run.
		 */
		key = rand() % (matcher_signatures - COMPARED + 1);
		for (i=0; i<count; i++)
			if (key == forbidden_keys[i])
				break;
		/* The scan ran to its end, hence the key has not been used yet.  */
		if (i >= count)
			return key;
	}
}

/*** Processing the sorted Signatures file.  ***/

static inline int
similar_documents(struct signature *a, struct signature *b)
{
	uns distinct = 0, i;
	for (i=0; i<matcher_signatures; i++)
		if (a->sign[i] != b->sign[i])
		{
			if (distinct >= distinct_limit)
				return 0;
			distinct++;
		}
	if (matches)
	{
		char tmp[128];
		sprintf(tmp, "%x\t%x\t%d\n", a->cardid, b->cardid, distinct);
		bputs(matches, tmp);
	}
	return 1;
}

static int
process_block(struct fastbuf *f, void *buffer)
{
	struct signature *first = buffer, *curr;
	uns read = 0;
	int eof = 0;
	int i, j;

	if (!breadb(f, buffer, record_size))
		return 0;
	buffer += record_size;
	read++;
	ASSERT(matcher_block > 1);
	while (read < matcher_block)
	{
		if (!breadb(f, buffer, record_size))
		{
			eof = 1;
			break;
		}
		curr = buffer;
		if (curr->sign[starting_key] != first->sign[starting_key]
		    )
			break;
		buffer += record_size;
		read++;
	}
	buffer = first;
	/* If !eof, we have read one signature more than we wanted (either we
	 * have reached a new key hash, or the buffer is full and we want to
	 * overlap the last record).  */
	if (!eof)
		bseek(f, (int) -record_size, SEEK_CUR);
	for (i=1; i<(int) read; i++)
	{
		curr = buffer + i * record_size;
		for (j=i-1; j>=0; j--)
		{
			first = buffer + j * record_size;
			if (similar_documents(first, curr))
			{
				similar++;
				merges_union(first->cardid, curr->cardid);
				break;
			}
		}
	}
	return !eof;
}

/*
 * Walk through a signature file sorted on a randomly chosen hash and look for
 * similar documents.
 *
 * Similar documents have about 90% of their hashes equal, so it is highly
 * probable that the sorting key falls among the equal hashes in at least one
 * of the passes (99% for 2 passes, 99.9% for 3 passes).  Therefore it is
 * enough to examine the documents separately for every value of the sorting
 * key: the signatures sharing one key value are read into memory and compared
 * with each other.  Reading all of them at once would be ideal, but as the
 * comparison is quadratic, they are processed in blocks of limited size
 * (matcher_block records; the original note speaks of 64), which does not
 * matter so much.
 *
 * The block buffer is taken from the stack by alloca(), so it disappears on
 * return from this function.
 */
static void
process_file(struct fastbuf *signatures)
{
	void *block = alloca(matcher_block * record_size);
	int more;

	do
		more = process_block(signatures, block);
	while (more);
}

/*
 * Run all matcher passes.
 *
 * Every pass chooses a sorting key that no previous pass has used, sorts the
 * signatures file on it by sign_sort(), and lets process_file() merge the
 * similar documents found in the sorted stream.  The numbers of skipped
 * records and of merged pairs are logged per pass.
 *
 * The configuration is validated first and the program dies on values that
 * cannot work: too few signatures, a threshold above the number of
 * signatures, or more passes than there are distinct sorting keys.
 */
static void
find_similarities(void)
{
	uns forbid[matcher_passes];
	uns key_count;
	uns i;

	/* Seed the generator from the wall clock, so that the keys differ between runs.  */
	srand(time(NULL));
	if (matcher_signatures < 2*COMPARED)
		die("Too small value Matcher.Signatures = %d, minimal value is %d", matcher_signatures, 2*COMPARED);
	/* Otherwise the unsigned subtraction below wraps around and every
	 * pair of documents in a block is considered similar.  */
	if (matcher_threshold > matcher_signatures)
		die("Matcher threshold %d exceeds the number of signatures %d", matcher_threshold, matcher_signatures);
	/* find_random_key() never returns once all possible keys are forbidden.  */
	key_count = matcher_signatures - COMPARED + 1;
	if (matcher_passes > key_count)
		die("Too many matcher passes (%d), only %d distinct sorting keys are available", matcher_passes, key_count);
	distinct_limit = matcher_signatures - matcher_threshold;
	record_size = sizeof(uns) + matcher_signatures * sizeof(u32);

	for (i=0; i<matcher_passes; i++)
	{
		struct fastbuf *signatures;

		/* The keys of the previous passes occupy forbid[0 .. i-1].  */
		starting_key = forbid[i] = find_random_key(forbid, i);
		skipped = 0;
		signatures = sign_sort(index_name(fn_signatures));
		log(L_INFO, "Sorted signatures on the %d-th hash, skipped %d documents", starting_key, skipped);

		similar = 0;
		process_file(signatures);
		bclose(signatures);
		if (similar)
			log(L_INFO, "Found %d similar documents", similar);
	}
}

/*
 * Entry point.  Accepts only the configuration options handled by
 * cf_getopt(); anything else prints the usage and exits with status 1.
 *
 * If matcher_signatures is zero, there is nothing to do.  Otherwise the
 * merges are mapped, the cards flagged as empty, image or frameset are marked
 * as untouchable, the optional file of matches is opened, and
 * find_similarities() is run.
 */
int
main(int argc, char **argv)
{
	int opt;
	uns card;

	log_init(argv[0]);
	opt = cf_getopt(argc, argv, CF_SHORT_OPTS, CF_NO_LONG_OPTS, NULL);
	if (opt >= 0 || optind < argc)
	{
		fputs("This program supports only the following command-line arguments:\n" CF_USAGE, stderr);
		exit(1);
	}

	if (!matcher_signatures)
		return 0;

	log(L_INFO, "Browsing attributes");
	merges_map(1);
	untouchable = xmalloc(BIT_ARRAY_BYTES(card_count));
	bit_array_zero(untouchable, card_count);
	attrs_part_map(0);
	/* Cards with any of these flags never take part in the merging.  */
	card = 0;
	while (card < card_count)
	{
		if (bring_attr(card)->flags & (CARD_FLAG_EMPTY | CARD_FLAG_IMAGE | CARD_FLAG_FRAMESET))
			bit_array_set(untouchable, card);
		card++;
	}
	attrs_part_unmap();

	log(L_INFO, "Merging documents according to signatures");
	/* The list of matched pairs is written only when its file name is set.  */
	matches = fn_matches ? index_bopen(fn_matches, O_CREAT | O_TRUNC | O_WRONLY) : NULL;

	find_similarities();

	xfree(untouchable);
	merges_unmap();
	if (matches)
		bclose(matches);

	return 0;
}
