/*
 *	Sherlock Utilities -- Index Dumper
 *
 *	(c) 2001--2004 Robert Spalek <robert@ucw.cz>
 *	(c) 2002--2004 Martin Mares <mj@ucw.cz>
 */

#include "sherlock/sherlock.h"
#include "lib/conf.h"
#include "lib/fastbuf.h"
#include "sherlock/bucket.h"
#include "lib/chartype.h"
#include "sherlock/object.h"
#include "lib/url.h"
#include "lib/unicode.h"
#include "sherlock/lizard-fb.h"
#include "sherlock/tagged-text.h"
#include "charset/charconv.h"
#include "charset/unicat.h"
#include "charset/fb-charconv.h"
#include "indexer/indexer.h"
#include "indexer/lexicon.h"
#include "indexer/params.h"
#include "utils/dumpconfig.h"
#include "lang/lang.h"

#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <alloca.h>

static int term_charset_id;			/* Charset ID of the terminal, looked up in main() */
static struct lizard_buffer *liz_buf;		/* Decompression buffer, allocated in main() only when cards are dumped */
static struct fastbuf *output;			/* Stream all dump routines write to */
static int verbose;				/* Expand object attribute names?  */
static int bare;				/* Avoid headings */
static int raw;					/* Do not parse cards (if set more than once, dump_card() does not even decompress them) */

/* Maximal input line length (BUFSIZE) and the length at which dump_card() wraps attribute values (LINE_LEN).  */
#define	BUFSIZE		2048
#define	LINE_LEN	512

/*
 * Dump one record of a card attribute file.
 *
 * id  - ordinal number of the record (only the low 32 bits are printed)
 * tmp - pointer to a struct card_attr, or NULL to request the table heading
 *       (the heading is printed in the non-verbose mode only)
 */
static void
dump_card_attr(u64 id, void *tmp)
{
	struct card_attr *a = tmp;
	byte *attrs = "EADMIFO*";
	byte at[9];
	byte tf[32];
	uns i;

	if (!a)
	{
		if (!verbose)
			bprintf(output, "ID       Card     SiteID     Area Wgt %s Age Type\n", attrs);
		return;
	}
	/* One letter for every flag bit which is set, a dash for every clear one.  */
	for (i=0; i<8; i++)
	  at[i] = (a->flags & (1<<i)) ? attrs[i] : '-';
	at[i] = 0;
#ifdef CONFIG_FILETYPE
	/*
	 * The names are taken from tables defined outside this file, so format
	 * them with an explicit bound instead of trusting them to fit into tf.
	 */
	int used = snprintf(tf, sizeof(tf), "%02x %s", a->type_flags, custom_file_type_names[CA_GET_FILE_TYPE(a)]);
	if (used < 0)
	{
		used = 0;
		tf[0] = 0;
	}
	else if (used >= (int) sizeof(tf))
		used = sizeof(tf) - 1;			/* Output was truncated, continue at the terminator */
	byte *x = tf + used;
	uns room = sizeof(tf) - used;
#ifdef CONFIG_LANG
	if (!(a->type_flags & 0x80))
	  snprintf(x, room, ", %s", lang_code_to_name(CA_GET_FILE_LANG(a)));
#endif
	if (a->type_flags & 0x80)
		snprintf(x, room, " other: %d/%d", CA_GET_FILE_TYPE(a), CA_GET_FILE_INFO(a));
#else
	tf[0] = 0;
#endif
	if (verbose)
	{
		bprintf(output, "Attribute %x:\n", (u32) id);
		bprintf(output, "Card:\t%x\n", a->card);
		bprintf(output, "Weight:\t%d\n", a->weight);
		bprintf(output, "Flags:\t%s\n", at);
#ifdef CONFIG_LASTMOD
		bprintf(output, "Age:\t%d\n", a->age);
#endif
#ifdef CONFIG_FILETYPE
		bprintf(output, "Type:\t%s\n", tf);
#endif
		/* Expand the list of custom attributes into one output line per attribute.  */
#define INT_ATTR(id,keywd,gf,pf) bprintf(output, "Custom " #id ":\t%x\n", gf(a));
#define SMALL_SET_ATTR(id,keywd,gf,pf) bprintf(output, "Custom " #id ":\t%x\n", gf(a));
#define LATE_INT_ATTR INT_ATTR
#define LATE_SMALL_SET_ATTR SMALL_SET_ATTR
  EXTENDED_ATTRS
#undef INT_ATTR
#undef SMALL_SET_ATTR
#undef LATE_INT_ATTR
#undef LATE_SMALL_SET_ATTR
		bputc(output, '\n');
	}
	else
	{
		/* The SiteID and Area columns are always printed as zero.  */
		bprintf(output, "%8x %8x %08x %6d %3d %s %3d %s\n",
			(u32) id, a->card,
		       0,
		       0,
		       a->weight, at,
#ifdef CONFIG_LASTMOD
		       a->age,
#else
		       0,
#endif
		       tf
		       );
	}
}

/*
 * Dump one record of a notes file.
 *
 * id  - ordinal number of the record (only the low 32 bits are printed)
 * tmp - pointer to a struct card_note, or NULL to request the table heading
 */
static void
dump_note(u64 id, void *tmp)
{
	struct card_note *n = tmp;
	byte *attrs = "GOI*****";
	byte at[9];

	if (!n)
	{
		bprintf(output, "ID         Area USize  Scn Ook Dyn Mrg %s Footprint\n", attrs);
		return;
	}
	/* One letter for every flag bit which is set, a dash for every clear one.  */
	for (uns i=0; i<8; i++)
	  at[i] = (n->flags & (1<<i)) ? attrs[i] : '-';
	at[8] = 0;
	/*
	 * The format has one conversion per heading column, so every column needs
	 * an argument.  Area, Ook and Dyn have no data here and are printed as zero;
	 * leaving any of them out would make %s consume a non-existent argument.
	 */
	bprintf(output, "%8x %6d %6d %3d %3d %3d %3d %s ", (u32) id,
	       0,
	       n->useful_size,
	       n->weight_scanner,
	       0,
	       0,
	       n->weight_merged,
	       at);
	/* The 16-byte footprint is printed as two hexadecimal halves separated by a colon.  */
	for (uns i=0; i<16; i++)
	{
		if (i == 8)
			bputc(output, ':');
		bprintf(output, "%02x", n->footprint[i]);
	}
	bputc(output, '\n');
}

/*
 * Dump one record of the checksum file: a 16-byte MD5 and the card ID.
 * A NULL record requests the table heading (non-verbose mode only).
 */
static void
dump_checksum(u64 id, void *tmp)
{
	struct csum *sum = tmp;

	if (!sum)
	{
		if (!verbose)
			bprintf(output, "Pos      MD5                              CardID\n");
		return;
	}

	/* Leading part: position of the record.  */
	if (verbose)
	{
		bprintf(output, "Checksum at %x:\n", (u32) id);
		bprintf(output, "MD5:\t");
	}
	else
		bprintf(output, "%08x ", (u32) id);

	/* The digest itself is printed identically in both modes.  */
	for (uns k = 0; k < 16; k++)
		bprintf(output, "%02x", sum->md5[k]);

	/* Trailing part: the card ID.  */
	if (verbose)
	{
		bprintf(output, "\n");
		bprintf(output, "Card:\t%08x\n\n", sum->cardid);
	}
	else
		bprintf(output, " %08x\n", sum->cardid);
}

/*
 * Dump one fingerprint record: a 12-byte hash and the card ID.
 * A NULL record requests the table heading (non-verbose mode only).
 */
static void
dump_fingerprint(u64 id, void *tmp)
{
	struct fprint *rec = tmp;

	if (!rec)
	{
		if (!verbose)
			bprintf(output, "ID       Fingerprint              CardID\n");
		return;
	}

	/* Leading part: position of the record.  */
	if (verbose)
	{
		bprintf(output, "Fingerprint at %x:\n", (u32) id);
		bprintf(output, "Hash:\t");
	}
	else
		bprintf(output, "%08x ", (u32) id);

	/* The hash itself is printed identically in both modes.  */
	for (int k = 0; k < 12; k++)
		bprintf(output, "%02x", rec->fp.hash[k]);

	/* Trailing part: the card ID.  */
	if (verbose)
	{
		bprintf(output, "\n");
		bprintf(output, "Card:\t%08x\n\n", rec->cardid);
	}
	else
		bprintf(output, " %08x\n", rec->cardid);
}

/*
 * Dump one record of the signatures file: a card ID followed by
 * matcher_signatures signature words.
 * A NULL record requests the table heading (non-verbose mode only).
 */
static void
dump_signatures(u64 id, void *tmp)
{
	uns *rec = tmp;

	if (!rec)
	{
		if (!verbose)
			bprintf(output, "Pos      CardID   Signatures\n");
		return;
	}

	uns card = rec[0];
	uns *sigs = rec + 1;
	uns k;

	if (!verbose)
	{
		bprintf(output, "%08x ", (u32) id);
		bprintf(output, "%08x", card);
		for (k = 0; k < matcher_signatures; k++)
			bprintf(output, " %08x", sigs[k]);
		bprintf(output, "\n");
		return;
	}

	bprintf(output, "Checksum at %x:\n", (u32) id);
	bprintf(output, "Card:\t%08x\n", card);
	bprintf(output, "Signatures:\t");
	for (k = 0; k < matcher_signatures; k++)
		bprintf(output, "%08x ", sigs[k]);
	bprintf(output, "\n\n");
}

/*
 * Dump one card from the card file.
 *
 * start - byte position of the card; it must be a multiple of
 *         (1 << CARD_POS_SHIFT), the card ID is start >> CARD_POS_SHIFT
 * tmp   - the input fastbuf positioned at the card, or NULL when the caller
 *         asks for a heading (cards have none)
 *
 * The global `raw' selects the output format:
 *   raw == 0	the card is decompressed, parsed to attributes and re-emitted
 *		with long values wrapped to lines of about LINE_LEN bytes
 *   raw == 1	the decompressed card is written as is
 *   raw > 1	the bucket header is printed and the payload is copied
 *		without being decompressed
 * On return the input is advanced to the next card boundary.
 */
static void
dump_card(u64 start, void *tmp)
{
	struct fastbuf *f = tmp;
	u32 id = start >> CARD_POS_SHIFT;
	byte *buf;
	int buf_len;
	uns buf_type;

	if (!f)
		return;
	ASSERT(!(start & ((1 << CARD_POS_SHIFT) - 1)));
	if (raw > 1)
	{
		/* Read the bucket header by hand: type, length and (for compressed buckets) two more words.  */
		buf_type = bgetl(f);
		if (buf_type < BUCKET_TYPE_PLAIN || buf_type > BUCKET_TYPE_V33_LIZARD)
			die("Cannot parse card %08x: bucket_type=%08x", id, buf_type);
		buf_len = bgetl(f);
		bput_attr_format(output, '#', "## %08x %06d %08x", id, buf_len, buf_type);
		if (buf_type == BUCKET_TYPE_V33_LIZARD)
		{
			uns orig_len = bgetl(f);
			uns adler = bgetl(f);
			bput_attr_format(output, '#', "## orig_len=%d adler=%08x", orig_len, adler);
		}
	}
	else
	{
		/* Let the lizard reader fetch the card; buf and buf_type are filled in by the call.  */
		buf_len = lizard_bread(liz_buf, f, &buf, &buf_type);
		if (buf_len < 0)
			die("Cannot parse card %08x: %m", id);
	}
	if (raw > 1)
		bbcopy(f, output, buf_len);
	else if (raw)
		bwrite(output, buf, buf_len);
	else
	{
		get_attr_set_type(buf_type);
		put_attr_set_type(BUCKET_TYPE_V30);
		bput_attr_format(output, '#', "## %08x %06d %08x", id, buf_len, buf_type);
		byte *buf_end = buf + buf_len;
		while (1)							/* Dump all attributes.  */
		{
			struct parsed_attr attr;
			int i = get_attr(&buf, buf_end, &attr);
			/* A negative result ends the card, zero is echoed as an attribute separator.  */
			if (i < 0)
				break;
			if (!i)
			{
				bput_attr_separator(output);
				continue;
			}
			/* An empty value would never enter the wrapping loop below, so emit it here.  */
			if (!attr.len)
				bput_attr(output, attr.attr, "", 0);
			byte *attr_end = attr.val + attr.len;
			while (attr.len)					/* Wrap into short lines.  */
			{
				uns print = attr.len;
				if (print > LINE_LEN)				/* Do not use line_len, this is for 2nd wrapping done by objdump.  */
				{
					/*
					 * Search for a break point behind the first LINE_LEN bytes: stop at
					 * white space, at a control character found 64 or more bytes into the
					 * search, or 128 or more bytes into the search at any byte which is
					 * not in the range 0x80-0xbf.
					 */
					byte *start = attr.val + LINE_LEN, *c = start;
					while (c < attr_end && !Cspace(*c)
					&& (!Cctrl(*c) || c < start + 64)
					&& ((*c >= 0x80 && *c < 0xc0) || c < start + 128))
						c++;
					print = c - attr.val;
				}
				bput_attr(output, attr.attr, attr.val, print);	/* Do not recode, done by objdump.  */
				attr.val += print;
				attr.len -= print;
			}
		}
	}
	/* Skip the padding up to the next card boundary.  */
	while (btell(f) & ((1 << CARD_POS_SHIFT) - 1))
		bgetc(f);
}

/*
 * Header of one record of the labels file, read verbatim by dump_labels().
 * The header is followed by `count' bytes of label text.
 */
struct lab {
	u32 merged_id;		/* Printed as "Merged ID" */
	u32 url_id;		/* Printed as "URL ID" */
	u32 redir_id;		/* Printed as "Redirect ID" */
	u32 count;		/* Number of bytes of label text following the header */
} PACKED;

/*
 * Printable names of word types (8 slots) and meta types (16 slots), indexed
 * by the type number.  The initializers come from macros defined elsewhere;
 * slots they do not cover are NULL.
 */
static byte *wt_names[8] = { WORD_TYPE_USER_NAMES };
static byte *mt_names[16] = { META_TYPE_USER_NAMES };

/*
 * Read a type marker from f and print the name of the type to output.
 *
 * The marker may be preceded by a single character '0'..'3', which is echoed.
 * The marker itself is expected to be 0x90 + type number; anything else is
 * reported as " [error=...]", and the low four bits are still looked up.
 *
 * meta selects the table of names: meta types if non-zero, word types otherwise.
 * Returns the number of bytes consumed from f (1 or 2).
 */
static uns
dump_type_name(struct fastbuf *f, struct fastbuf *output, uns meta)
{
	byte **names = meta ? mt_names : wt_names;
	/* The tables differ in size, so the 4-bit index has to be checked against the right one.  */
	uns limit = meta ? sizeof(mt_names) / sizeof(mt_names[0]) : sizeof(wt_names) / sizeof(wt_names[0]);
	int c = bgetc(f);
	uns shift = 1;
	ASSERT(c != EOF);
	if (c >= '0' && c <= '3')
	{
		bputc(output, ' ');
		bputc(output, c);
		c = bgetc(f);
		shift++;
	}
	if ((c & 0xf0) != 0x90)
		bprintf(output, " [error=%x]", c);
	uns idx = c & 0x0f;
	/* Never index past the table and never pass an unset (NULL) slot to %s.  */
	if (idx < limit && names[idx])
		bprintf(output, " [%s] ", names[idx]);
	else
		bprintf(output, " [???] ");
	return shift;
}

/*
 * Dump one record of the labels file: a struct lab header followed by
 * hdr.count bytes of tagged text.  Called with NULL for the heading,
 * which this file type does not have.
 */
static void
dump_labels(u64 start, void *tmp)
{
	struct fastbuf *f = tmp;
	struct lab hdr;

	if (!f)
		return;

	breadb(f, &hdr, sizeof(hdr));
	bprintf(output, "Label at %08qx:\n", start);
	bprintf(output, "Merged ID:\t%08x\n", hdr.merged_id);
	bprintf(output, "URL ID:\t\t%08x\n", hdr.url_id);
	bprintf(output, "Redirect ID:\t%08x\n", hdr.redir_id);
	bprintf(output, "Count:\t%d\n", hdr.count);
	bprintf(output, "Labels:\n\t");

	sh_off_t end = btell(f) + hdr.count;
	for (;;)
	{
		if (btell(f) >= end)
			break;
		uns ch = bget_tagged_char(f);
		if (ch >= 0x80001000)			/* Unknown tag */
			bprintf(output, " [???] ");
		else if (ch >= 0x80000000)		/* Meta type tag */
			bprintf(output, " [%s] ", mt_names[ch & 0xf]);
		else if (ch)				/* Ordinary character */
			bput_utf8(output, ch);
		else					/* Zero separates the labels */
			bprintf(output, "\n\t");
	}
	bprintf(output, "\n");
}

/*
 * Dump one block of the labels-by-id file: a 32-bit URL ID followed by
 * a sequence of labels.  Called with NULL for the heading, which this
 * file type does not have.
 */
static void
dump_labels_id(u64 start, void *tmp)
{
	struct fastbuf *f = tmp;

	if (!f)
		return;

	u32 orig_id = bgetl(f);
	bprintf(output, "Label block at %08qx for URL %08x\n\t", start, orig_id);
	while (1)
	{
		/* Every label starts with a single byte which is echoed verbatim.  */
		int c;
		c = bgetc(f);
		if (c == EOF)
		{
			bprintf(output, "\n");
			break;
		}
		bputc(output, c);
		/* Tagged text of the label, terminated by zero or by ~0U.  */
		uns x;
		while ((x = bget_tagged_char(f)) != 0 && x != ~0U)
		{
			if (x < 0x80000000)
				bput_utf8(output, x);
			else if (x < 0x80001000)
				bprintf(output, " [%s] ", mt_names[x & 0xf]);
			else
				bprintf(output, " [???] ");
		}
		bprintf(output, "\n\t");
		/* Peek at the next byte: zero or EOF ends the block, anything else starts another label.  */
		c = bgetc(f);
		if (!c || c == EOF)
			break;
		bungetc(f);
	}
	bprintf(output, "\n");
}

/* Printable names of lexicon word classes, indexed by the 3-bit class number.  */
static char *word_classes[] = { "cplx", "ignr", "word", "garb", "ctxt", "brek", "????", "????" };

/* Copy a word of `len' bytes (at most MAX_WORD_LEN) from f to the output.  */
static void
dump_lex_word(struct fastbuf *f, uns len)
{
  byte text[MAX_WORD_LEN+1];

  ASSERT(len <= MAX_WORD_LEN);

  breadb(f, text, len);
  bwrite(output, text, len);
}

/*
 * Print a single lexicon entry.
 *
 * l  - the fixed-size part of the entry, already read by the caller
 * id - word ID without the class bits, which are merged in here
 * f  - input positioned at the text of the word (l->length bytes)
 */
static void
dump_lex_entry(struct lex_entry *l, uns id, struct fastbuf *f)
{
	sh_off_t ref_pos = GET_O(l->ref_pos);
	uns chlen = GET_U16(l->ch_len) << 12;
	char *cls_name = (l->class > 7) ? "????" : word_classes[l->class];
	uns ctxt = 0;
	uns freq = 0;

	id |= l->class;
	/* Context and frequency exist only in some configurations, otherwise they are shown as zero.  */
#ifdef CONFIG_CONTEXTS
	ctxt = l->ctxt;
#endif
#ifdef CONFIG_SPELL
	freq = l->freq;
#endif

	if (!verbose)
	{
		bprintf(output, "%8x %8qx+%07x %s %3d %04x %3d ", id, (long long) ref_pos, chlen,
		       cls_name, freq, ctxt, l->length);
	}
	else
	{
		bprintf(output, "Word ID %x:\n", id);
		bprintf(output, "References:\t%08qx + %07x\n", (long long) ref_pos, chlen);
		bprintf(output, "Class:\t\t%s\n", cls_name);
		bprintf(output, "Frequency:\t%d\n", freq);
		bprintf(output, "Context:\t%04x\n", ctxt);
		bprintf(output, "Length:\t\t%d\n", l->length);
		bprintf(output, "Word:\t\t");
	}
	dump_lex_word(f, l->length);
	bprintf(output, verbose ? "\n\n" : "\n");
}

/*
 * Dump the whole final lexicon.  The first call (start == 0) reads the two
 * counters and then all entries; if the caller finds more data behind them,
 * the next call just reports it as garbage.
 * A NULL input requests the table heading (non-verbose mode only).
 */
static void
dump_lexicon(u64 start, void *tmp)
{
	struct fastbuf *f = tmp;

	if (!f)
	{
		if (!verbose)
			bprintf(output, "ID       RefPos+Len       Flgs Frq Ctxt Len Word\n");
		return;
	}
	if (start)
	{
		bprintf(output, "# Garbage found after end of lexicon!\n");
		return;
	}

	u32 wc = bgetl(f);
	u32 cc = bgetl(f);
	bprintf(output, "# Word count:\t%d\n", wc);
	bprintf(output, "# Complex count:\t%d\n", cc);

	/* Word IDs advance by 8, the low three bits are left for the class.  */
	uns wordid = 8;
	uns remaining = wc + cc;
	while (remaining--)
	{
		struct lex_entry entry;
		breadb(f, &entry, sizeof(entry));
		dump_lex_entry(&entry, wordid, f);
		wordid += 8;
	}
}

/*
 * Dump one item of the lexicon-words file: the word counter at position 0,
 * a single lexicon entry anywhere else.  The word ID is kept across calls.
 * A NULL input requests the table heading (non-verbose mode only).
 */
static void
dump_lex_words(u64 start, void *tmp)
{
	static uns wordid;
	struct fastbuf *f = tmp;

	if (!f)
	{
		if (!verbose)
			bprintf(output, "ID       RefPos+Len       Flgs Frq Ctxt Len Word\n");
		return;
	}

	if (start)
	{
		struct lex_entry entry;
		breadb(f, &entry, sizeof(entry));
		dump_lex_entry(&entry, wordid, f);
		wordid += 8;
		return;
	}

	/* Start of the file: print the counter and restart the numbering.  */
	u32 wc = bgetl(f);
	bprintf(output, "# Word count:\t%d\n", wc);
	wordid = 8;
}

/*
 * Dump one item of a temporary lexicon file: the word counter at position 0,
 * a single word record anywhere else.
 * A NULL input requests the table heading (non-verbose mode only).
 */
static void
dump_lex_temp(u64 start, void *tmp)
{
	struct fastbuf *f = tmp;

	if (!f)
	{
		if (!verbose)
			bprintf(output, "ID       Count      Flgs Ctxt Len Word\n");
		return;
	}
	if (!start)
	{
		u32 wc = bgetl(f);
		bprintf(output, "# Word count:\t%d\n", wc);
		return;
	}

	/* Record layout: ID, count, context, length byte, text of the word.  */
	u32 id = bgetl(f);
	u32 count = bgetl(f);
	uns context = bget_context(f);
	uns length = bgetc(f);
	char *cls_name = word_classes[id & 7];		/* The class is kept in the low 3 bits of the ID */

	if (!verbose)
	{
		bprintf(output, "%8x %10d %s %04x %3d ", id, count, cls_name, context, length);
	}
	else
	{
		bprintf(output, "Word ID %x:\n", id);
		bprintf(output, "Count:\t\t%d\n", count);
		bprintf(output, "Word class:\t\t%s\n", cls_name);
		bprintf(output, "Ctxt class:\t\t%d\n", context);
		bprintf(output, "Length:\t\t%d\n", length);
		bprintf(output, "Word:\t\t");
	}
	dump_lex_word(f, length);
	bprintf(output, verbose ? "\n\n" : "\n");
}

/*
 * Print statistics of the raw lexicon: for every word length (in characters)
 * and every kind of word, the number of words and the total size of their
 * records.  The whole file is processed by the call made for position 0;
 * calls for any other position and the heading request do nothing.
 */
static void
dump_lex_stats(u64 start, void *tmp)
{
	struct fastbuf *f = tmp;

	if (!f || start)
		return;

	enum wtype { TOTAL, BARE, ACCENTED, DIGITS, MIXED, NTYPES };
	byte *tnames[] = { "Total", "Bare", "Accented", "Numbers", "Mixed" };
	/* Row 0 of both tables accumulates the sums over all lengths.  */
	uns wcnt[MAX_WORD_LEN+1][NTYPES];
	uns wlen[MAX_WORD_LEN+1][NTYPES];
	uns cnt = bgetl(f);
	bzero(wcnt, sizeof(wcnt));
	bzero(wlen, sizeof(wlen));
	while (cnt--)
	{
		/* The two leading words and the context of the record are skipped.  */
		bgetl(f);
		bgetl(f);
		bget_context(f);
		uns len = bgetc(f);
		/* Size accounted to the record: 9 bytes of overhead plus the bytes of the word.  */
		uns reclen = 9 + len;
		ASSERT(len && len <= MAX_WORD_LEN);
		byte buf[MAX_WORD_LEN];
		bread(f, buf, len);
		int alpha = 0;
		int digits = 0;
		int accents = 0;
		/* Classify the characters: everything which is not an ASCII digit counts as alpha.  */
		for (byte *x=buf; x<buf+len;)
		{
			uns c;
			GET_UTF8(x, c);
			if (c >= '0' && c <= '9')
				digits++;
			else
				alpha++;
			if (Uunaccent(c) != c)
				accents++;
		}
		enum wtype t;
		if (!digits) {
			if (!accents)
				t = BARE;
			else
				t = ACCENTED;
		} else {
			if (alpha)
				t = MIXED;
			else
				t = DIGITS;
		}
		/* From now on, len is the length in characters instead of bytes.  */
		len = digits+alpha;
		wcnt[len][t]++;
		wlen[len][t] += reclen;
		wcnt[len][TOTAL]++;
		wlen[len][TOTAL] += reclen;
		wcnt[0][t]++;
		wlen[0][t] += reclen;
		wcnt[0][TOTAL]++;
		wlen[0][TOTAL] += reclen;
	}
	bputs(output, "Words:\n");
	bprintf(output, "Len ");
	for (uns j=0; j<NTYPES; j++)
		bprintf(output, " %11s Cnt/Len", tnames[j]);
	bputc(output, '\n');
	for (uns i=0; i<=MAX_WORD_LEN; i++)
	{
		if (!i)
			bprintf(output, "Sum:");
		else
			bprintf(output, "%3d:", i);
		for (uns j=0; j<NTYPES; j++)
			bprintf(output, " %9d/%9d", wcnt[i][j], wlen[i][j]);
		bputc(output, '\n');
	}
}

/*
 * Dump one vertex of the link graph: a 32-bit vertex ID, a 16-bit degree
 * and that many 32-bit edges.  Called with NULL for the heading, which
 * this file type does not have.
 */
static void
dump_graph(u64 start, void *tmp)
{
	/* Edge kinds, selected by the two highest bits of the edge.  */
	byte *vtypes[4] = { "", " [redir]", " [frame]", " [img]" };
	struct fastbuf *f = tmp;

	if (!f)
		return;

	uns src = bgetl(f);
	uns deg = bgetw(f);
	bprintf(output, "Vertex %x (degree %d) at %08qx:\n", src, deg, start);

	for (uns k = 0; k < deg; k++)
	{
		u32 edge = bgetl(f);
		bprintf(output, "\t-> %x%s\n",
		       edge & ~ETYPE_MASK,
		       vtypes[edge >> 30U]);
	}
}

/*
 * Dump one entry of the link graph index: the ordinal number of the entry
 * (its byte position divided by BYTES_PER_O) and the file offset stored in it.
 * Called with NULL for the heading, which this file type does not have.
 */
static void
dump_graph_index(u64 start, void *tmp)
{
	struct fastbuf *f = tmp;
	if (!f)
		return;
	sh_off_t ofs = bgeto(f);
	/*
	 * Divide in 64 bits and narrow the result afterwards: a cast binds tighter
	 * than the division, so casting `start' itself would cut the position to
	 * 32 bits first and print wrong numbers for positions of 4 GB and more.
	 */
	bprintf(output, "%08x -> %08qx\n", (uns) (start / BYTES_PER_O), (u64) ofs);
}

/* Dump one record of a file consisting of plain 32-bit numbers; a NULL record (heading request) prints nothing.  */
static void
dump_u32(u64 id, void *tmp)
{
	if (tmp)
	{
		u32 value = *(u32 *) tmp;
		bprintf(output, "%08x -> %08x\n", (u32) id, value);
	}
}

/*
 * Dump one block of the references file.
 *
 * The block is a sequence of chains, each of them consisting of a 32-bit
 * object ID (stored as two 16-bit halves), a 16-bit count and the references
 * themselves.  A 16-bit word whose top four bits are zero ends the references
 * and at the same time serves as the high half of the next object ID; an
 * object ID of zero ends the block.
 * A NULL input requests the table heading (non-verbose mode only).
 */
static void
dump_refs(u64 start, void *tmp)
{
	struct fastbuf *f = tmp;
	u16 hi, lo;

	if (!f)
	{
		if (!verbose)
			bprintf(output, "Pos      OID      Count ref\n");
		return;
	}
	if (verbose)
		bprintf(output, "Reference block at %08qx:\n", start);
	hi = bgetw(f);
	while (1)
	{
		lo = bgetw(f);
		if (!hi && !lo)
			break;
		uns cnt = bgetw(f);
		/* Widen before shifting: a u16 promotes to signed int, where hi << 16 overflows for hi >= 0x8000.  */
		u32 oid = ((u32) hi << 16) | lo;
		if (verbose)
			bprintf(output, "OID:\t%08x\nCount:\t%d\n\t", oid, cnt);
		else
			/* 6 bytes (object ID and count) were read since the start of the chain.  */
			bprintf(output, "%8qx %8x %5d ", (long long)btell(f)-6, oid, cnt);
		while (1)
		{
			word ref = bgetw(f);
			if (!(ref >> 12))
			{
				/* End of the chain; all announced references must have been seen.  */
				hi = ref;
				ASSERT(!cnt);
				break;
			}
			if (verbose)
			{
				/* Below 0x8000: word type in the top 4 bits; otherwise a meta type in bits 11-14.  */
				if (ref < 0x8000)
					bprintf(output, "%04x(%03x:%s) ", ref, (ref & 0xfff), wt_names[ ref >> 12 ]);
				else
					bprintf(output, "%04x(%03x:%s:%d) ", ref, ((ref >> 2) & 0x1ff), mt_names[ ((ref >> 11) & 0xf) ], ref & 3);
			}
			else
				bprintf(output, "%04x ", ref);
			cnt--;
		}
		bprintf(output, "\n");
	}
	bprintf(output, "\n");
}

/*
 * Dump one record of the reference texts file: a 32-bit ID, a fingerprint,
 * a 16-bit length and the text, which starts with a type marker.
 * A NULL input requests the table heading (non-verbose mode only).
 */
static void
dump_ref_texts(u64 start UNUSED, void *tmp)
{
	struct fastbuf *f = tmp;
	uns i, l;
	struct fingerprint fp;

	if (!f)
	{
		if (!verbose)
			bprintf(output, "ID       FingerPrint              Text\n");
		return;
	}
	bprintf(output, "%08x ", bgetl(f));
	breadb(f, &fp, sizeof(fp));
	for (i=0; i<12; i++)
		bprintf(output, "%02x", fp.hash[i]);
	l = bgetw(f);
	/*
	 * The type marker is a part of the text.  If the stored length is shorter
	 * than the marker, plain subtraction would wrap around and make the loop
	 * below copy gigabytes of data.
	 */
	uns marker = dump_type_name(f, output, 1);
	l = (l > marker) ? l - marker : 0;
	while (l--) {
		int ch = bgetc(f);
		if (ch == EOF)			/* Truncated file: stop instead of printing EOF markers */
			break;
		bputc(output, ch);
	}
	bputc(output, '\n');
}

/*
 * Copy one line of a plain text index file to the output.
 * Called with NULL for the heading, which this file type does not have.
 */
static void
dump_ascii(u64 start UNUSED, void *tmp)
{
	struct fastbuf *f = tmp;
	byte line[BUFSIZE];

	if (!f)
		return;
	/* At the end of file bgets() stores nothing, so the buffer must not be printed.  */
	if (!bgets(f, line, BUFSIZE))
		return;
	bputs(output, line);
	bputc(output, '\n');
}

/*
 * Print one line of the URL list together with its ordinal number.
 * The numbering is kept across calls and restarted by the heading
 * request (NULL input).
 */
static void
dump_urls(u64 start UNUSED, void *tmp)
{
	struct fastbuf *f = tmp;
	byte line[BUFSIZE];
	static uns oid;

	if (!f) {
		oid = 0;
		return;
	}
	/* At the end of file bgets() stores nothing, so there is no URL to print or count.  */
	if (!bgets(f, line, BUFSIZE))
		return;
	bprintf(output, "%08x %s\n", oid, line);
	oid++;
}

/*
 * Dump one entry of the string index: a fingerprint, the size of the entry
 * in bytes and a sequence of reference chains filling that size.  Each chain
 * consists of a 32-bit object ID, a 16-bit count and `count' 16-bit references.
 * Called with NULL for the heading, which this file type does not have.
 */
static void
dump_string_index(u64 start, void *tmp)
{
	struct fastbuf *f = tmp;
	struct fingerprint fp;
	u32 size;
	uns i;

	if (!f)
		return;
	breadb(f, &fp, sizeof(struct fingerprint));
	breadb(f, &size, sizeof(u32));
	if (verbose)
	{
		bprintf(output, "String index entry at %08qx:\n", start);
		bprintf(output, "Finger:\t");
		for (i=0; i<12; i++)
			bprintf(output, "%02x", fp.hash[i]);
		bprintf(output, "\n");
		bprintf(output, "Size:\t%d\n", size);
	}
	else
	{
		bprintf(output, "S%8qx ", start);
		for (i=0; i<12; i++)
			bprintf(output, "%02x", fp.hash[i]);
		bprintf(output, " %8d\n", size);
	}
	while (size > 0)
	{
		u32 oid;
		u16 count;
		u64 estart = btell(f);
		oid = bgetl(f);
		count = bgetw(f);
		/*
		 * Size of this chain: object ID, count and the references.  The remaining
		 * size is unsigned, so a chain claiming more than what is left must end
		 * the entry instead of wrapping the counter around.
		 */
		u32 chain = sizeof(u32) + (count+1)*sizeof(u16);
		size = (chain < size) ? size - chain : 0;
		if (verbose)
		{
			bprintf(output, "OID:\t%08x\n", oid);
			bprintf(output, "Count:\t%d\n", count);
		}
		else
		{
			bprintf(output, "O%8qx %8x %5d  ", estart, oid, count);
		}
		for (i=0; i<count; i++)
		{
			u16 ref = bgetw(f);
			if (verbose)
				bprintf(output, "Ref:\t%04x\n", ref);
			else
				bprintf(output, "%04x ", ref);
		}
		if (!verbose)
			bprintf(output, "\n");
	}
	bprintf(output, "\n");
}

/*
 * Dump one entry of the string map: a fingerprint and a file offset.
 * A NULL input requests the table heading (non-verbose mode only).
 */
static void
dump_string_map(u64 start, void *tmp)
{
	struct fastbuf *f = tmp;
	struct fingerprint fp;

	if (!f)
	{
		if (!verbose)
			bprintf(output, "Pos      Fingerprint              RefPos\n");
		return;
	}

	breadb(f, &fp, sizeof(fp));
	sh_off_t ref_pos = bgeto(f);

	/* Leading part: position of the entry.  */
	if (verbose)
	{
		bprintf(output, "String map entry at %08qx:\n", start);
		bprintf(output, "Finger:\t");
	}
	else
		bprintf(output, "%8qx ", start);

	/* The fingerprint is printed identically in both modes.  */
	for (uns k = 0; k < 12; k++)
		bprintf(output, "%02x", fp.hash[k]);

	/* Trailing part: the offset.  */
	if (verbose)
	{
		bprintf(output, "\n");
		bprintf(output, "RefPos:\t%08qx\n\n", (long long) ref_pos);
	}
	else
		bprintf(output, " %08qx\n", (long long) ref_pos);
}

/*
 * Dump one entry of the word index: a word ID, the size of the entry in
 * bytes and a sequence of reference chains filling that size.  Each chain
 * consists of a 32-bit object ID, a 16-bit count and `count' 16-bit references.
 * Called with NULL for the heading, which this file type does not have.
 */
static void
dump_word_index(u64 start, void *tmp)
{
	struct fastbuf *f = tmp;
	u32 wordid, size;
	uns i;

	if (!f)
		return;
	breadb(f, &wordid, sizeof(u32));
	breadb(f, &size, sizeof(u32));
	if (verbose)
	{
		bprintf(output, "Word index entry at %08qx:\n", start);
		bprintf(output, "Word:\t%x\n", wordid);
		bprintf(output, "Size:\t%d\n", size);
	}
	else
	{
		bprintf(output, "S%8qx %8x %8d\n", start, wordid, size);
	}
	while (size > 0)
	{
		u32 oid;
		u16 count;
		u64 estart = btell(f);
		oid = bgetl(f);
		count = bgetw(f);
		/*
		 * Size of this chain: object ID, count and the references.  The remaining
		 * size is unsigned, so a chain claiming more than what is left must end
		 * the entry instead of wrapping the counter around.
		 */
		u32 chain = sizeof(u32) + (count+1)*sizeof(u16);
		size = (chain < size) ? size - chain : 0;
		if (verbose)
		{
			bprintf(output, "OID:\t%08x\n", oid);
			bprintf(output, "Count:\t%d\n", count);
		}
		else
		{
			bprintf(output, "O%8qx %8x %5d  ", estart, oid, count);
		}
		for (i=0; i<count; i++)
		{
			u16 ref = bgetw(f);
			if (verbose)
				bprintf(output, "Ref:\t%04x\n", ref);
			else
				bprintf(output, "%04x ", ref);
		}
		if (!verbose)
			bprintf(output, "\n");
	}
	bprintf(output, "\n");
}

/* Print the index parameters record; a NULL record (heading request) prints nothing.  */
static void
dump_params(u64 start UNUSED, void *tmp)
{
	struct index_params *p = tmp;

	if (p)
	{
		bprintf(output, "Index version:\t\t%08x\n", p->version);
		bprintf(output, "Reference time:\t\t%d\n", (uns) p->ref_time);

		/* Settings of the lexical analyser stored with the index.  */
		bprintf(output, "Lexicon config:\n");
		bprintf(output, "\tmin_len_ign\t%d\n", p->lex_config.min_len_ign);
		bprintf(output, "\tmin_len\t\t%d\n", p->lex_config.min_len);
		bprintf(output, "\tmax_len\t\t%d\n", p->lex_config.max_len);
		bprintf(output, "\tmax_hex_len\t%d\n", p->lex_config.max_hex_len);
		bprintf(output, "\tmax_ctrl_len\t%d\n", p->lex_config.max_ctrl_len);
		bprintf(output, "\tmax_gap\t\t%d\n", p->lex_config.max_gap);
		bprintf(output, "\tcontext_slots\t%d\n", p->lex_config.context_slots);

		bprintf(output, "Input objects:\t\t%d\n", p->objects_in);
	}
}

/*
 * Dump one temporary stem expansion table: stemmer ID, language mask and
 * pairs of 32-bit numbers up to the ~0U terminator.
 * Called with NULL for the heading, which this file type does not have.
 */
static void
dump_stems_temp(u64 start UNUSED, void *tmp)
{
	struct fastbuf *f = tmp;

	if (!f)
		return;

	uns stemmer = bgetl(f);
	u32 langmask = bgetl(f);
	bprintf(output, "# Stem expansion table for stemmer #%08x langmask %08x\n", stemmer, langmask);

	for (;;)
	{
		u32 first = bgetl(f);
		if (first == ~0U)
			break;
		u32 second = bgetl(f);
		bprintf(output, "%08x %08x\n", first, second);
	}
}

/*
 * Dump one stem expansion table: stemmer ID, language mask and 32-bit items
 * up to the ~0U terminator.  Items with the highest bit set are printed
 * at the left margin (without that bit), the other ones are indented.
 * Called with NULL for the heading, which this file type does not have.
 */
static void
dump_stems(u64 start UNUSED, void *tmp)
{
	struct fastbuf *f = tmp;

	if (!f)
		return;

	uns stemmer = bgetl(f);
	u32 langmask = bgetl(f);
	bprintf(output, "# Stem expansion table for stemmer #%08x langmask %08x\n", stemmer, langmask);

	for (;;)
	{
		u32 item = bgetl(f);
		if (item == ~0U)
			break;
		if (!(item & 0x80000000))
			bprintf(output, "\t%08x\n", item);
		else
			bprintf(output, "%08x\n", item & 0x7fffffff);
	}
}

/*
 * Dump one record of the file with feedback for the gatherer.
 *
 * id  - ordinal number of the record (only the low 32 bits are printed)
 * tmp - pointer to a struct feedback_gatherer, or NULL to request the heading
 */
static void
dump_feedback_gath(u64 id, void *tmp)
{
	struct feedback_gatherer *f = tmp;
	byte *attrs = "GOI*****";
	byte at[9];

	if (!f)
	{
		bprintf(output, "ID       Footprint                         CardID   Flags    Dyn\n");
		return;
	}
	/* One letter for every flag bit which is set, a dash for every clear one.  */
	for (uns i=0; i<8; i++)
		at[i] = (f->flags & (1<<i)) ? attrs[i] : '-';
	at[8] = 0;
	/* %x takes a 32-bit argument, so the 64-bit ID has to be narrowed explicitly.  */
	bprintf(output, "%08x ", (u32) id);
	/* The 16-byte footprint is printed as two hexadecimal halves separated by a colon.  */
	for (uns i=0; i<16; i++)
	{
		if (i == 8)
			bputc(output, ':');
		bprintf(output, "%02x", f->footprint[i]);
	}
	bprintf(output, " %08x %s %d\n", f->cardid, at, f->weight);
}

/*
 * Node of the hash table used by lookup_urls(): one node per URL looked up.
 * The URL is the key of the table and is stored at the end of the node.
 */
struct ule {
	uns pos;		/* Line number of the URL in the URL list, ~0U if not found (yet) */
	char x[1];		/* The URL itself; presumably the hash table allocates room for the whole string */
};

/* Instantiate the generic hash table as ule_init(), ule_find() and ule_new().  */
#define HASH_NODE struct ule
#define HASH_PREFIX(x) ule_##x
#define HASH_KEY_ENDSTRING x
#define HASH_WANT_FIND
#define HASH_WANT_NEW
#include "lib/hashtable.h"

/*
 * Find the URLs given on the command line in the URL list.
 *
 * b          - the URL list, one URL per line
 * argc, argv - the URLs to look up; they are canonicalized before the search
 *
 * For every argument one line is printed: the number of the first line of
 * the list containing the URL, or dashes if it is not there.  An empty
 * argument list or an invalid URL ends the program with exit code 1.
 */
static void
lookup_urls(struct fastbuf *b, int argc, char **argv)
{
	byte buf1[MAX_URL_SIZE], line[MAX_ATTR_SIZE];
	if (!argc)
	{
		fprintf(stderr, "Nothing to do.\n");
		exit(1);
	}
	ule_init();
	/*
	 * Count the distinct URLs only: a URL given twice is found just once,
	 * so counting the arguments would never let the scan below stop early.
	 */
	uns remains = 0;
	for (int i=0; i<argc; i++)
	{
		int err = url_auto_canonicalize(argv[i], buf1);
		if (err)
		{
			fprintf(stderr, "Invalid URL %s: %s\n", argv[i], url_error(err));
			exit(1);
		}
		if (!ule_find(buf1))
		{
			struct ule *e = ule_new(buf1);
			e->pos = ~0U;
			remains++;
		}
	}
	/* Scan the list and remember the first occurrence of every URL we search for.  */
	uns cnt = 0;
	while (bgets(b, line, sizeof(line)))
	{
		struct ule *e = ule_find(line);
		if (e && e->pos == ~0U)
		{
			e->pos = cnt;
			if (!--remains)
				break;
		}
		cnt++;
	}
	/* Report in the order of the arguments.  */
	for (int i=0; i<argc; i++)
	{
		/* This cannot fail, the same URL has been canonicalized successfully above.  */
		url_auto_canonicalize(argv[i], buf1);
		struct ule *e = ule_find(buf1);
		ASSERT(e);
		if (e->pos != ~0U)
			bprintf(output, "%08x %s\n", e->pos, e->x);
		else
			bprintf(output, "-------- %s\n", e->x);
	}
}

/* Description of one kind of index file which can be dumped.  */
struct index_file {
	int option;			/* Short option selecting the file */
	byte **default_filename;	/* Points to the variable holding the default name of the file */
	int record_size;
		/*
		 * record_size > 0:	uniform sequence with records of size record_size
		 * record_size == 0:	non-uniform sequence
		 * record_size < 0:	non-uniform sequence with records aligned to -record_size
		 */
	/*
	 * Dumps a single record.  For uniform sequences `id' is the number of the
	 * record and `in' a buffer holding it, otherwise `id' is the position in
	 * the file and `in' the input fastbuf.  A NULL `in' asks for the heading.
	 */
	void (*dump_single)(u64 id, void *in);
};

/*
 * All index files known to the dumper, terminated by an entry with a zero
 * option.  main() searches the table by the option character.
 */
static struct index_file index_files[] = {
	{ 'a', &fn_attributes, 		sizeof(struct card_attr),	dump_card_attr	},
	{ 'c', &fn_cards,		-(1 << CARD_POS_SHIFT),		dump_card	},
	{ 'd', &fn_card_attrs,		sizeof(struct card_attr),	dump_card_attr	},
	{ 'h', &fn_checksums,		sizeof(struct csum),		dump_checksum	},
	{ 'F', &fn_fingerprints,	sizeof(struct fprint),		dump_fingerprint},
	{ 'l', &fn_labels,		0,				dump_labels	},
	{ 'L', &fn_labels_by_id,	0,				dump_labels_id	},
	{ 'x', &fn_lexicon,		0,				dump_lexicon	},
	{ 'Q', &fn_lex_by_freq,		0,				dump_lex_temp	},
	{ 'R', &fn_lex_raw,		0,				dump_lex_temp	},
	{ 'T', &fn_lex_raw,		0,				dump_lex_stats	},
	{ 'O', &fn_lex_ordered,		0,				dump_lex_temp	},
	{ 'W', &fn_lex_words,		0,				dump_lex_words	},
	{ 'k', &fn_links,		sizeof(struct fprint),		dump_fingerprint},
	{ 'g', &fn_link_graph,		0,				dump_graph	},
	{ 'G', &fn_link_graph_index,	-BYTES_PER_O,			dump_graph_index},
	{ 'm', &fn_merges,		sizeof(u32),			dump_u32	},
	{ 'n', &fn_notes,		sizeof(struct card_note),	dump_note	},
	{ 'N', &fn_notes_new,		sizeof(struct card_note),	dump_note	},
	{ 'p', &fn_parameters,		sizeof(struct index_params),	dump_params	},
	{ 'P', &fn_card_prints,		sizeof(struct card_print),	dump_fingerprint},
	{ 'r', &fn_references,		0,				dump_refs	},
	{ 't', &fn_ref_texts,		0,				dump_ref_texts	},
	{ 's', &fn_sites,		0,				dump_ascii	},
	{ 'U', &fn_signatures,		0, /* set by dump_index_file() from matcher_signatures */	dump_signatures	},
	{ 'y', &fn_stems,		0,				dump_stems	},
	{ 'Y', &fn_stems_ordered,	0,				dump_stems_temp	},
	{ 'H', &fn_string_hash,		sizeof(u32),			dump_u32	},
	{ 'I', &fn_string_index,	0,				dump_string_index },
	{ 'M', &fn_string_map,		0,				dump_string_map },
	{ 'u', &fn_urls,		0,				dump_urls	},
	{ 'q', &fn_urls,		0,				NULL	 	},	/* kludge: served by lookup_urls(), never dumped record by record */
	{ 'w', &fn_word_index,		0,				dump_word_index },
	{ 'B', &fn_feedback_gath,	sizeof(struct feedback_gatherer),	dump_feedback_gath },
	{ 0,   NULL,			0,				NULL		}
};

/*
 * Command line options.  Every long option maps to the short option of the
 * same meaning; the second field of a long option tells whether it takes
 * an argument and has to agree with the colon in shortopts.
 */
static char *shortopts = CF_SHORT_OPTS "0abcdf:ghklmnpqrstuvwxyBFGHILMNOQPRTUWY";
static struct option longopts[] =
{
	CF_LONG_OPTS
	{ "raw",		0, 0, '0' },
	{ "bare",		0, 0, 'b' },
	{ "filename",		1, 0, 'f' },	/* Takes the file name, exactly as -f does */
	{ "verbose",		0, 0, 'v' },
	{ "attr",		0, 0, 'a' },
	{ "card",		0, 0, 'c' },
	{ "card-attr",		0, 0, 'd' },
	{ "checksum",		0, 0, 'h' },
	{ "finger",		0, 0, 'F' },
	{ "label",		0, 0, 'l' },
	{ "labels-id",		0, 0, 'L' },
	{ "lexicon",		0, 0, 'x' },
	{ "lexicon-by-freq",	0, 0, 'Q' },
	{ "lexicon-ordered",	0, 0, 'O' },
	{ "lexicon-raw",	0, 0, 'R' },
	{ "lexicon-stats",	0, 0, 'T' },
	{ "lexicon-words",	0, 0, 'W' },
	{ "link",		0, 0, 'k' },
	{ "link-graph",		0, 0, 'g' },
	{ "link-graph-index",	0, 0, 'G' },
	{ "merge",		0, 0, 'm' },
	{ "notes",		0, 0, 'n' },
	{ "notes-new",		0, 0, 'N' },
	{ "parameters",		0, 0, 'p' },
	{ "prints",		0, 0, 'P' },
	{ "reference",		0, 0, 'r' },
	{ "reference-texts",	0, 0, 't' },
	{ "site",		0, 0, 's' },
	{ "signature",		0, 0, 'U' },
	{ "stems",		0, 0, 'y' },
	{ "stems-ordered",	0, 0, 'Y' },
	{ "string-hash",	0, 0, 'H' },
	{ "string-index",	0, 0, 'I' },
	{ "string-map",		0, 0, 'M' },
	{ "url",		0, 0, 'u' },
	{ "url-lookup",		0, 0, 'q' },
	{ "word",		0, 0, 'w' },
	{ "feedback-gath",	0, 0, 'B' },
	{ NULL,			0, 0, 0 }
};

/* Usage message printed by usage(); the option names must match longopts.  */
static char *help = "\
Usage: idxdump [<options>] <index-file> [<id> | [<first-id>]-[<last-id>]]\n\
\n\
Options:\n"
CF_USAGE
"-b, --bare\t\tDon't print table heading\n\
-f, --filename <file>\tOverride default filename for given index file\n\
-v, --verbose\t\tSet verbose mode\n\
-0, --raw\t\tDump raw card after decompression\n\n\
Index files:\n\
-a, --attr\t\tAttributes\n\
-c, --card\t\tCards\n\
-d, --card-attr\t\tCard attributes\n\
-h, --checksum\t\tChecksums\n\
-B, --feedback-gath\tFeedback to the gatherer\n\
-F, --finger\t\tFingerprints\n\
-l, --label\t\tLabels\n\
-L, --labels-id\t\tLabels by id\n\
-x, --lexicon\t\tFinal lexicon\n\
-Q, --lexicon-by-freq\tLexicon sorted by frequency\n\
-O, --lexicon-ordered\tLexicon after lexorder\n\
-W, --lexicon-words\tLexicon after wsort\n\
-R, --lexicon-raw\tRaw lexicon\n\
-T, --lexicon-stats\tRaw lexicon statistics\n\
-k, --link\t\tLinks by url\n\
-g, --link-graph\tLink graph\n\
-G, --link-graph-index\tIndices to the link graph\n\
-m, --merge\t\tMerges\n\
-n, --notes\t\tNotes\n\
-N, --notes-new\t\tNotes for not downloaded documents\n\
-p, --parameters\tIndex parameters\n\
-P, --prints\t\tCard fingerprints\n\
-r, --reference\t\tReferences\n\
-t, --reference-texts\tReference texts\n\
-U, --signature\t\tSignatures\n\
-s, --site\t\tSites\n\
-y, --stems\t\tStem mappings\n\
-Y, --stems-ordered\tTemporary stem mappings from lexorder\n\
-H, --string-hash\tString hash\n\
-I, --string-index\tString index\n\
-M, --string-map\tString map\n\
-u, --url\t\tUrl list\n\
-q, --url-lookup\tFind given URL's in the URL list\n\
-w, --word\t\tWord index\n\
";

/* Print an optional error message followed by the usage help to stderr and exit with code 1.  */
static void NONRET
usage(byte *msg)
{
	if (msg)
		fprintf(stderr, "%s\n", msg);
	fprintf(stderr, "%s", help);
	exit(1);
}

/*
 * Convert a string of at most 16 hexadecimal digits to a 64-bit number.
 * An empty string yields zero; a longer string or any other character
 * ends the program by die().
 */
static u64
xtol64(byte *c)
{
	u64 value = 0;

	for (int digits = 1; *c; c++, digits++)
	{
		if (digits > 16)
			die("Number too long");
		if (!Cxdigit(*c))
			die("Invalid hexadecimal number");
		value = (value << 4) | Cxvalue(*c);
	}
	return value;
}

/*
 * Dump all records of an index file lying in the interval <start, stop>.
 *
 * f     - description of the file
 * b     - the file itself
 * start - first record to dump
 * stop  - last record to dump, ~0ULL means up to the end of the file
 *
 * For files with records of a fixed size the bounds are record numbers.
 * For aligned files they are multiples of the alignment, for the other
 * ones plain byte positions; in both cases every record starting inside
 * the interval is dumped.
 */
static void
dump_index_interval(struct index_file *f, struct fastbuf *b, u64 start, u64 stop)
{
	if (f->record_size > 0)
	{
		/* Fixed-size records: read them one by one until the interval or the file ends.  */
		byte *buf = alloca(f->record_size);
		bsetpos(b, f->record_size * start);
		while (start <= stop)
		{
			if (!breadb(b, buf, f->record_size))
				break;
			f->dump_single(start, buf);
			start++;
		}
		return;
	}

	if (f->record_size < 0)
	{
		/* Aligned records: convert the bounds to byte positions.  */
		start *= -f->record_size;
		if (stop != ~0ULL)
			stop *= -f->record_size;
	}
	/*
	 * Never go beyond the end of the file, not even when the user has asked
	 * for it: the dumpers do not detect the end of file and the position
	 * would stop advancing there, so the loop below would never finish.
	 */
	bseek(b, 0, SEEK_END);
	u64 size = btell(b);
	if (!size)
		return;
	if (stop == ~0ULL || stop > size - 1)
		stop = size - 1;
	if (start > stop)
		return;
	bsetpos(b, start);
	/* Every dumper consumes exactly one record, so the position tells where the next one starts.  */
	while ((u64) (start = btell(b)) <= stop)
		f->dump_single(start, b);
}

/*
 * Dump the selected index file.
 *
 * f          - description of the file
 * b          - the file itself
 * argc, argv - the remaining command line arguments: hexadecimal record IDs
 *              or ranges "[<first>]-[<last>]"; the whole file is dumped when
 *              there are none.  Ranges are split in place, so argv is modified.
 */
static void
dump_index_file(struct index_file *f, struct fastbuf *b, int argc, char **argv)
{
	switch (f->option)
	{
		case 'U':
			/* The file dumped is just a temporary file of the process of
			 * building the index, hence we ignore the contingency that
			 * matcher_signatures could have changed.  The user can modify
			 * its value by setting -Smatcher.signatures anyway.  */
			f->record_size = sizeof(uns) + matcher_signatures * sizeof(u32);
			break;
		case 'q':
			/* The URL lookup function has non-standard arguments */
			lookup_urls(b, argc, argv);
			return;
		default:
			break;
	}

	/* A NULL record asks the dumper for its table heading.  */
	if (!bare)
		f->dump_single(0, NULL);

	if (!argc)
	{
		dump_index_interval(f, b, 0, ~0ULL);
		return;
	}

	for (int n = 0; n < argc; n++)
	{
		u64 first = 0;
		u64 last = ~0ULL;
		byte *dash = strchr(argv[n], '-');

		if (!dash)
			first = last = xtol64(argv[n]);
		else
		{
			/* Split the range at the dash; a missing bound keeps its default.  */
			*dash++ = 0;
			if (*dash)
				last = xtol64(dash);
			if (*argv[n])
				first = xtol64(argv[n]);
		}
		if (first > last)
			usage("Invalid ID range");
		dump_index_interval(f, b, first, last);
	}
}

int
main(int argc, char **argv)
{
	byte *force_filename = NULL;
	struct index_file *f = NULL;
	int opt, i;
	struct fastbuf *b, *output_stdout;

	log_init(argv[0]);
	while ((opt = cf_getopt(argc, argv, shortopts, longopts, NULL)) >= 0)
		switch (opt)
		{
			case 'f':
				force_filename = optarg;
				break;
			case 'v':
				verbose++;
				break;
			case 'b':
				bare++;
				break;
			case '0':
				raw++;
				break;
			default:
				for (i=0; ; i++)
					if (opt == index_files[i].option)
					{
						if (f)
							usage("More index files specified");
						f = &index_files[i];
						break;
					}
					else if (!index_files[i].option)
						usage("Invalid option");
		}
	if (!f)
		usage("No index file specified");
	if (!force_filename)
		force_filename = index_name(*f->default_filename);
	term_charset_id = find_charset_by_name(terminal_charset);
	if (term_charset_id < 0)
		die("Unknown terminal charset %s", terminal_charset);
	output_stdout = output = bfdopen(1, 4096);
	if (f->option != 'c')	// handled by objdump
		output = fb_wrap_charconv_out(output, CONV_CHARSET_UTF8, term_charset_id);
	else
		liz_buf = lizard_alloc();
	b = bopen(force_filename, O_RDONLY, 1<<20);
	dump_index_file(f, b, argc - optind, argv + optind);
	if (liz_buf)
		lizard_free(liz_buf);
	bclose(b);
	if(output != output_stdout)
		bclose(output);
	bclose(output_stdout);
	return 0;
}
