/*
 *	Sherlock Indexer
 *
 *	(c) 2001--2006 Martin Mares <mj@ucw.cz>
 *	(c) 2002--2005 Robert Spalek <robert@ucw.cz>
 */

#include "sherlock/index.h"
#include "lib/clists.h"

/* iconfig.c */

/*
 * File names of the individual index files (declared here, defined elsewhere;
 * this part of the header is labelled iconfig.c).
 * NOTE(review): presumably these are the values passed as `file' to
 * index_name() and index_bopen() -- confirm against callers.
 */
extern byte *fn_directory;
extern byte *fn_source;
extern byte *fn_incremental;
extern byte *fn_fingerprints, *fn_labels_by_id, *fn_attributes, *fn_checksums;
extern byte *fn_links, *fn_urls, *fn_link_graph, *fn_link_graph_index, *fn_sites, *fn_labels, *fn_merges, *fn_signatures, *fn_matches;
extern byte *fn_word_index, *fn_string_index, *fn_references, *fn_string_map, *fn_card_prints;
extern byte *fn_string_hash, *fn_cards, *fn_card_attrs, *fn_parameters, *fn_ref_texts;
extern byte *fn_lexicon, *fn_lex_raw, *fn_lex_ordered, *fn_lex_words, *fn_lex_by_freq;
extern byte *fn_stems, *fn_stems_ordered, *fn_lex_classes, *fn_notes, *fn_notes_new, *fn_keywords, *fn_feedback_gath;
extern byte *fn_blacklist;
extern uns default_weight;	/* NOTE(review): not a file name; its meaning is not visible in this header */

/*
 * index_name() and index_name_defined() are implemented elsewhere.
 * NOTE(review): judging by their use below, index_name() turns `file' into
 * the name handed to bopen() and index_name_defined() tells whether such
 * a name is available -- confirm against the implementation.
 */
byte *index_name(byte *file);
int index_name_defined(byte *file);
/* Open the index file `file' with bopen(), passing indexer_fb_size as its third argument */
#define index_bopen(file, flags) bopen(index_name(file), flags, indexer_fb_size)
/*
 * Like index_bopen(), but yields NULL when index_name_defined(file) is zero.
 * Beware: `file' is evaluated twice, so it must not have side effects.
 */
#define index_maybe_bopen(file, flags) (index_name_defined(file) ? index_bopen(file, flags) : NULL)

/*
 * Miscellaneous global settings, declared here and defined elsewhere.
 * Uses visible in this header: indexer_fb_size is passed to bopen() by
 * index_bopen(), progress / progress_screen / progress_status_line drive
 * PROGRESS() and indexer_trace is the threshold tested by ITRACEN().
 */
extern struct attr_set label_attr_set, link_attr_set, ref_link_attr_set;
extern struct attr_set override_label_attr_set, override_body_attr_set, card_attr_set;
extern uns string_avg_bucket, indexer_fb_size, sort_delete_src, max_degree;
extern uns progress, progress_screen, progress_status_line;
extern uns ref_max_length, ref_min_length, ref_max_count;
extern uns matcher_signatures, matcher_context, matcher_min_words, matcher_threshold, matcher_passes, matcher_block;
extern uns max_num_objects, min_summed_size, frameset_to_redir;
extern uns raw_stage2_input;
extern uns indexer_trace;

/* Filters */
extern byte *indexer_filter_name;	/* NOTE(review): presumably the name of the indexer filter to load -- confirm */

/*
 * PROGRESS(i, msg, args...) -- report progress of a long-running loop.
 *
 * Does nothing unless `progress' is non-zero and `i' is a multiple of it.
 * Otherwise the printf-style message is passed to setproctitle() when
 * progress_status_line is set, and printed to stdout followed by "\r"
 * and flushed when progress_screen is set.
 *
 * `msg' must be a string literal, because it is concatenated with "\r".
 * The `##' in front of args makes the preceding comma vanish when no extra
 * arguments are given, so a plain PROGRESS(i, "text") compiles, consistently
 * with ITRACEN().
 */
#define PROGRESS(i, msg, args...) do { if (progress && !((i) % progress)) { \
	if (progress_status_line) setproctitle(msg, ##args); \
	if (progress_screen) { printf(msg "\r", ##args); fflush(stdout); } } } while (0)

/*
 * ITRACEN(level, msg, args...) -- log a printf-style message with L_DEBUG
 * when indexer_trace is at least `level'.
 * ITRACE(msg, args...) is a shorthand for level 1.
 *
 * `level' is parenthesized, so any expression (e.g. a conditional one)
 * can be passed without being misparsed against the >= comparison.
 */
#define ITRACEN(level, msg, args...) do { if (indexer_trace >= (level)) log(L_DEBUG, msg,##args); } while(0)
#define ITRACE(msg, args...) ITRACEN(1, msg,##args)

/* Subindices */

#define HARD_MAX_SUBINDICES 4		/* Need to update INDEX_ID_SHIFT in chewer.c when increasing this */

/*
 * Description of a single subindex.
 * NOTE(review): the exact meaning of the fields is defined by code outside
 * this header; the remarks below are limited to what is visible here.
 */
struct subindex {
  struct cnode n;			/* List node; presumably links the entry into `subindices' -- confirm */
  byte *name;
  uns type_mask;
  uns id_mask;				/* Refers to the IDs computed by get_subindexing_id() */
};

extern struct clist subindices;

/* getbuck.c */

struct mempool;
struct buck2obj_buf;
/*
 * A source of buckets driven through the get_next() and cleanup() callbacks.
 * The oid, type and o fields describe the current bucket.
 */
struct bucket_source {
  u32 oid;				/* oid of the current bucket */
  u32 type;				/* type of the current bucket */
  struct odes *o;			/* parsed content of the current bucket */
  /* NOTE(review): the meaning of get_next()'s return value is not visible here -- confirm in getbuck.c */
  int (*get_next)(struct bucket_source *src, struct mempool *mp, u32 oid); /* Get bucket with the given oid or the next one if oid == ~0U */
  void (*cleanup)(struct bucket_source *src);
  uns progress_current, progress_max;
  struct buck2obj_buf *buck_buf;
  struct fastbuf *in_file;
};

/* Returns a bucket source; NOTE(review): presumably to be released via its cleanup() callback -- confirm */
struct bucket_source *get_buck_init(void);

/* Structure of files */

/*
 * Record pairing a 16-byte md5 value with a card ID.
 * NOTE(review): presumably the record format of the fn_checksums file -- confirm.
 */
struct csum {
  byte md5[16];
  u32 cardid;
};

#include "indexer/sites.h"

/*
 * Per-card note. bring_note_from() addresses these records as an array
 * of fixed-size elements indexed by card ID, so the layout must stay the
 * same for every reader and writer of the mapped data.
 */
struct card_note {
  u32 useful_size;			/* Useful size (number of alnum characters) */
  area_t area;				/* We need the area for non-downloaded entries as well */
  s16 card_bonus;			/* Bonus assigned by the filter */
  /* These fields track how the weight of the card evolved */
  byte weight_scanner;			/* Weight assigned by the scanner */
  byte weight_merged;			/* Weight after card merging (includes merger penalties) */
  byte flags;				/* CARD_NOTE_xxx */
  site_hash_t site_hash;		/* Hash of the site name */
  byte footprint[16];			/* If all zero, get_subindexing_id() falls back to the card ID */
};

/* Bit flags kept in card_note.flags; each constant occupies one bit */
enum card_note_flag {
  CARD_NOTE_GIANT	= 1 << 0,	/* Member of a very large class, so penalties apply */
  CARD_NOTE_HAS_LINKS	= 1 << 1,	/* Has links, including the unknown ones */
  CARD_NOTE_IS_LINKED	= 1 << 2,
  CARD_NOTE_IMAGE	= 1 << 3,	/* Image object [set by scanner] */
};

/*
 * Compute the subindexing ID (a value in the range 0..15) of a card.
 *
 * If the footprint in `note' contains at least one non-zero byte, the ID is
 * the low 4 bits of the first footprint byte. If the footprint is all zero,
 * the low 4 bits of `card_id' are used instead.
 *
 * card_id is not marked UNUSED: it is needed by the fallback path.
 */
static inline uns			/* id_mask works according to these IDs */
get_subindexing_id(uns card_id, struct card_note *note)
{
  /* Scan the whole footprint: only an all-zero one selects the fallback */
  for (uns i=0; i<sizeof(note->footprint); i++)
    if (note->footprint[i])
      return note->footprint[0] & 0x0f;
  return card_id & 0x0f;
}

typedef float rank_t;			/* NOTE(review): presumably the type of rank values; its users are not visible in this header */

/* Labels */

/*
 * NOTE(review): `flags' presumably holds the LABEL_TYPE_xxx / LABEL_FLAG_xxx
 * bits and `count' the size of the block following the header -- confirm.
 * The structure is declared PACKED.
 */
struct lab {				/* Header of a label block */
  u32 merged_id, url_id, redir_id;
  u32 count;
  byte flags;
} PACKED;

/* Label bits: the TYPE bits say where a label is attached, the FLAG bits how it is applied */
#define LABEL_TYPE_BODY		(1 << 0)	/* Attached to the card body (ignored when the card is empty or a duplicate) */
#define LABEL_TYPE_URL		(1 << 1)	/* Attached to a per-URL block */
#define LABEL_FLAG_MERGED_ONLY	(1 << 2)	/* Ignored unless the card is merged */
#define LABEL_FLAG_OVERRIDE	(1 << 3)	/* Replaces the card's attribute of the same name instead of being appended */

/* access.c -- helper functions for access to indexer data structures */

#include "lib/partmap.h"

extern uns card_count;			/* Number of cards in the index */
extern uns new_count;			/* Number of new URLs in the index */

void set_card_count(uns cc);
void set_new_count(uns nc);

/* Partial maps used by bring_note() and bring_new_note() respectively */
extern struct partmap *notes_partmap, *notes_new_partmap;

/* NOTE(review): `rw' presumably selects a writable mapping when non-zero -- confirm in access.c */
void notes_part_map(uns rw);
void notes_part_unmap(void);
void notes_new_part_map(uns rw);
void notes_new_part_unmap(void);

/*
 * Return a pointer to the note of the given card inside `map'.
 * The notes are addressed as an array of struct card_note indexed by card,
 * so the byte offset is the card number times the record size.
 */
static inline struct card_note *
bring_note_from(struct partmap *map, oid_t card)
{
  /* The cast widens the card number before the multiplication */
  sh_off_t note_offset = sizeof(struct card_note) * (sh_off_t)card;
  struct card_note *note = partmap_map(map, note_offset, sizeof(struct card_note));
  return note;
}

/* Return the note of the given card, looked up through notes_partmap */
static inline struct card_note *
bring_note(oid_t card)
{
  struct partmap *map = notes_partmap;
  return bring_note_from(map, card);
}

/* Return the note of the given card, looked up through notes_new_partmap */
static inline struct card_note *
bring_new_note(oid_t card)
{
  struct partmap *map = notes_new_partmap;
  return bring_note_from(map, card);
}

/*
 * Card attributes. bring_attr() reads them through attrs_partmap.
 * NOTE(review): presumably attrs_map() maps the whole array to `attrs' while
 * attrs_part_map() sets up attrs_partmap, with `rw' selecting a writable
 * mapping -- confirm in access.c.
 */
extern struct card_attr *attrs;
extern struct partmap *attrs_partmap;

void attrs_map(uns rw);
void attrs_unmap(void);
void attrs_part_map(uns rw);
void attrs_part_unmap(void);

/*
 * Return a pointer to the attributes of the given card inside attrs_partmap.
 * The attributes are addressed as an array of struct card_attr indexed by card.
 */
static inline struct card_attr *
bring_attr(oid_t card)
{
  /* The cast widens the card number before the multiplication */
  sh_off_t attr_offset = sizeof(struct card_attr) * (sh_off_t)card;
  struct card_attr *attr = partmap_map(attrs_partmap, attr_offset, sizeof(struct card_attr));
  return attr;
}

/*
 * READ_ATTR(var, field) -- allocate an array of card_count elements with
 * xmalloc(), store it to `var' and fill element i with the `field' member
 * of card i's attributes (obtained by bring_attr()).
 *
 * The element type of `var' must have the same size as the field, which is
 * checked by the ASSERT. Releasing the array is up to the caller.
 *
 * `var' is parenthesized on every use, so that expressions like *ptr work,
 * and the loop index has a name unlikely to clash with identifiers used
 * in the arguments.
 */
#define READ_ATTR(var, field) do {						\
	ASSERT(sizeof(*(var)) == sizeof(((struct card_attr *)0)->field));	\
	(var) = xmalloc(card_count * sizeof(*(var)));				\
	for (uns _attr_i=0; _attr_i<card_count; _attr_i++)			\
	  (var)[_attr_i] = bring_attr(_attr_i)->field;				\
	} while(0)

/*
 * READ_ATTR_BIT(var, field, mask) -- allocate a bit array for card_count
 * bits with xmalloc(), store it to `var', clear it and set bit i whenever
 * the `field' member of card i's attributes has any of the `mask' bits set.
 * Releasing the array is up to the caller.
 *
 * The loop index has a name unlikely to clash with identifiers used in the
 * arguments (a `mask' expression mentioning a caller's variable `i' used to
 * be captured by the loop), and `var' is parenthesized on every use.
 */
#define READ_ATTR_BIT(var, field, mask) do {					\
	(var) = xmalloc(BIT_ARRAY_BYTES(card_count));				\
	bit_array_zero((var), card_count);					\
	for (uns _attr_i=0; _attr_i<card_count; _attr_i++)			\
	  if (bring_attr(_attr_i)->field & (mask))				\
	    bit_array_set((var), _attr_i);					\
	} while(0)

/*
 * WRITE_ATTR(var, field) -- store element i of the array `var' to the
 * `field' member of card i's attributes (obtained by bring_attr()),
 * for all card_count cards.
 *
 * `var' is parenthesized, so that expressions like *ptr index correctly,
 * and the loop index has a name unlikely to clash with identifiers used
 * in the arguments.
 */
#define WRITE_ATTR(var, field) do { 						\
	for (uns _attr_i=0; _attr_i<card_count; _attr_i++)			\
	  bring_attr(_attr_i)->field = (var)[_attr_i];				\
	} while(0)

struct index_params;
/* NOTE(review): presumably these read and write the fn_parameters file -- confirm in access.c */
void params_load(struct index_params *params);
void params_save(struct index_params *params);

/* fetch.c */

extern uns fetch_id;
/*
 * Fetch cards and hand them to the got_card() callback together with their
 * attributes and notes.
 * NOTE(review): presumably the callback is invoked once per card and
 * fetch_id identifies the card currently being processed -- confirm in fetch.c.
 */
void fetch_cards(void (*got_card)(struct card_attr *attr, struct odes *obj, struct card_note *note));

/* resolve.c */

/* NOTE(review): presumably one record of the resolver's output, a (source, destination) pair -- confirm in resolve.c */
struct resolve_output {
  u32 src;
  u32 dest;
};

/* Bit flags; NOTE(review): presumably meant for the `flags' argument of the resolve_*() functions below -- confirm */
#define RESOLVE_SKIP_UNKNOWN	1
#define RESOLVE_SKIP_NEW	2

sh_off_t resolve_optimize_run_length(struct fastbuf *in);
struct fastbuf *resolve_fastbuf(struct fastbuf *in, uns flags, uns add_size);
struct fastbuf *resolve_fingerprints(struct fastbuf *in, uns flags, uns mask);

#define	FIRST_ID_NEW	0x20000000	/* Not downloaded documents are numbered from this ID (no interference with ETYPE_*) */

/* feedback-gath.c */

/* NOTE(review): presumably the record format of the fn_feedback_gath file -- confirm in feedback-gath.c */
struct feedback_gatherer {
  byte footprint[16];
  uns cardid;
  byte flags;				/* the same as card_note.flags */
  byte weight;
};

/* Graph edge types */

/*
 * The edge type occupies the two topmost bits of a 32-bit value, as
 * selected by ETYPE_MASK.
 * NOTE(review): presumably the remaining 30 bits carry a document ID
 * (cf. the remark at FIRST_ID_NEW) -- confirm.
 */
#define ETYPE_NORMAL 0
#define ETYPE_REDIRECT 0x40000000
#define ETYPE_FRAME 0x80000000
#define ETYPE_IMAGE 0xc0000000
#define ETYPE_MASK 0xc0000000

/* Incremental scanner */

/*
 * Note: type_flags exists only when CONFIG_LANG is defined, so the size
 * of this structure depends on the build configuration.
 */
struct scanner_note {
  byte footprint[16];
#ifdef CONFIG_LANG
  byte type_flags;			/* type flags, including detected language */
#endif
};
