/*
 *	Sherlock Search Engine
 *
 *	(c) 1997--2005 Martin Mares <mj@ucw.cz>
 */

#include "lib/clists.h"
#include "sherlock/index.h"

#define PROFILE_TOD
#include "lib/profile.h"

/* compile-time parameters */

/* Capacity of the word[], relpos[] and next_same_word[] arrays in struct phrase */
#define MAX_PHRASE_LEN 8	/* Must be a power of two */
/* Capacity of the word_to_idx[] map in struct phrase */
#define HARD_MAX_WORDS 32	/* Limited to 32 by refs.c */
/* Number of best[] positions kept in each struct result_note */
#define HARD_MAX_NOTES 6
#define HARD_MAX_SYNONYMA 63	/* Limited to 63 by bits in syn_expand */

/* config.c */

/*
 *  Configuration variables. They are only declared here; per the section
 *  header they belong to config.c. `databases' and `spell_common_pairs'
 *  point to structures that carry a `next' link (see struct database and
 *  struct spell_pair below).
 */
extern byte *log_name, *status_name;
extern uns log_incoming, log_rejected, log_requests, log_replies;
extern uns port, listen_queue, connection_timeout, hydra_processes;
extern byte *control_password;
extern struct database *databases;
extern struct spell_pair *spell_common_pairs;
extern uns num_matches, cache_size, max_output_matches;
extern uns max_words, max_word_matches, max_phrases, max_nears, max_bools;
extern uns global_accent_mode, max_wildcard_zone, min_wildcard_prefix_len;
extern uns global_context_chars, global_title_chars, global_intervals, global_site_max;
extern uns global_url_max, global_redirect_url_max, global_morphing, global_spelling, global_synonyming;
extern uns doc_weight_scale, word_weight_scale, word_bonus;
extern uns global_allow_approx, prox_limit, prox_penalty;
extern uns global_partial_answers, default_word_types, global_debug, global_sorting, global_sort_reverse;
extern uns mem_map_zone_size, mem_map_elide_gaps, mem_map_prefetch;
extern uns query_watchdog, second_best_reduce;
extern uns magic_complexes, magic_merge_words, magic_merge_classes, magic_near, magic_merge_bonus;
extern uns near_bonus_word, near_penalty_gap, near_bonus_connect;
extern uns blind_match_penalty, misaccent_penalty, stem_penalty, morph_penalty, synonymum_penalty;
extern uns spell_good_freq, spell_min_len, spell_margin, spell_dwarf, spell_dwarf_margin, global_syn_expand, spell_common_penalty;
extern uns spell_add_penalty, spell_del_penalty, spell_mod_penalty, spell_xpos_penalty, spell_accent_penalty;
extern struct ipaccess_list *access_list;

/*
 *  State kept for one database. Instances are chained through `next';
 *  the global `databases' pointer is declared above.
 *  NOTE(review): the fn_* members look like file names and the fd_* members
 *  like the matching open descriptors -- confirm against dbase.c.
 */
struct database {
  struct database *next;		/* Next database in the chain */
  byte *name;
  int is_optional;

  /* Parameters */
  byte *fn_params;
  struct index_params *params;

  /* Cards */
  oid_t num_ids;
  byte *fn_cards;
  byte *fn_card_attrs;
  byte *fn_references;
  int fd_cards, fd_refs;
  struct card_attr *card_attrs, *card_attrs_end;
  sh_off_t card_file_size, ref_file_size;

  /* Words */
  uns word_weights[8], meta_weights[16];
  byte *fn_lexicon, *fn_stems;
  uns lexicon_words, lexicon_complexes, lexicon_file_size;
  struct lex_entry **lex_array;
  uns lex_by_len[MAX_WORD_LEN+2];
  struct lex_entry ***cplx_array;
  uns stems_file_size;
  clist stem_block_list, syn_block_list;

  /* Strings */
  uns string_weights[8];
  byte *fn_string_map;
  byte *fn_string_hash;
  u32 *string_hash;
  uns string_buckets, string_hash_order, string_count;
  int fd_string_map;
  uns string_hash_file_size, string_map_file_size;

  /* Fingerprints */
  byte *fn_card_prints;
  struct fastbuf *fb_card_prints;
};

/*
 *  One node of a singly linked list of (x, y) pairs; the list head
 *  `spell_common_pairs' is declared with the configuration variables.
 *  NOTE(review): the meaning of x and y is not visible here -- see the
 *  spelling checker for details.
 */
struct spell_pair {
  struct spell_pair *next;		/* Next pair in the list */
  uns x, y;
};

/* reply.c */

/*
 *  A single reply string, linked into a struct replybuf.
 *  NOTE(review): text[1] looks like the pre-C99 idiom for a variable-length
 *  trailing array holding `len' bytes -- confirm against reply.c before
 *  relying on sizeof(struct reply).
 */
struct reply {
  struct reply *next;			/* Next reply in the buffer */
  uns len;
  byte text[1];
};

/*
 *  A list of struct reply entries together with the memory pool associated
 *  with it. `last' is a pointer to a `struct reply *' slot.
 *  NOTE(review): presumably `last' addresses the tail's next pointer so that
 *  appending is O(1) -- confirm against reply.c.
 */
struct replybuf {
  struct reply *first, **last;
  struct mempool *pool;
};

/* Forward declaration; the full definition of struct query follows later in this file */
struct query;

/*
 *  Reply buffer interface. The functions carrying a format(printf,...)
 *  attribute take a printf-style format string followed by its arguments;
 *  the attribute lets the compiler check the arguments against the format.
 */
void init_reply_buf(struct replybuf *, struct mempool *);
void add_reply(struct replybuf *, char *, ...) __attribute__((format(printf,2,3)));
void ship_reply_buf(struct query *, struct replybuf *);
void flush_reply_buf(struct query *, struct replybuf *);
struct reply *first_reply_last(struct replybuf *);

/*
 *  NOTE(review): judging by the names, add_qr/add_qerr append to query_rbuf
 *  and add_cr/add_cerr to cache_rbuf -- confirm against reply.c.
 */
extern struct replybuf *query_rbuf, *cache_rbuf;
void add_qr(char *, ...) __attribute__((format(printf,1,2)));
void add_qerr(char *, ...) __attribute__((format(printf,1,2)));
void add_cr(char *, ...) __attribute__((format(printf,1,2)));
void add_cerr(char *, ...) __attribute__((format(printf,1,2)));
void reply_f(char *, ...) __attribute__((format(printf,1,2)));

/* lex.c */

/*
 *  Lexical analyser entry points.
 *  NOTE(review): lex_init() presumably receives the query text to be scanned
 *  and yylex() is the token source for the grammar in parse.y -- confirm
 *  against lex.c.
 */
void lex_init(byte *);
int yylex(void);
int lookup_custom_attr(byte *);

/* parse.y */

/* Node types of the query expression tree (see struct expr) */
enum expr_type {
  /*
   *  Many of the types have limited occurrence:
   *	P = parsing and preprocessing
   *	I = input of analyse_query()
   *	O = output of analyse_query()
   *	A = internal to analyse_query()
   */
  EX_RESERVED,				/* ...  Undefined node type */
  EX_MATCH,				/* PI.  Match word or phrase */
  EX_OPTIONS,				/* P..  Set option defaults */
  EX_AND,				/* PIO  Boolean operators */
  EX_OR,
  EX_NOT,
  EX_ANY,				/* PIO  Logical 1 */
  EX_NONE,				/* PIO  Logical 0 */
  EX_IGNORE,				/* .A.  Non-indexed word */
  EX_REF_WORD,				/* ..O  Match word with specified index */
  EX_REF_PHRASE				/* ..O  Match phrase with specified index */
};

/*
 *  Word matching options. Used as per-query defaults (struct query),
 *  inside EX_MATCH and EX_OPTIONS nodes (struct expr) and per word
 *  (struct word).
 *  NOTE(review): the sbyte members presumably hold OPT_DEFAULT and `weight'
 *  holds WEIGHT_DEFAULT while unset -- confirm against merge_options().
 */
struct options {
  int weight;
  sbyte accent_mode;
  sbyte morphing;
  sbyte spelling;
  sbyte synonyming;
  u64 syn_expand;			/* Bit field; its width limits HARD_MAX_SYNONYMA */
};

#define OPT_DEFAULT -1
#define WEIGHT_DEFAULT 65535		/* unset */

/*
 *  A node of the query expression tree. `type' selects the node kind and
 *  the union carries the kind-specific payload.
 *  NOTE(review): presumably u.op serves EX_AND/EX_OR/EX_NOT, u.match serves
 *  EX_MATCH, u.options serves EX_OPTIONS and u.ref serves EX_REF_WORD /
 *  EX_REF_PHRASE -- confirm against parse.y.
 */
struct expr {
  enum expr_type type;
  union {
    struct {
      struct expr *l, *r;		/* Left and right operand */
    } op;
    struct {
      struct options o;
      byte *word;
      uns classmap;
      byte is_string;
      sbyte sense;			/* Matching sense: 1=YES, -1=NOT, 0=MAYBE */
      struct expr *next_simple;		/* Next in simple search chain */
    } match;
    struct {
      struct expr *inside;		/* Sub-expression the options belong to */
      struct options o;
    } options;
    struct {
      uns index;
    } ref;
  } u;
};

/*
 *  One element of a linked list describing a set of integers; each element
 *  carries a `min'/`max' pair and a `text' pointer.
 *  NOTE(review): whether min..max is inclusive and when `text' is used
 *  instead of the numeric bounds is not visible here -- check the parser.
 */
struct val_set {			/* Syntactic representation of integer sets */
  struct val_set *next;
  u32 min, max;
  byte *text;
};

/* Accent handling modes */
#define ACCENT_AUTO 0			/* magic auto mode */
#define ACCENT_STRIP 1			/* strip all accents before comparing */
#define ACCENT_STRICT 2			/* compare with accents */
#define ACCENT_AUTO_LOCAL 3		/* magic auto mode done per word */

/*
 *  Parser interface. err() is declared NONRET, i.e. it does not return to
 *  its caller.
 *  NOTE(review): the meaning of the string returned by parse_query() and the
 *  precedence merge_options() applies between `old' and `new' are not
 *  visible here -- check parse.y.
 */
void err(byte *) NONRET;
byte *parse_query(byte *);
struct expr *new_node(enum expr_type t);
struct expr *new_op(enum expr_type t, struct expr *l, struct expr *r);
void merge_options(struct options *dest, struct options *old, struct options *new);

/* sherlockd.c */

/* Size in bytes of the per-query I/O buffer (struct query.iobuf) */
#define IOBUF_SIZE 4096

/*
 *  Everything known about one query/connection: the parsed request,
 *  display settings, the reply header, connection state and the working
 *  data of query evaluation.
 */
struct query {
  cnode n;
  struct mempool *pool;			/* Memory pool for _query_ data (use results->pool for results to be cached) */

  /* Query parameters */
  u32 db_mask;				/* Database mask */
  struct options default_options;	/* Default word options */
  uns site_only;			/* Only this site */
  uns site_max;				/* Number of documents per site */
  uns allow_approx;			/* Allow approximation */
  uns partial_answers;			/* Allow partial answers */
  struct expr *expr;			/* Query expression (NULL if it's a command) */
  byte *cmd;				/* Command */

  /*
   *  EXTENDED_ATTRS is an X-macro list defined elsewhere. With the helper
   *  macros below, every INT_ATTR entry becomes a pair of u32 fields
   *  <id>_min/<id>_max and every SMALL_SET_ATTR entry a single u32 field
   *  <id>_set; the LATE_ variants expand the same way. The helpers are
   *  undefined again right after the expansion.
   */
#define INT_ATTR(id,keywd,gf,pf) u32 id##_min, id##_max;
#define SMALL_SET_ATTR(id,keywd,gf,pf) u32 id##_set;
#define LATE_INT_ATTR INT_ATTR
#define LATE_SMALL_SET_ATTR SMALL_SET_ATTR
  EXTENDED_ATTRS			/* Extended attributes */
#undef INT_ATTR
#undef SMALL_SET_ATTR
#undef LATE_INT_ATTR
#undef LATE_SMALL_SET_ATTR

  CUSTOM_MATCH_VARS			/* Include what custom matchers need */

  int custom_sorting;			/* Sort on custom attribute */
  u32 custom_sort_reverse;		/* ~0 if sorting reverse, else 0 */
  int custom_sort_only;			/* should Q be ignored? */
  uns age_raw_min, age_raw_max;		/* Limits on document age */
  u32 explain_id;			/* Which object we'd like to explain for [CONFIG_EXPLAIN] */

  /* Display parameters */
  struct val_set *range;		/* Range of results to display */
  uns list_only;			/* Show only a list of results, don't show full cards */
  uns debug;				/* Debug flags */
  uns context_chars;			/* Number of context chars to print */
  uns title_chars;			/* Number of title chars to print */
  uns intervals;			/* Number of intervals to print */
  uns url_max;				/* Maximum # of URL's to print */

  /* Reply */
  struct replybuf rbuf;			/* Reply header */

  /* Connection */
  sh_time_t established;		/* Time this connection was established */
  int fd;				/* Socket file descriptor */
  int fd_err;				/* Set in case an error has occurred */
  int q_status;				/* Error code returned */
  byte ipaddr[16];			/* IP address of the other end */
  byte iobuf[IOBUF_SIZE];		/* I/O buffering */
  byte *ibptr, *ibend;
  byte *obptr, *obend;

  /* Query processing status */
  struct results *results;		/* Result structure for this query */
  struct database *dbase;		/* Database we're currently examining */
  struct word *words;			/* Words matched in the query */
  uns nwords;
  struct ref_chain *first_ref, *last_ref;
  struct phrase *phrases;
  uns nphrases;
  struct phrase *nears;
  uns nnears;
  u32 *bool_map;
  u32 *optimistic_bool_map;
  uns n_bool_ids;			/* Number of words/phrases involved in boolean expression */
  uns matching_docs;			/* Matching documents for this dbase */
  EXTENDED_STAT_VARS			/* Custom statistics */
  int age_min, age_max;			/* Document age relative to ref_time of current database */
  uns contains_accents;			/* There is an accent anywhere in the query */
  int cache_age;			/* Age of cached reply, -1 if not cached */
  uns stat_num_chains;			/* Statistics: total number of reference chains seen */
  uns stat_len_chains;			/* Statistics: total length of reference chains seen */

  /* Timing statistics */
  uns time_total;
  char *profile_stats;

  /* Memory mappings */
  addr_int_t last_mapping;
};

#define CONTEXT_FULL 1000000000		/* context_chars when CONTEXT FULL is asked for */

/* NOTE(review): the uns argument is presumably the length of the byte string -- confirm against sherlockd.c */
void reply_string(struct query *, byte *, uns);

/* Internal parameters masking themselves as extended attributes */
/* Beware, they should not collide with extended attr offsets in struct query */
/* (the extended attribute fields are the ones generated from EXTENDED_ATTRS) */
#define PARAM_SITE		0
#define PARAM_AGE		1
#define PARAM_CARDID		2

/* Debug flags (global_debug, query->debug, DEBUG) */
/* Each flag is a distinct bit, so the values can be OR'ed together. */

#define DEBUG_NOCACHE		1	/* Disable reply caching */
#define DEBUG_ANALYSE		2	/* Debug query analyser */
#define DEBUG_DUMPING		4	/* Dumping the context */
#define DEBUG_WORDS		8	/* Debug processing of words */
#define DEBUG_CARD_INFO		16	/* Show result notes and card attributes */

/* query.c */

/* NOTE(review): presumably the query and database being processed right now -- confirm against query.c */
extern struct query *current_query;
extern struct database *current_dbase;

/* When processing a simple search query, we represent it by a list of these structures */

/* One element of the list representing a simple search query; `n' is its list node */
struct simple {
  cnode n;
  struct expr *raw;			/* "raw" version, that is an EX_MATCH node */
  struct expr *cooked;			/* "cooked" version */
  clist phrase;				/* words.c: phrase expansion */
};

/*
 *   For each unique entity we match (words, word complexes, strings), we create
 *   a single struct word holding all the relevant information and pointing
 *   to reference chains corresponding to the expansion of this word. Multiple
 *   occurrences of the same entity (that is, if all the key attributes marked
 *   with [K] are equal) get mapped to the same struct word.
 */

/* A matched entity; members marked [K] are the key attributes identifying it */
struct word {
  /* stuff used by refs.c goes first to improve caching */
  uns boolean_id;			/* ID used in boolean expression */
  uns type_mask;			/* [K] Bitmap of allowed word/string types, upper 16 bits are meta types */
  uns doc_count;			/* Number of documents matched */
  int weight;				/* Word weight */
  int q, q2, q2strict;			/* refs.c: Best matches recorded for the current OID */
  int pos, pos2;
  uns is_string;			/* [K] Match strings, not words */
  uns is_outer;				/* At least once outside a phrase */
  struct options options;		/* [K] Local word matching options; translate_accent_mode()'d */
  uns ref_count;			/* Number of reference chains matched */
  uns ref_total_len;			/* Total length of all reference chains */
  uns status;				/* 0 for OK or error code */
  byte *word;				/* [K] The word itself */
  uns expanded;				/* Already expanded (0=not, 1=partially, 2=with refs) */
  uns word_class;			/* words.c: Word class */
  clist variants;			/* words.c: The list of word variants */
  uns var_count;			/* words.c: The number of variants */
  uns cover_count;			/* words.c: Number of occurrences inside a complex */
  uns use_count;			/* Number of times this word was looked up */
  uns hide_count;			/* If hide_count == use_count, don't report the word unless it was matched */
  uns is_wild;				/* Contains wildcards */
  struct word *root;			/* If it's a complex, then its root word, else NULL */
#ifdef CONFIG_EXPLAIN
  uns explain_ref, explain_ref2s;	/* Ref entries remembered for EXPLAINing */
#endif
};

/*
 *  Description of one phrase or near-matcher. The per-position arrays hold
 *  MAX_PHRASE_LEN entries, word_to_idx[] holds HARD_MAX_WORDS entries.
 */
struct phrase {				/* Used for both phrases and near-matchers */
  uns word[MAX_PHRASE_LEN];		/* Words the phrase consists of */
  uns relpos[MAX_PHRASE_LEN];		/* Relative positions of the words */
  byte word_to_idx[HARD_MAX_WORDS];	/* Maps word id to position in phrase, offset by 1 */
  byte next_same_word[MAX_PHRASE_LEN];	/* Next occurrence of the same word, offset by 1 */
  u32 prox_map;				/* Where do we allow proximity */
  uns length;				/* Number of words */
  int weight;
  uns matches;
  uns boolean_id;
  u32 word_mask;
};

/*
 *  One chain of references belonging to a word. The union offers two
 *  representations: a (start, size) pair or a pointer to u16 data.
 *  NOTE(review): presumably u.file locates the chain in the reference file
 *  and u.mem points to it once it is in memory; nothing in this header says
 *  which member is valid at which time -- check refs.c.
 */
struct ref_chain {
  union {
    struct {
      sh_off_t start;
      u32 size;
    } file;
    struct {
      u16 *pos;
    } mem;
  } u;
  u32 lang_mask;			/* Language mask */
  byte word_index;			/* Word this ref belongs to */
  byte noaccent_only;			/* Search accentless documents only */
  byte penalty;				/* Extra negative weight applied to references from this chain */
};

/*
 *  Cached outcome of a query. The structure sits in two lists at once
 *  (hash queue via `h', LRU queue via `n'); `h' has to stay the first member.
 */
struct results {			/* Query results we cache */
  cnode h;				/* In hash queue (_must_ be first!) */
  cnode n;				/* In LRU queue */
  int status;				/* -1=uninitialized, 0=OK, else error code */
  byte *request;			/* Normalized request */
  sh_time_t create_time, access_time;	/* Time created/last used */
  struct mempool *pool;			/* Pool holding results */
  struct replybuf rbuf;			/* Reply strings */
  struct result_note **result_heap;	/* Matching documents */
  uns nresults;
  struct result_note *first_note, *free_note;
  struct hilite_word *first_hilite;	/* A list of words to highlight in output */
};

/*
 *  Element of the list of words to highlight (see results.first_hilite).
 *  NOTE(review): w[1] looks like the pre-C99 idiom for a variable-length
 *  trailing string -- confirm against add_hilited_word().
 */
struct hilite_word {
  struct hilite_word *next;
  byte w[1];
};

/*
 *  Entry points of query.c.
 *  NOTE(review): the meaning of lookup_word()'s return value and of the
 *  `code' passed to eval_err() is not visible in this header.
 */
void cache_init(void);
void query_init(void);
void process_query(struct query *q);
int lookup_word(struct query *q, struct expr *e, byte *w);
void add_hilited_word(struct query *q, byte *w);
void eval_err(int code);

/* Profiling counters */
/*
 *  PROFILERS(sep) lists the counters as P(name) items joined by `sep';
 *  the caller defines P() to say what to do with each name. Here P(x)
 *  yields prof_x, so the extern line below declares prof_analyse,
 *  prof_reff, prof_refs, prof_resf and prof_results. P is undefined again
 *  afterwards, whereas PROFILERS and COMMA stay defined.
 */
#define PROFILERS(sep) P(analyse) sep P(reff) sep P(refs) sep P(resf) sep P(results)
#define COMMA ,
#define P(x) prof_##x
extern prof_t PROFILERS(COMMA);
extern prof_t prof_send;
#undef P
/* NOTE(review): presumably makes `p' the running profiler and returns the previous one -- confirm */
prof_t *profiler_switch(prof_t *p);

/* dbase.c */

/*
 *  Database handling.
 *  NOTE(review): attr_to_db() presumably finds the database owning `attr'
 *  and stores an object ID through `ooid' -- confirm against dbase.c.
 */
void db_init(void);
void db_switch_config(struct database *db);
struct database *attr_to_db(struct card_attr *attr, oid_t *ooid);

/* cards.c */

/* Card output */
void cards_init(void);
int check_result_set(struct query *q);
void show_results(struct query *q);
/* NOTE(review): presumably name tables for word, meta and string types -- confirm against cards.c */
extern byte *wt_names[];
extern byte *mt_names[];
extern byte *st_names[];

/* words.c */

/*
 *  Word processing. Functions taking a `clist *l' operate on a list.
 *  NOTE(review): presumably that is the list of struct simple built for
 *  simple queries -- confirm against words.c.
 */
void words_init(struct database *db);
void word_analyse_simple(struct query *q, clist *l);
int word_classify(struct database *db, byte *word);
int contains_accents(byte *s);
void word_add_hilites(struct query *q, struct word *w);
void spell_check(struct query *q);

/* strings.c */

/* String matching; the signatures parallel words_init() and word_analyse_simple() */
void strings_init(struct database *db);
void string_analyse_simple(struct query *q, clist *l);

/* memory.c */

/*
 *  One entry of the array handed to mmap_regions(). The union holds either
 *  a request (descriptor plus start/end offsets) or a mapping (start/end
 *  pointers).
 *  NOTE(review): presumably the caller fills u.req and mmap_regions()
 *  overwrites it with u.map -- confirm against memory.c.
 */
struct mmap_request {
  union {
    struct {
      int fd;
      sh_off_t start, end;
    } req;
    struct {
      void *start, *end;
    } map;
  } u;
  addr_int_t userdata;			/* Pointer-sized value; its use is not visible here */
};

/*
 *  Memory mapping interface. mmap_region() takes one descriptor with a
 *  start/end offset pair; mmap_regions() takes an array of `count'
 *  requests.
 *  NOTE(review): return value conventions are not visible in this header.
 */
void memory_init(void);
void memory_setup(struct query *q);
void memory_flush(struct query *q);
void *mmap_region(struct query *q, int fd, sh_off_t start, sh_off_t end);
int mmap_regions(struct query *q, struct mmap_request *reqs, int count);

extern struct lizard_buffer *liz_buf;

/* refs.c */

/* Record kept for a document in the result set (see results.result_heap) */
struct result_note {
  uns heap;				/* Back-link to the heap */
  struct card_attr *attr;		/* Attributes of this card */
  int q;
  u32 sec_sort_key;
  u16 best[HARD_MAX_NOTES];		/* position, 0xffff if undefined */
};

/*
 *  Reference processing.
 *  NOTE(review): query_init_refs()/query_finish_refs() presumably bracket
 *  process_refs() for one query -- confirm against refs.c.
 */
void refs_init(void);
void process_refs(struct query *q);
void query_init_refs(struct query *q);
void query_finish_refs(struct query *q);

/* cmds.c */

/* NOTE(review): presumably executes the command stored in q->cmd (set when q->expr is NULL) -- confirm */
void do_command(struct query *q);
