/*
 *	Sherlock Gatherer: Document Format Multiplexer
 *
 *	(c) 2001--2002 Martin Mares <mj@ucw.cz>
 *
 *	The multiplexer is split into two parts: format.c and parse.c
 *	to avoid linking of all the parsing machinery where only the
 *	list of known content types and encodings is needed.
 */

#include "sherlock/sherlock.h"
#include "lib/conf.h"
#include "gather/gather.h"

#include <string.h>

/*** Known parser names (remember to update parse.c) ***/

/*
 *  Parser names accepted as the second word of the "Type" and "Encoding"
 *  configuration items. The index of a name in this array is the parser
 *  number which parse_add_hook() stores in the hooks and which the
 *  identify_*() functions return.
 */
byte *parser_names[] = {
  "sink",
  "text",				/* Caveat: position of this one is hard-coded in identify_*() (they return 1 for a NULL argument) */
  "html",
  "gzip",
  "deflate",
  "compress",
  "robots",
  "external",
#ifdef CONFIG_PDF
  "pdf",
#endif
};

/*** Configuration ***/

int trace_parse, max_conversions;

/* One "Type" or "Encoding" rule from the configuration, kept in type_hooks or encoding_hooks */
struct parser_hook {
  node n;				/* List linkage, handed to add_tail() */
  byte *type_patt;			/* First word of the rule: matched by match_ct_patt() for types, by strcasecmp() for encodings */
  int parser;				/* Index into parser_names[] */
};

/*
 *  Lists of struct parser_hook filled by parse_add_hook(): type_hooks is
 *  searched by identify_content_type(), encoding_hooks by
 *  identify_content_encoding().
 */
static list type_hooks, encoding_hooks;

/*
 *  Configuration callback shared by the "Type" and "Encoding" items.
 *
 *  The value must split into exactly two words: a pattern and the name
 *  of a parser listed in parser_names[]. On success, a new hook is
 *  appended to type_hooks or encoding_hooks and NULL is returned.
 *  Otherwise a static error message is returned and nothing is allocated
 *  or linked.
 */
byte *
parse_add_hook(struct cfitem *item, byte *value)
{
  /* Of the two items using this callback, only "Type" starts with 'T' */
  list *l = (item->name[0] == 'T') ? &type_hooks : &encoding_hooks;
  byte *w[2];
  struct parser_hook *hook;
  uns i;

  if (wordsplit(value, w, 2) != 2)
    return "Expecting type mask and parser name";

  for (i=0; i<ARRAY_SIZE(parser_names); i++)
    if (!strcmp(parser_names[i], w[1]))
      break;
  if (i >= ARRAY_SIZE(parser_names))
    return "Unknown parser";

  /*
   *  Allocate only once the rule is known to be valid, so that rejected
   *  lines do not consume configuration memory, and fill the hook in
   *  completely before it becomes reachable through the list.
   */
  hook = cfg_malloc(sizeof(*hook));
  /* NOTE(review): w[0] is stored without copying -- presumably it points into `value'; confirm its lifetime */
  hook->type_patt = w[0];
  hook->parser = i;
  add_tail(l, &hook->n);
  return NULL;
}

/*
 *  Items of the "Parse" configuration section: two integers stored
 *  directly into trace_parse and max_conversions, and two items ("Type",
 *  "Encoding") which are both handled by parse_add_hook().
 */
static struct cfitem parse_config[] = {
  { "Parse",		CT_SECTION,	NULL },
  { "Trace",		CT_INT,		&trace_parse },
  { "MaxConversions",	CT_INT,		&max_conversions },
  { "Type",		CT_FUNCTION,	parse_add_hook },
  { "Encoding",		CT_FUNCTION,	parse_add_hook },
  { NULL,		CT_STOP,	NULL }
};

/*
 *  Register the "Parse" configuration section and start with both hook
 *  lists empty.
 *  NOTE(review): CONSTRUCTOR presumably makes this run automatically at
 *  program start-up -- confirm against the macro definition.
 */
static void CONSTRUCTOR
parse_init_config(void)
{
  cf_register(parse_config);

  /* The two lists are independent of each other */
  init_list(&encoding_hooks);
  init_list(&type_hooks);
}

/*** Parsing ***/

/*
 *  Pick a parser for the given content type.
 *
 *  Returns 1 (the position of "text" in parser_names[]) when no type is
 *  given, the parser of the first hook in type_hooks whose pattern
 *  satisfies match_ct_patt(), or -1 when no hook matches.
 */
int
identify_content_type(byte *type)
{
  struct parser_hook *h;

  if (type == NULL)
    return 1;

  WALK_LIST(h, type_hooks)
    {
      if (match_ct_patt(h->type_patt, type))
	return h->parser;
    }

  return -1;
}

/*
 *  Pick a parser for the given content encoding.
 *
 *  Returns 1 (the position of "text" in parser_names[]) when no encoding
 *  is given, the parser of the first hook in encoding_hooks whose name
 *  equals the encoding (compared case-insensitively), or -1 when no hook
 *  matches.
 */
int
identify_content_encoding(byte *enc)
{
  struct parser_hook *h;

  if (enc == NULL)
    return 1;

  WALK_LIST(h, encoding_hooks)
    {
      if (strcasecmp(h->type_patt, enc) == 0)
	return h->parser;
    }

  return -1;
}
