/*
 *	Sherlock Gatherer: Document Format Multiplexer
 *
 *	(c) 2001 Martin Mares <mj@ucw.cz>
 *
 *	The multiplexer is split into two parts: format.c and parse.c,
 *	to avoid linking in all the parsing machinery where only the
 *	list of known content types and encodings is needed.
 */

#include "sherlock/sherlock.h"
#include "lib/conf.h"
#include "gather/gather.h"

#include <string.h>

/* Emit a printf-style message at L_DEBUG level, but only when the
 * trace_parse flag is non-zero.  Uses the GNU named variadic macro
 * syntax (y... with ##y), so it may be called with or without
 * extra arguments. */
#define TRACE(x,y...) do { if (trace_parse) log(L_DEBUG, x,##y); } while (0)

/*** Known parser functions (remember to keep in sync with format.c) ***/

/* Dispatch table of parser entry points.  parse() indexes it by the
 * parser number returned from identify_content_encoding() and
 * identify_content_type(), and uses the same number to index
 * parser_names[], so the order of entries here must match that table.
 * The table is never modified at run time, hence the const qualifier. */
static int (* const parser_functions[])(void) = {
  sink_parse,
  text_parse,
  html_parse,
  gzip_parse,
  deflate_parse,
  compress_parse,
  robots_parse,
  external_parse,
#ifdef CONFIG_PDF
  pdf_parse,
#endif
};

/*** The parsing multiplexer ***/

/*
 * Parse the current document (gthis) by repeatedly choosing a parser and
 * running it until one of them reports that the document is finished.
 *
 * Each pass of the loop:
 *   1. calls gather_filter();
 *   2. if a content encoding is set, looks up a parser for it (error 2403
 *      when the encoding is unknown);
 *   3. otherwise looks up a parser by content type (error 2400 when the
 *      type is missing or unknown, or when a robots file carries a type
 *      other than "x-sherlock/robots");
 *   4. runs the selected parser.  A non-zero return finishes the document:
 *      validate_document() and gobj_calc_sum() are called and the content
 *      type is reset to the first one seen by step 3.  A zero return is
 *      counted as one conversion and the loop starts over.
 *
 * When max_conversions is non-zero, error 2402 is raised once more than
 * max_conversions parser runs have returned zero; zero means no limit.
 *
 * NOTE(review): the flow below relies on gerror() not returning --
 * presumably it is declared noreturn elsewhere; confirm.
 */
void
parse(void)
{
  int convcnt = 0;
  int parser;
  byte *t;
  byte *orig_content_type = NULL;	/* First content type examined, restored on success */

  /* Robot files are always treated as having the internal robots type */
  if (gthis->robot_file_p)
    gthis->content_type = "x-sherlock/robots";

  do
    {
      gather_filter();

      /* A pending content encoding takes precedence over the content type */
      if ((t = gthis->content_encoding) != NULL)
	{
	  parser = identify_content_encoding(gthis->content_encoding);
	  if (parser >= 0)
	    {
	      TRACE("Parsing content-encoding %s by %s", t, parser_names[parser]);
	      cut_inenc_suffix(gthis->url_s.rest, t);
	      goto work;
	    }
	  gerror(2403, "Unknown content encoding %s", t);
	}

      if ((t = gthis->content_type) == NULL)
	gerror(2400, "Document has no content type");
      if (gthis->robot_file_p && strcmp(gthis->content_type, "x-sherlock/robots"))
	gerror(2400, "robots.txt has invalid content-type %s", gthis->content_type);

      /* Remember only the first content type; later passes may see a changed one */
      if (!orig_content_type)
	orig_content_type = gthis->content_type;

      parser = identify_content_type(gthis->content_type);
      if (parser >= 0)
	{
	  TRACE("Parsing content-type %s by %s", t, parser_names[parser]);
	  goto work;
	}
      gerror(2400, "Unknown content type %s", t);

    work:
      if (parser_functions[parser]())
	{
	  /* The parser finished the document */
	  validate_document();
	  gobj_calc_sum();
	  /* NOTE(review): if the finishing parser was selected by content
	   * encoding on the very first pass, orig_content_type is still NULL
	   * here and content_type is reset to NULL. */
	  gthis->content_type = orig_content_type;
	  return;
	}

      /* The parser returned zero: count one conversion and go around again */
      convcnt++;
    }
  while (!max_conversions || convcnt <= max_conversions);

  gerror(2402, "Too many conversions");
}

/*
 * Internal helper shared by the decompressors: it makes the decompressed
 * stream the document's contents.
 *
 * The encoding that has just been handled is recorded as an 'E' attribute
 * on gthis->aa and then cleared.  After that, the current contents stream
 * is closed and replaced by gthis->temp, and gthis->temp is reset to NULL.
 */

void
switch_content_encoding(void)
{
  /* Note the processed encoding, then mark the document as unencoded */
  obj_add_attr(gthis->aa, 'E', gthis->content_encoding);
  gthis->content_encoding = NULL;

  /* Drop the old stream and hand over the one held in temp */
  bclose(gthis->contents);
  gthis->contents = gthis->temp;
  gthis->temp = NULL;
}
