/*
 *	Sherlock Gatherer -- Gathering Objects
 *
 *	(c) 2001--2004 Martin Mares <mj@ucw.cz>
 *	(c) 2004--2005 Robert Spalek <robert@ucw.cz>
 */

#include "sherlock/sherlock.h"
#include "lib/fastbuf.h"
#include "lib/mempool.h"
#include "lib/chartype.h"
#include "lib/md5.h"
#include "lib/base224.h"
#include "sherlock/index.h"
#include "sherlock/tagged-text.h"
#include "lib/bitarray.h"
#include "sherlock/bucket.h"
#include "lib/lizard.h"
#include "lib/ff-utf8.h"
#include "gather/gather.h"

#include <string.h>
#include <stdarg.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/types.h>

/*
 *  Current gatherer object we're working on. Most functions in this file
 *  operate on it implicitly instead of taking the object as a parameter.
 */
struct gobject *gthis;

/*
 *  Initialize libgather. This function has to be called before
 *  any other part of libgather is used.
 */
void
gatherer_init(void)
{
  gather_init_filter();
}

/*
 *  Create a new gatherer object allocated from @pool. When @pool is NULL,
 *  a fresh memory pool is created for the object. The object is zeroed,
 *  gets an attribute object created by obj_new(), an empty list of
 *  references and the current time as its start time.
 *  Dies when the current time cannot be obtained.
 */
struct gobject *
gobj_new(struct mempool *pool)
{
  struct mempool *mp = pool ? pool : mp_new(4096);
  struct gobject *obj = mp_alloc_zero(mp, sizeof(*obj));

  obj->pool = mp;
  obj->aa = obj_new(mp);
  init_list(&obj->ref_list);

  struct timeval now;
  if (gettimeofday(&now, NULL) < 0)
    die("gettimeofday failed: %m");
  obj->start_time = now.tv_sec;
  obj->start_time_us = now.tv_usec;
  return obj;
}

void
gobj_free(struct gobject *g)
{
  bclose(g->contents);
  bclose(g->text);
  bclose(g->meta);
  bclose(g->thumbnail);
  bclose(g->temp);
  mp_delete(g->pool);
}

/*
 *  Duplicate the string @s in the memory pool of the current object.
 *  A NULL string is passed through as NULL.
 */
byte *
gstrdup(byte *s)
{
  return s ? mp_strdup(gthis->pool, s) : NULL;
}

/*
 *  Report an error on the current object and leave through its error hook.
 *  @code is the error code and @msg a printf-like format of the message;
 *  both are stored in the object before the hook is called. Any pending
 *  alarm is cancelled first. The hook is not expected to return -- if it
 *  does, or if the formatted message does not fit the local buffer,
 *  the program dies.
 */
void
gerror(int code, char *msg, ...)
{
  byte text[MAX_URL_SIZE + 1024];
  va_list ap;
  int len;

  alarm(0);

  if (code == 1 && gthis->robot_file_p)
    {
      /* The robot file turned out to be a redirect, which we consider to be an error */
      code = 2305;
      msg = "Robot file is a redirect";
    }

  va_start(ap, msg);
  len = vsnprintf(text, sizeof(text), msg, ap);
  va_end(ap);
  if (len < 0 || len >= (int) sizeof(text))
    die("gerror: message too long");

  gthis->error_code = code;
  gthis->error_msg = gstrdup(text);
  gthis->error_hook();
  die("error_hook has returned");
}

/*
 *  Calculate the MD5 sum of everything stored in the memory fastbuf @f
 *  and store it to @md5. The data are read through a temporary read clone,
 *  so @f itself is left alone.
 */
static void
gobj_calc_fb_sum(byte *md5, struct fastbuf *f)
{
  struct MD5Context ctx;
  byte chunk[4096];
  struct fastbuf *in = fbmem_clone_read(f);

  MD5Init(&ctx);
  for (;;)
    {
      uns got = bread(in, chunk, sizeof(chunk));
      if (!got)
	break;
      MD5Update(&ctx, chunk, got);
    }
  bclose(in);
  MD5Final(md5, &ctx);
}

/*
 *  Calculate the MD5 sum of the contents of the current object and mark it
 *  as valid. Nothing is done for objects with an error, when summing is
 *  switched off (min_summed_size is zero) or when the original size of the
 *  object is below min_summed_size.
 */
void
gobj_calc_sum(void)
{
  int summable = !gthis->error_code && min_summed_size && gthis->orig_size >= min_summed_size;

  if (summable)
    {
      gobj_calc_fb_sum(gthis->MD5, gthis->contents);
      gthis->MD5_valid = 1;
    }
}

/* Write the string @s to @b as an attribute of type @type; a NULL string is silently skipped. */
static inline void
gobj_write_str(struct fastbuf *b, int type, byte *s)
{
  if (s)
    bput_attr_str(b, type, s);
}

/* Write the number @n to @b as an attribute of type @type; a zero value is silently skipped. */
static inline void
gobj_write_num(struct fastbuf *b, int type, unsigned int n)
{
  if (n)
    bput_attr_num(b, type, n);
}

/*
 *  Write the contents of the memory stream @f to @b as a sequence of
 *  attributes of type @type. A NULL @f writes nothing. The data are read
 *  through a read clone of @f which is closed at the end.
 *
 *  The data are cut to chunks, one attribute per chunk:
 *    - a newline always ends the current chunk and is not stored,
 *    - a chunk longer than 256 bytes ends before the next byte which is
 *      either <= ' ' (that byte is dropped) or in the range 0x80--0x9f
 *      (that byte starts the next chunk),
 *    - a chunk longer than 512 bytes is ended unconditionally.
 *  Whatever is collected when the stream ends is written as the last
 *  attribute, even if it is empty.
 */
static void
gobj_write_stream(struct fastbuf *b, int type, struct fastbuf *f)
{
  byte buf[520];	/* 512 bytes of chunk plus slack for the multi-byte sequence copied as a whole below */
  int w = 0;		/* Number of bytes collected in buf */
  int c;

  if (!f)
    return;
  f = fbmem_clone_read(f);
  while ((c = bgetc(f)) >= 0)
    {
      /* The chunk is long enough and we are at a suitable break point, or it is too long */
      if (w > 256 && (c <= ' ' || (c >= 0x80 && c < 0xa0)) || w > 512)
	{
	  bput_attr(b, type, buf, w);
	  w = 0;
	  if (c <= ' ')
	    continue;	/* The blank (or newline) we have split at is not stored */
	}
      if (c != '\n')
	buf[w++] = c;
      else
      {
	/* End of line: flush the chunk without the newline */
	bput_attr(b, type, buf, w);
	w = 0;
      }
      if (c >= 0xc0)
	{
	  /* Copy the whole UTF-8 character to avoid line breaks inside */
	  /* One following byte is copied per consecutive one bit below the top bit of the lead byte */
	  /* NOTE(review): the result of bgetc() is not checked for end of stream here */
	  while (c & 0x40)
	    {
	      buf[w++] = bgetc(f);
	      c <<= 1;
	    }
	}
      else if (c >= 0xa0 && c < 0xb0)
	{
	  /*
	   *  NOTE(review): bytes 0xa0--0xaf are presumably the first halves of
	   *  two-byte codes of the tagged text format -- confirm against
	   *  sherlock/tagged-text.h. The second byte is kept in the same chunk.
	   */
	  c = bgetc(f);
	  ASSERT(c >= 0x80);
	  buf[w++] = c;
	}
    }
  bput_attr(b, type, buf, w);
  bclose(f);
}

/*
 *  Write the contents of the memory stream @f to @b as a sequence of
 *  attributes of type @type, each carrying one base224-encoded block of
 *  the data. A NULL @f writes nothing. The data are read through a read
 *  clone of @f which is closed at the end.
 */
static void
gobj_write_base224_stream(struct fastbuf *b, int type, struct fastbuf *f)
{
  byte raw[BASE224_IN_CHUNK*6], enc[BASE224_OUT_CHUNK*6];
  struct fastbuf *in;
  uns got;

  if (!f)
    return;
  in = fbmem_clone_read(f);
  while ((got = bread(in, raw, sizeof(raw))) != 0)
    {
      uns enc_len = base224_encode(enc, raw, got);
      bput_attr(b, type, enc, enc_len);
    }
  bclose(in);
}

/*
 *  Write the MD5 sum of the current object to @b as a hexadecimal
 *  'C' attribute. Nothing is written unless the sum is marked as valid.
 */
static void
gobj_write_sum(struct fastbuf *b)
{
  if (gthis->MD5_valid)
    {
      byte hex[MD5_HEX_SIZE];

      md5_to_hex(gthis->MD5, hex);
      bput_attr_str(b, 'C', hex);
    }
}

/*
 *  Write the reference @ref to @b as an attribute of the reference's type
 *  in the form "<url> <id>". A trailing " 1" is appended when either the
 *  reference itself or the whole current object has links marked as
 *  not to be followed.
 */
static void
gobj_write_ref(struct fastbuf *b, struct gobj_ref *ref)
{
  char *no_follow = "";

  if (ref->dont_follow || gthis->dont_follow_links)
    no_follow = " 1";
  bput_attr_format(b, ref->type, "%s %d%s", ref->url, ref->id, no_follow);
}

/*
 *  Write the meta-information stream @f to @b as a sequence of 'M'
 *  attributes. A NULL @f writes nothing. The data are read through a read
 *  clone of @f which is closed at the end.
 *
 *  The stream is read by bget_tagged_char(). Every value >= 0x80000000
 *  starts a new attribute whose first byte is the low byte of that value;
 *  the characters following it are appended in UTF-8. Characters <= ' '
 *  are replaced by a space. The text of each attribute is cropped: once
 *  the attribute is longer than 256 bytes, the first character <= ' '
 *  discards everything up to the next starting value, and nothing is
 *  appended once the attribute is longer than 512 bytes.
 */
static void
gobj_write_meta_stream(struct fastbuf *b, struct fastbuf *f)
{
  if (!f)
    return;
  byte buf[520];	/* 512 bytes of attribute plus slack for the last UTF-8 character */
  f = fbmem_clone_read(f);
  /* len: bytes collected in buf; cropped: the rest of the current attribute is being discarded */
  uns c, len = 0, cropped = 0;
  while ((c = bget_tagged_char(f)) != ~0U)
    {
      if (c >= 0x80000000)
	{
	  /* Start of a new attribute: flush the previous one and remember the type byte */
	  ASSERT(c >= 0x80000090 && c < 0x80010000);
	  if (len)
	    bput_attr(b, 'M', buf, len);
	  buf[0] = c;
	  len = 1;
	  cropped = 0;
	}
      else
	{
	  /* Ordinary character; the stream must not begin with one */
	  ASSERT(len);
	  if (len > 512 || cropped)
	    continue;
	  if (c <= ' ')
	    {
	      if (len > 256)
		{
		  /* Long enough and at a word boundary: drop the rest of this attribute */
		  cropped = 1;
		  continue;
		}
	      c = ' ';
	    }
	  byte *ptr = buf + len;
	  PUT_UTF8(ptr, c);
	  len = ptr - buf;
	}
    }
  if (len)
    bput_attr(b, 'M', buf, len);
  bclose(f);
}

/*
 *  Dump the current object as a sequence of attributes in the format given
 *  by @bucket_type. The URL ('U') followed by an attribute separator goes
 *  to @b_head, everything else to @b; callers in this file pass either the
 *  same fastbuf for both or a separate one for the body.
 *
 *  String attributes are skipped when NULL and numeric ones when zero
 *  (see gobj_write_str() and gobj_write_num()).
 *
 *  @flags: GWF_DUMP_SOURCE adds the raw contents ('Z') of objects whose
 *  content type starts with "text/", GWF_DUMP_BODY adds the text ('X') and
 *  the base224-encoded thumbnail ('N') unless the object is marked as
 *  dont_save_contents.
 *
 *  Dies when the current time cannot be obtained.
 */
static void
gobj_dump(struct fastbuf *b_head, struct fastbuf *b, uns bucket_type, uns flags)
{
  put_attr_set_type(bucket_type);

  /* Header part */
  gobj_write_str(b_head, 'U', gthis->url);
  bput_attr_separator(b_head);

  /* Body part: simple attributes of the object */
  gobj_write_str(b, 'v', "1");
  gobj_write_num(b, 'D', gthis->start_time);
  gobj_write_num(b, 'L', gthis->lastmod_time);
  gobj_write_num(b, 'e', gthis->expires_time);
  gobj_write_str(b, 'E', gthis->content_encoding);
  gobj_write_str(b, 'T', gthis->content_type);
  gobj_write_str(b, 'S', gthis->http_server);
  gobj_write_str(b, 'g', gthis->etag);
  gobj_write_str(b, 'c', gthis->charset);
  gobj_write_str(b, 'l', gthis->language);
  gobj_write_num(b, 's', gthis->orig_size);
  struct timeval tv;
  if (gettimeofday(&tv, NULL) < 0)
    die("gettimeofday failed: %m");
  /* 'h': time elapsed since the start time recorded in the object, in milliseconds */
  gobj_write_num(b, 'h', 1000*(tv.tv_sec - gthis->start_time) + ((int)(tv.tv_usec - gthis->start_time_us)) / 1000);
  if (gthis->truncated)
    gobj_write_str(b, '.', "Truncated");
  gobj_write_sum(b);
  /* References, extra attributes collected in gthis->aa and meta-information */
  struct gobj_ref *ref;
  WALK_LIST(ref, gthis->ref_list)
    gobj_write_ref(b, ref);
  obj_write(b, gthis->aa);
  gobj_write_meta_stream(b, gthis->meta);
  /* Optional streams */
  if ((flags & GWF_DUMP_SOURCE) && gthis->content_type && !strncasecmp(gthis->content_type, "text/", 5))
    gobj_write_stream(b, 'Z', gthis->contents);
  if ((flags & GWF_DUMP_BODY) && !gthis->dont_save_contents)
    {
      gobj_write_stream(b, 'X', gthis->text);
      gobj_write_base224_stream(b, 'N', gthis->thumbnail);
    }
  /* Final status line: four-digit error code and the error message, "OK" when there is no message */
  bput_attr_format(b, '!', "%04d %s", gthis->error_code, gthis->error_msg ?: (byte *)"OK");
}

/* Length of the string @c, with NULL counted as an empty string. */
static uns
safe_strlen(char *c)
{
  if (!c)
    return 0;
  return strlen(c);
}

/*
 *  Number of bytes written so far to the stream @b; a NULL stream counts
 *  as empty.
 *
 *  Trick: bfilesize() cannot be used here, because the streams are the
 *  write parts of fbmem's, which are not seekable. However, they are
 *  always positioned just after the last byte written, so btell() gives
 *  the size we want.
 */
static uns
gobj_stream_size(struct fastbuf *b)
{
  if (!b)
    return 0;
  return btell(b);
}

/*
 *  Estimate the size of the dump of the current object produced by
 *  gobj_dump() with the given @flags. The estimate consists of the lengths
 *  of the variable-sized strings (URL, server name, ETag, URLs of all
 *  references), a fixed reserve of 4096 bytes for the remaining attributes
 *  and the sizes of the dumped streams enlarged by 10% (20% for the
 *  thumbnail) to cover the attribute overhead.
 *
 *  The result is only a hint for sizing a buffer, it is not guaranteed to
 *  be an upper bound. A missing URL is counted as an empty one, the same
 *  way as gobj_dump() tolerates it.
 */
static uns
gobj_estimate_length(uns flags)
{
  uns l = safe_strlen(gthis->url) + safe_strlen(gthis->http_server) + safe_strlen(gthis->etag) + 4096;
  struct gobj_ref *ref;
  WALK_LIST(ref, gthis->ref_list)
    l += strlen(ref->url) + 16;
  l += gobj_stream_size(gthis->meta) * 1.1;
  if (flags & GWF_DUMP_SOURCE)
    l += gobj_stream_size(gthis->contents) * 1.1;
  if (flags & GWF_DUMP_BODY)
    l += gobj_stream_size(gthis->text) * 1.1 + gobj_stream_size(gthis->thumbnail) * 1.2;
  return l;
}

/*
 *  Write the current object to @b with its body compressed by
 *  lizard_compress() and return the bucket type which was really used.
 *
 *  The header (URL and separator) is written directly to @b by gobj_dump(),
 *  the body is first dumped to a temporary memory fastbuf. The compressed
 *  body is preceded by two 32-bit values: the length of the uncompressed
 *  body and its adler32 checksum.
 *
 *  The function falls back to an uncompressed BUCKET_TYPE_V33 object (and
 *  returns that type instead of @bucket_type) in two cases: when the
 *  object has a thumbnail, and when the compressed size is not below
 *  gather_min_compression percent of the original body size.
 *
 *  Both the input and the output are accessed directly inside the buffers
 *  of the fastbufs whenever they are large enough; otherwise temporary
 *  buffers are allocated by xmalloc().
 */
static uns
gobj_write_compressed(struct fastbuf *b, uns bucket_type, uns flags)
{
  if (gthis->thumbnail)				// do not try to compress images
  {
    bucket_type = BUCKET_TYPE_V33;
    gobj_dump(b, b, bucket_type, flags);
    return bucket_type;
  }

  uns len_in, block_in, est_len, est_out, block_out, len_out;
  byte *buf_in, *buf_out;
  struct fastbuf *w_body, *b_body;

  est_len = gobj_estimate_length(flags);
  w_body = fbmem_create(est_len);		// should be long enough to fit in just 1 block, however not necessary

  /* Header goes straight to b, the body to the temporary buffer */
  gobj_dump(b, w_body, BUCKET_TYPE_V33, flags);

  /* Switch from writing the body to reading it back */
  bflush(w_body);
  len_in = btell(w_body);
  b_body = fbmem_clone_read(w_body);
  bclose(w_body);

  block_in = bdirect_read_prepare(b_body, &buf_in);
  if (block_in < len_in)			// cannot use zero-copy input
    {
      log(L_WARN, "gobj_write_compressed: Wrong estimate of body size: %d > %d on URL %s, cannot zero-copy", len_in, block_in, gthis->url);
      buf_in = xmalloc(len_in);
      /* NOTE(review): the result of bread() is not checked; the clone is expected to hold exactly len_in bytes */
      bread(b_body, buf_in, len_in);
    }
  uns adler = adler32(buf_in, len_in);

  /* Worst-case size of the compressed body plus the 8 bytes of length and checksum */
  est_out = len_in * LIZARD_MAX_MULTIPLY + LIZARD_MAX_ADD + 8;
  block_out = bdirect_write_prepare(b, &buf_out);
  if (est_out > block_out)
    {
      /* The output buffer of b is too small, compress to a temporary buffer */
      buf_out = xmalloc(est_out);
      len_out = lizard_compress(buf_in, len_in, buf_out);
      if (len_out >= len_in * (gather_min_compression / 100.))
	{
	  /* Compression does not pay off, store the body as it is */
	  bucket_type = BUCKET_TYPE_V33;
	  bwrite(b, buf_in, len_in);
	}
      else
	{
	  bputl(b, len_in);
	  bputl(b, adler);
	  bwrite(b, buf_out, len_out);
	}
      xfree(buf_out);
    }
  else						// zero-copy output
    {
      PUT_U32(buf_out, len_in);
      PUT_U32(buf_out+4, adler);
      /* NOTE(review): unlike in the branch above, len_out includes the 8 bytes of header in the comparison below */
      len_out = lizard_compress(buf_in, len_in, buf_out+8) + 8;
      if (len_out >= len_in * (gather_min_compression / 100.))
	{
	  /* Nothing has been committed, so the prepared data are simply overwritten by the raw body */
	  bucket_type = BUCKET_TYPE_V33;
	  bwrite(b, buf_in, len_in);
	}
      else
	bdirect_write_commit(b, buf_out + len_out);
    }

  if (block_in < len_in)
    xfree(buf_in);
  bclose(b_body);				// bdirect_read_commit() not needed

  return bucket_type;
}

/*
 *  Write the current object to @b and return the bucket type which was
 *  really used. BUCKET_TYPE_V33_LIZARD is honoured only when compression
 *  is switched on (gather_min_compression is non-zero), otherwise it is
 *  replaced by plain BUCKET_TYPE_V33. See gobj_dump() for @flags.
 */
uns
gobj_write(struct fastbuf *b, uns bucket_type, uns flags)
{
  ASSERT(b);
  if (bucket_type == BUCKET_TYPE_V33_LIZARD)
    {
      if (gather_min_compression)
	return gobj_write_compressed(b, bucket_type, flags);
      bucket_type = BUCKET_TYPE_V33;
    }
  gobj_dump(b, b, bucket_type, flags);
  return bucket_type;
}

/*
 *  Parse the URL @u, make it absolute and canonical and store its parts
 *  to @url. Returns the canonical URL in its escaped text form. All
 *  strings are allocated from the memory pool of the current object.
 *
 *  A relative URL is resolved against the base URL of the current object
 *  if there is one, otherwise against the URL of the object itself. When
 *  @allow_rel is zero and log_base_errors is set, such a URL is also
 *  logged as an error. @msg describes the kind of the URL in messages.
 *
 *  Any failure is reported by gerror() with code 2000 + the URL error
 *  code, so the function does not return in that case.
 */
byte *
gobj_parse_url(struct url *url, byte *u, byte *msg, uns allow_rel)
{
  byte deesc[MAX_URL_SIZE], parts[MAX_URL_SIZE], packed[MAX_URL_SIZE], escaped[MAX_URL_SIZE];
  struct url ur;
  int e;

  e = url_deescape(u, deesc);
  if (!e)
    e = url_split(deesc, &ur, parts);
  if (e)
    goto urlerr;

  e = url_normalize(&ur, NULL);
  if (e)
    {
      /* Only a missing base is recoverable, and only if we really have some base */
      if (e != URL_ERR_REL_NOTHING || (!gthis->url && !gthis->base_url))
	goto urlerr;
      if (!allow_rel && log_base_errors)
	log(L_ERROR_R, "Relative %s URL encountered: %s", msg, u);
      e = url_normalize(&ur, gthis->base_url ? &gthis->base_url_s : &gthis->url_s);
      if (e)
	goto urlerr;
    }

  e = url_canonicalize(&ur);
  if (!e)
    e = url_pack(&ur, packed);
  if (!e)
    e = url_enescape(packed, escaped);
  if (e)
    goto urlerr;

  /* The parts point to our local buffers, move them to the pool */
  ur.protocol = gstrdup(ur.protocol);
  ur.user = gstrdup(ur.user);
  ur.pass = gstrdup(ur.pass);
  ur.host = gstrdup(ur.host);
  ur.rest = gstrdup(ur.rest);
  *url = ur;	/* We need a local copy as we might have used the same URL as a base */
  return gstrdup(escaped);

 urlerr:
  gerror(2000+e, "Error parsing %s URL %s: %s", msg, u, url_error(e));
}

/*
 *  Return the URL which relative references of the current object are
 *  resolved against: the explicit base URL if the object has one,
 *  the URL of the object itself otherwise.
 */
struct url *
gobj_base_url(void)
{
  return gthis->base_url ? &gthis->base_url_s : &gthis->url_s;
}

/*
 *  Add a reference of the given @type to @url to the current object and
 *  return it. The URL is resolved against @base (or against
 *  gobj_base_url() when @base is NULL) and canonicalized. @ctype is the
 *  content type of the target; when it is NULL, it is guessed from the
 *  name of the target, with an empty string used if the guess fails.
 *
 *  If the object already contains a reference with the same type, URL and
 *  content type, that one is returned instead of adding a duplicate.
 *  Returns NULL when @url is NULL or invalid; invalid URLs are logged if
 *  log_ref_errors is set.
 */
struct gobj_ref *
gobj_add_ref_full(int type, byte *url, byte *ctype, struct url *base)
{
  byte work1[MAX_URL_SIZE], work2[MAX_URL_SIZE], packed[MAX_URL_SIZE], canon[MAX_URL_SIZE];
  struct url parts;
  struct gobj_ref *ref;
  int err;

  if (!url)
    return NULL;
  if (!base)
    base = gobj_base_url();

  err = url_canon_split_rel(url, work1, work2, &parts, base);
  if (!err)
    err = url_pack(&parts, packed);
  if (!err)
    err = url_enescape(packed, canon);
  if (err)
    {
      if (log_ref_errors)
	log(L_WARN_R, "Invalid ref to %s: %s", url, url_error(err));
      return NULL;
    }
  url = canon;

  if (ctype)
    {
      /* Keep a private copy of the content type in the pool of the object */
      byte *copy = mp_alloc(gthis->pool, strlen(ctype)+1);
      strcpy(copy, ctype);
      ctype = copy;
    }
  else
    {
      byte *cenc = NULL;
      guess_content_by_name(parts.rest, &ctype, &cenc);
      if (!ctype)
	ctype = "";
    }

  /* FIXME: This is quadratic. */
  WALK_LIST(ref, gthis->ref_list)
    if (ref->type == type && !strcmp(ref->url, url) && !strcmp(ref->content_type, ctype))
      return ref;

  ref = mp_alloc(gthis->pool, sizeof(struct gobj_ref) + strlen(url));
  ref->type = type;
  ref->content_type = ctype;
  ref->id = gthis->ref_count++;
  ref->dont_follow = 0;
  strcpy(ref->url, url);
  add_tail(&gthis->ref_list, &ref->n);
  return ref;
}

/*
 *  Add a reference of the given @type to @url to the current object,
 *  guessing the content type and using the default base URL.
 *  See gobj_add_ref_full() for details.
 */
struct gobj_ref *
gobj_add_ref(int type, byte *url)
{
  return gobj_add_ref_full(type, url, NULL, NULL);
}

/*
 *  Compare the MD5 sum of the current object with the sum stored in the
 *  'C' attribute of the object being refreshed. Returns 0 if they are
 *  equal and GOBJ_CHG_TEXT_LARGE if they differ or if one of them is
 *  not available.
 */
static uns
gobj_diff_text(void)
{
  byte *prev_hex = obj_find_aval(gthis->refreshing, 'C');

  if (prev_hex && gthis->MD5_valid)
    {
      byte prev_sum[MD5_SIZE];

      hex_to_md5(prev_hex, prev_sum);
      if (!memcmp(gthis->MD5, prev_sum, MD5_SIZE))
	return 0;
    }
  return GOBJ_CHG_TEXT_LARGE;
}

/*
 *  Compare the references of the current object with those of the object
 *  being refreshed. Returns GOBJ_CHG_REFS if they differ and 0 otherwise.
 *
 *  For each reference type used by the current object, the references of
 *  that type are matched one by one, in order, against the values of the
 *  attribute of the same type in the old object. The URL is the part of
 *  the old value up to the first space. A difference is reported when
 *  the URLs at the same position differ or when the old object runs out of
 *  values. References remaining only in the old object are not noticed.
 */
static uns
gobj_diff_refs(void)
{
  BIT_ARRAY(types_used, 128);
  struct gobj_ref *ref, *r;
  struct oattr *attr;
  byte url[MAX_URL_SIZE];

  /* Only newly added references are detected */
  bit_array_zero(types_used, 128);
  WALK_LIST(ref, gthis->ref_list)
    if (!bit_array_test_and_set(types_used, ref->type))
      {
	/* First reference of this type: check all references of the type at once */
	attr = obj_find_attr(gthis->refreshing, ref->type);
	for (r=ref; r->n.next; r=(struct gobj_ref *)r->n.next)
	  {
	    if (r->type != ref->type)
	      continue;
	    /*
	     *  The old object has no more references of this type, so this one
	     *  is new. Returning here also guarantees that attr is never
	     *  dereferenced when it is NULL, not even for an empty URL.
	     */
	    if (!attr)
	      return GOBJ_CHG_REFS;
	    byte *u = attr->val;
	    byte *stop = (byte*)strchr(u, ' ') ? : (u + strlen(u));
	    ASSERT(stop-u < MAX_URL_SIZE);
	    memcpy(url, u, stop-u);
	    url[stop-u] = 0;
	    if (strcmp(url, r->url))
	      return GOBJ_CHG_REFS;
	    attr = attr->same;
	  }
      }
  return 0;
}

static uns
gobj_diff_http(void)
{
  /* ETag */
  byte *ex = gthis->etag ? : (byte *)"";
  byte *ey = obj_find_aval(gthis->refreshing, 'g') ? : (byte *)"";
  if (strcmp(ex, ey))
    return GOBJ_CHG_HTTP;

  /* Last Modified */
  byte *ly;
  sh_time_t lm;
  if (ly = obj_find_aval(gthis->refreshing, 'L'))
    lm = atol(ly);
  else
    lm = 0;
  if (gthis->lastmod_time != lm)
    return GOBJ_CHG_HTTP;

  return 0;
}

/*
 *  Find out how the current object differs from the object being
 *  refreshed. Returns a bit mask of GOBJ_CHG_xxx flags, with all bits set
 *  when there is no old object at all. GOBJ_CHG_FORCED is added when the
 *  start time of the old object ('D' attribute) is more than
 *  max_refresh_age older than the start time of the current one.
 */
static uns
gobj_diff(void)
{
  if (!gthis->refreshing)
    return ~0;

  int changes = gobj_diff_text();
  changes |= gobj_diff_refs();
  changes |= gobj_diff_http();

  byte *prev_start = obj_find_aval(gthis->refreshing, 'D');
  if (prev_start && gthis->start_time - (sh_time_t)atol(prev_start) > max_refresh_age)
    changes |= GOBJ_CHG_FORCED;

  return changes;
}

/*
 *  Compare the current object with the object being refreshed, set the
 *  'J' and 'p' attributes of the current object accordingly and return
 *  the bit mask of changes found by gobj_diff().
 *
 *  On a large change of the text, 'J' is set to the start time of the
 *  current object and the 'J' of the old object (if any) is saved as 'p'.
 *  Otherwise both attributes are inherited from the old object.
 */
uns
gobj_check_update(void)
{
  struct odes *prev = gthis->refreshing;
  uns diff = gobj_diff();

  if (!(diff & GOBJ_CHG_TEXT_LARGE))
    {
      /* Small or no update: inherit "J" and "p" from the previous version */
      if (prev)
	{
	  obj_set_attr(gthis->aa, 'J', obj_find_aval(prev, 'J'));
	  obj_set_attr(gthis->aa, 'p', obj_find_aval(prev, 'p'));
	}
      return diff;
    }

  /* Big update: new "J" and save the old one to "p" */
  obj_set_attr_num(gthis->aa, 'J', gthis->start_time);
  if (prev)
    {
      byte *prev_j = obj_find_aval(prev, 'J');
      if (prev_j)
	obj_set_attr(gthis->aa, 'p', prev_j);
    }
  return diff;
}

/*
 *  Mark the current object as truncated. Unless truncation is allowed
 *  (allow_truncate), this is reported as error 2405 by gerror().
 */
void
gobj_truncate(void)
{
  gthis->truncated = 1;
  if (allow_truncate)
    return;
  gerror(2405, "Object too large");
}
