/*
 *	Sherlock Indexer -- Initial Object Scanning
 *
 *	(c) 2001--2006 Martin Mares <mj@ucw.cz>
 *	(c) 2002--2005 Robert Spalek <robert@ucw.cz>
 *	(c) 2006 Pavel Charvat <pchar@ucw.cz>
 */

#include "sherlock/sherlock.h"
#include "lib/getopt.h"
#include "lib/fastbuf.h"
#include "lib/md5.h"
#include "lib/mempool.h"
#include "lib/url.h"
#include "lib/bitarray.h"
#include "lib/math.h"
#include "sherlock/object.h"
#include "sherlock/attrset.h"
#include "sherlock/tagged-text.h"
#include "lib/unicode.h"
#include "lib/hashfunc.h"
#include "charset/unicat.h"
#include "indexer/indexer.h"
#include "indexer/matcher.h"
#include "indexer/params.h"
#include "filter/filter.h"
#include "analyser/analyser.h"
#include "lang/lang.h"
#include "lang/detect.h"

#include <string.h>
#include <stdlib.h>
#include <fcntl.h>
#include <time.h>

/* Memory pool for per-object data: created in main(), flushed after every scanned object and handed to the filter as its pool */
static struct mempool *scanner_pool;
/* Index parameters collected during the scan; written out by params_save() at the end of main() */
static struct index_params parameters;

/*
 * Per-object variables exposed to the indexer filter through scan_bindings[].
 * scan_filter() fills them in before every filter run; bonus, card_bonus and
 * language are read again later by initial_weight(), gen_attrs() and
 * gen_lang() (presumably after the filter has adjusted them).
 */
struct scan_filter_data {
  int bonus, card_bonus, area;	/* Reset to 0 for every object; `area' has no entry in scan_bindings[] */
  byte *url;			/* URL of the object as passed to scan_filter() */
  struct url url_s;		/* The same URL split into components by url_canon_split() */
  uns queue_key;		/* Parsed 'k' attribute, F_UNDEF_INT when there is none */
  byte *language;		/* Result of an_lang_decide_language(), NULL for objects without 'X' and 'M' */
  byte *title;			/* Title located by find_title(), NULL if none or if no filter is loaded */
  byte *site_name;		/* Reset to NULL for every object */
  int site_level;		/* Reset to 0 for every object */
  int image_size;		/* Image properties derived from the 'G' attribute, F_UNDEF_INT when there is none */
  int image_aspect_ratio;
  int image_colors;
};
/* Filter interpreter instance; stays NULL when no filter is configured */
static struct filter_args *scan_filter_args;
/* The single instance of filter variables, passed to the filter as its raw data */
static struct scan_filter_data scan_filter_data;

/*
 * Table mapping filter variable names to the offsets of the corresponding
 * fields in struct scan_filter_data. It is passed to filter_load() in
 * scan_filter_init() and terminated by an entry with a NULL name.
 * The "language" variable exists only when CONFIG_LANG is defined.
 */
struct filter_binding scan_bindings[] = {
  /* URL and its parts */
  { "url",		OFFSETOF(struct scan_filter_data, url) },
  { "protocol",		OFFSETOF(struct scan_filter_data, url_s.protocol) },
  { "host",		OFFSETOF(struct scan_filter_data, url_s.host) },
  { "port",		OFFSETOF(struct scan_filter_data, url_s.port) },
  { "path",		OFFSETOF(struct scan_filter_data, url_s.rest) },
  { "username",		OFFSETOF(struct scan_filter_data, url_s.user) },
  { "password",		OFFSETOF(struct scan_filter_data, url_s.pass) },
  /* Gatherer attributes */
  { "queue_key",	OFFSETOF(struct scan_filter_data, queue_key) },
  /* Attributes */
  { "bonus",		OFFSETOF(struct scan_filter_data, bonus) },
  { "card_bonus",	OFFSETOF(struct scan_filter_data, card_bonus) },
  { "site",		OFFSETOF(struct scan_filter_data, site_name) },
  { "site_level",	OFFSETOF(struct scan_filter_data, site_level) },
#ifdef CONFIG_LANG
  { "language",		OFFSETOF(struct scan_filter_data, language) },
#endif
  { "title",		OFFSETOF(struct scan_filter_data, title) },
  { "image_size",	OFFSETOF(struct scan_filter_data, image_size) },
  { "image_aspect_ratio",	OFFSETOF(struct scan_filter_data, image_aspect_ratio) },
  { "image_colors",	OFFSETOF(struct scan_filter_data, image_colors) },
  { NULL,		0 }
};

/*
 * Load the indexer filter and create its interpreter instance.
 * Nothing happens when indexer_filter_name is unset or empty; in that case
 * scan_filter_args keeps its NULL value and scan_filter() accepts every object.
 */
static void
scan_filter_init(void)
{
  if (indexer_filter_name && indexer_filter_name[0])
    scan_filter_args = filter_intr_new(
	filter_load(indexer_filter_name, filter_builtin_vars, scan_bindings, NULL));
}

/*
 * Fetch the queue key of an object.
 *
 * The key is taken from the first 'k' attribute and parsed as a hexadecimal
 * number. Returns F_UNDEF_INT when the attribute is missing or when its value
 * does not start with a hexadecimal number (previously an uninitialized value
 * was returned in the latter case).
 */
static uns
get_queue_key(struct odes *obj)
{
  struct oattr *a = obj_find_attr(obj, 'k');
  uns v;

  /* sscanf() must have converted exactly one field, otherwise `v' is garbage */
  if (!a || sscanf(a->val, "%x", &v) != 1)
    return F_UNDEF_INT;
  return v;
}

/*
 * Find the title among the 'M' attributes of an object.
 *
 * An attribute matches when its value, after skipping at most one leading
 * decimal digit, starts with the byte 0x90 + MT_TITLE. With `raw' set the
 * whole attribute value is returned, otherwise a pointer just past the
 * marker byte. Returns NULL when nothing matches or when MT_TITLE is not
 * defined at compile time.
 */
static byte *
find_title(struct odes *obj UNUSED, uns raw UNUSED)
{
#ifdef MT_TITLE
  struct oattr *meta;

  for (meta = obj_find_attr(obj, 'M'); meta; meta = meta->same)
    {
      byte *p = meta->val;

      if (*p >= '0' && *p <= '9')	/* Optional single-digit prefix */
	p++;
      if (*p != 0x90 + MT_TITLE)
	continue;
      return raw ? meta->val : p + 1;
    }
#endif
  return NULL;
}

/*
 * Derive the image-related filter variables from the 'G' attribute.
 *
 * The attribute is parsed as "<width> <height> <colorspace> <ncolors>".
 *   image_size         = square root of width * height
 *   image_aspect_ratio = longer side / shorter side, scaled by 1024
 *   image_colors       = 0 for the "GRAY" colorspace (case-insensitive),
 *                        otherwise <ncolors>
 *
 * All three variables are set to F_UNDEF_INT when the attribute is missing,
 * when fewer than three fields can be parsed, or when a dimension is zero
 * (which would otherwise divide by zero). image_colors alone is F_UNDEF_INT
 * when the colorspace is not gray and <ncolors> is missing.
 */
static void
get_image_attrs(struct scan_filter_data *d, struct odes *obj)
{
  struct oattr *attrs = obj_find_attr(obj, 'G');
  uns width = 0, height = 0, ncolors = 0;
  byte colorspace[MAX_ATTR_SIZE];
  int fields = 0;

  /* NOTE(review): %s is unbounded; this assumes attribute values never exceed MAX_ATTR_SIZE -- confirm */
  if (attrs)
    fields = sscanf(attrs->val, "%u%u%s%u", &width, &height, colorspace, &ncolors);

  if (fields < 3 || !width || !height)
    {
      d->image_size = d->image_aspect_ratio = d->image_colors = F_UNDEF_INT;
      return;
    }

  /* should be safe because of gatherer's limits */
  d->image_size = sqrtf(width * height);
  d->image_aspect_ratio = (width > height) ? (width << 10) / height : (height << 10) / width;

  if (!strcasecmp(colorspace, "GRAY"))
    d->image_colors = 0;
  else if (fields >= 4)
    d->image_colors = ncolors;
  else
    d->image_colors = F_UNDEF_INT;
}

/*
 * Fill in the filter variables for one object and run the indexer filter.
 *
 * Dies when the URL cannot be canonicalized and split. Returns 1 without
 * running anything (and with title left NULL) when no filter is loaded,
 * otherwise the result of filter_intr_run().
 *
 * The URL components in scan_filter_data.url_s are produced by
 * url_canon_split() using buffers local to this function, so they are
 * presumably valid only until it returns.
 */
static int
scan_filter(byte *url, struct odes *obj)
{
  struct scan_filter_data *data = &scan_filter_data;
  struct filter_args *args = scan_filter_args;
  byte canon_buf[MAX_URL_SIZE], split_buf[MAX_URL_SIZE];

  /* Start from a clean state for every object */
  data->bonus = 0;
  data->card_bonus = 0;
  data->area = 0;
  data->site_name = NULL;
  data->site_level = 0;
  data->title = NULL;

  data->url = url;
  if (url_canon_split(data->url, canon_buf, split_buf, &data->url_s))
    die("scan_filter: error parsing URL");
  data->queue_key = get_queue_key(obj);

  /* Language is decided only for objects carrying some text or metas */
  if (obj_find_aval(obj, 'X') || obj_find_aval(obj, 'M'))
    data->language = an_lang_decide_language(obj);
  else
    data->language = NULL;

  get_image_attrs(data, obj);

  if (!args)
    return 1;

  data->title = find_title(obj, 0);
  args->attr = obj;
  args->raw = data;
  args->pool = scanner_pool;
  return filter_intr_run(args);
}

/* Growing buffers reused for every object: concatenated 'X' (text) and 'M' (metas) attributes */
static struct fastbuf *scan_fb_text, *scan_fb_metas;

/*
 * Initialize the analysers for the scanner hook and create the scratch
 * buffers selected by an_hook_need_mask. A buffer whose bit is not set
 * is left NULL.
 */
static void
scan_analyse_init(void)
{
  enum { SCAN_FB_INITIAL_SIZE = 4096 };

  analyser_init(AN_HOOK_SCANNER, AN_NEED_TEXT | AN_NEED_METAS | AN_NEED_ALL_URLS);

  if (an_hook_need_mask & AN_NEED_TEXT)
    {
      scan_fb_text = fbgrow_create(SCAN_FB_INITIAL_SIZE);
    }
  if (an_hook_need_mask & AN_NEED_METAS)
    {
      scan_fb_metas = fbgrow_create(SCAN_FB_INITIAL_SIZE);
    }
}

/*
 * Run the analysers on a single object.
 *
 * analyser_need() tells which inputs are wanted. The text input is built from
 * all 'X' attributes joined by single spaces, the metas input from all 'M'
 * attributes written back to back. Nothing is run when no input is needed.
 */
static void
scan_analyse(struct odes *o)
{
  struct oattr *attr;
  struct an_iface ai = {
    .obj = o,
    .url_block = o,
    .all_urls = (struct odes *[]) { o, NULL },
  };
  uns need = analyser_need(&ai);

  if (!need)
    return;

  if (need & AN_NEED_TEXT)
    {
      fbgrow_reset(scan_fb_text);
      for (attr = obj_find_attr(o, 'X'); attr; attr = attr->same)
	{
	  /* Same as bputs(), but measured with str_len() */
	  bwrite(scan_fb_text, attr->val, str_len(attr->val));
	  if (attr->same)
	    bputc(scan_fb_text, ' ');
	}
      fbgrow_rewind(scan_fb_text);
      ai.text = scan_fb_text;
    }

  if (need & AN_NEED_METAS)
    {
      fbgrow_reset(scan_fb_metas);
      for (attr = obj_find_attr(o, 'M'); attr; attr = attr->same)
	bwrite(scan_fb_metas, attr->val, str_len(attr->val));
      fbgrow_rewind(scan_fb_metas);
      ai.metas = scan_fb_metas;
    }

  analyser_run_needed(&ai);
}

static void
scan_analyse_end(void)
{
  analyser_log_stats();
  bclose(scan_fb_text);
  bclose(scan_fb_metas);
}

/*
 * Stamp the index parameters with the current time: it becomes the
 * reference time and, copied from there, the database version.
 */
static void
prepare_parameters(void)
{
  parameters.database_version = parameters.ref_time = time(NULL);
}

/*
 * Write a (URL fingerprint, card ID) record to `b'.
 * With CONFIG_MERGING_HASHES and a non-NULL `ca', the leading
 * SHERLOCK_MERGING_HASH_SIZE bytes of the fingerprint are also copied
 * into ca->merging_hash.
 */
static void
gen_fingerprint(struct fastbuf *b, byte *url, uns id, struct card_attr *ca UNUSED)
{
  struct card_print rec;

  rec.cardid = id;
  url_fingerprint(url, &rec.fp);
  bwrite(b, &rec, sizeof(rec));
#ifdef CONFIG_MERGING_HASHES
  if (ca)
    memcpy(ca->merging_hash, &rec.fp, SHERLOCK_MERGING_HASH_SIZE);
#endif
}

/*
 * Write the label records of a card to `b'.
 *
 * Every record consists of the card ID, a type/flags byte, a list of
 * attributes and an attribute separator. Records emitted, in order:
 *   1. URL label (merged-only) with the 'U' attribute and all attributes
 *      matching label_attr_set,
 *   2. override records for override_label_attr_set and
 *      override_body_attr_set, each only if some attribute matches,
 *   3. a plain URL label marking a frameset (with the raw title, if any),
 *      only if frameset_to_redir is set and the object has an 'F' attribute.
 */
static void
gen_labels_by_id(struct fastbuf *b, struct odes *o, byte *url, uns id, struct card_note *note UNUSED)
{
  enum { OVERRIDE_LABEL = 1, OVERRIDE_BODY = 2 };
  struct oattr *attr;
  uns overrides = 0;

  bputl(b, id);
  bputc(b, LABEL_TYPE_URL | LABEL_FLAG_MERGED_ONLY);
  bput_attr_str(b, 'U', url);
  for (attr = o->attrs; attr; attr = attr->next)
    {
      if (attr_set_match(&label_attr_set, attr))
	bput_oattr(b, attr);
      if (attr_set_match(&override_label_attr_set, attr))
	overrides |= OVERRIDE_LABEL;
      if (attr_set_match(&override_body_attr_set, attr))
	overrides |= OVERRIDE_BODY;
    }
  bput_attr_separator(b);

  /* Pass 0 handles label overrides, pass 1 body overrides */
  for (uns pass = 0; pass < 2; pass++)
    {
      if (!(overrides & (1 << pass)))
	continue;
      bputl(b, id);
      bputc(b, (pass ? LABEL_TYPE_BODY : LABEL_TYPE_URL) | LABEL_FLAG_OVERRIDE);
      for (attr = o->attrs; attr; attr = attr->next)
	if (attr_set_match((pass ? &override_body_attr_set : &override_label_attr_set), attr))
	  bput_oattr(b, attr);
      bput_attr_separator(b);
    }

  if (frameset_to_redir && obj_find_attr(o, 'F'))
    {
      byte *title;

      bputl(b, id);
      bputc(b, LABEL_TYPE_URL);
      title = find_title(o, 1);
      if (title)
	bput_attr_str(b, 'M', title);
      bput_attr_str(b, '.', "frameset");
      bput_attr_separator(b);
    }
}

/*
 * Compute the MD5 checksum of a card and gather size statistics on the way.
 *
 * Characters of all 'X' and then all 'M' attributes are decoded by
 * GET_TAGGED_CHAR(), unaccented and collected in `buf'; whenever the buffer
 * fills up, it is fed to MD5. Then the values of the 'N' attributes and the
 * values of the 'A' attributes (including their terminating zero) are fed in,
 * and finally the not yet flushed remainder of `buf'.
 *
 * Side effects:
 *   note->useful_size  number of alphanumeric characters in 'X' and 'M'
 *   note->flags        CARD_NOTE_IMAGE is set if there is an 'N' attribute
 *   attr->flags        CARD_FLAG_ACCENTED if more than 1/128 of the counted
 *                      characters carried an accent, CARD_FLAG_EMPTY if
 *                      nothing was counted at all
 *
 * The checksum record (card ID + MD5) is written to `b' only when the count
 * of characters exceeds min_summed_size.
 */
static void
gen_checksums(struct fastbuf *b, struct odes *o, uns id, struct card_attr *attr, struct card_note *note)
{
  struct csum csum = { .cardid = id };
  word buf[4096];
  uns cnt = 0;
  struct oattr *a;
  struct MD5Context ct;
  uns accents = 0;
  uns chars = 0;
  uns lastc = ' ';		/* Starting with a space drops leading spaces */

  MD5Init(&ct);
  for (byte *aname="XM"; *aname; aname++)
    for (a=obj_find_attr(o, *aname); a; a=a->same)
      {
	byte *z = a->val;
	uns x, c, u;
	do
	  {
	    GET_TAGGED_CHAR(z, x);
	    if (x >= 0x80000000 || !x)
	      {
		/* Codes from 0x80010000 up are ignored; `continue' jumps to the while (x) test */
		if (x >= 0x80010000)
		  continue;
		/* Other special codes and the end of the string count as a space */
		c = ' ';
	      }
	    else
	      c = x;
	    /* Collapse runs of spaces: skip only a space following a space */
	    if (c != lastc || c != ' ')
	      {
		lastc = c;
		u = Uunaccent(c);
		if (c != u)
		  accents++;
		if (Ualnum(c))
		  chars++;
		buf[cnt++] = u;
		if (cnt >= ARRAY_SIZE(buf))
		  {
		    MD5Update(&ct, (byte *) buf, cnt * sizeof(buf[0]));
		    cnt = 0;
		  }
	      }
	  }
	while (x);
      }

  /* Recorded before the lengths of 'N' attributes are added below */
  note->useful_size = chars;

  if (a = obj_find_attr(o, 'N'))
    {
      note->flags |= CARD_NOTE_IMAGE;
      for (; a; a=a->same)
	{
	  byte *z = a->val;
	  uns len = str_len(z);
	  MD5Update(&ct, z, len);
	  chars += len;
	}
    }

  for (a = obj_find_attr(o, 'A'); a; a=a->same)
    {
      byte *z = a->val;
      uns len = str_len(z);
      MD5Update(&ct, z, len + 1);	/* Including the terminating zero */
      /* chars += len; */
    }

  if (accents > chars/128)
    attr->flags |= CARD_FLAG_ACCENTED;
  if (!chars)
    attr->flags |= CARD_FLAG_EMPTY;
  /* Too small cards get no checksum record */
  if (chars <= min_summed_size)
    return;

  /* Feed in whatever remained in the buffer since the last flush */
  MD5Update(&ct, (byte *) buf, cnt * sizeof(buf[0]));

  MD5Final(csum.md5, &ct);
  bwrite(b, &csum, sizeof(csum));
}

/*
 * Write the outgoing links of a card to `b'.
 *
 * For every value of every attribute matching link_attr_set, a record
 * consisting of the fingerprint of the target URL and of (id | edge type)
 * is written. The edge type follows from the attribute name: 'Y' redirect,
 * 'F' frame, 'I' image, anything else a normal link.
 *
 * The URL is the part of the value before the first space. If the value has
 * a second space directly followed by '1', the link is a nofollow link and
 * is skipped. CARD_NOTE_HAS_LINKS is set as soon as any link value is seen,
 * skipped ones included.
 */
static void
gen_links(struct fastbuf *b, struct odes *o, uns id, struct card_note *note)
{
  for (struct oattr *a = o->attrs; a; a = a->next)
    {
      if (!attr_set_match(&link_attr_set, a))
	continue;

      u32 type;
      switch (a->attr)
	{
	case 'Y':
	  type = ETYPE_REDIRECT;
	  break;
	case 'F':
	  type = ETYPE_FRAME;
	  break;
	case 'I':
	  type = ETYPE_IMAGE;
	  break;
	default:
	  type = ETYPE_NORMAL;
	  break;
	}

      for (struct oattr *v = a; v; v = v->same)
	{
	  struct fingerprint fp;
	  byte *space;

	  note->flags |= CARD_NOTE_HAS_LINKS;
	  space = strchr(v->val, ' ');
	  if (space)
	    {
	      byte *flag = space + 1;
	      while (*flag && *flag != ' ')
		flag++;
	      if (*flag && flag[1] == '1')	/* nofollow links */
		continue;
	      *space = 0;			/* Temporarily cut the value after the URL */
	    }
	  url_fingerprint(v->val, &fp);
	  if (space)
	    *space = ' ';
	  bwrite(b, &fp, sizeof(fp));
	  bputl(b, id | type);
	}
    }
}

#if defined(CONFIG_LANG) && defined(CONFIG_FILETYPE)
/*
 * Store the language of a textual card in ca->type_flags.
 *
 * Cards whose file type is not textual are left alone. When a note from a
 * previous run is supplied in `incr', its type_flags replace the card's;
 * otherwise the ID of the primary language decided in scan_filter() is
 * OR-ed in, provided there is a language and it maps to a valid ID.
 */
static void
gen_lang(struct card_attr *ca, struct scanner_note *incr)
{
  if (!FILETYPE_IS_TEXT(CA_GET_FILE_TYPE(ca)))
    return;

  if (incr)
    ca->type_flags = incr->type_flags;
  else
    {
      byte *lang = scan_filter_data.language;
      if (lang)
	{
	  int lang_id = lang_primary_language(lang);
	  if (lang_id >= 0)
	    ca->type_flags |= lang_id;
	}
    }
}
#else
/* Without both CONFIG_LANG and CONFIG_FILETYPE there is nothing to record */
static void
gen_lang(struct card_attr *ca UNUSED, struct scanner_note *incr UNUSED)
{
}
#endif

/*
 * Compute the initial weight of a card: the value of its 'w' attribute
 * (default_weight if there is none) plus the bonus assigned by the filter,
 * clamped to 0..255.
 *
 * The sum is computed in signed arithmetic, so a negative bonus exceeding
 * the weight yields 0. With an unsigned sum, the result would wrap around
 * and could be clamped to 255, depending on how CLAMP() treats its argument.
 */
static inline byte
initial_weight(struct odes *o)
{
  int weight = default_weight;
  byte *w = obj_find_aval(o, 'w');

  if (w)
    weight = atoi(w);
  return CLAMP(weight + scan_filter_data.bonus, 0, 255);
}

/*
 * Fill in the card attributes and the card note of an accepted object:
 * the weight (also copied to the note), the card bonus clamped to
 * -32767..32767, the frameset flags, the age (with CONFIG_LASTMOD, taken
 * from the 'L' attribute or else from 'D'), the custom attributes and
 * finally the language.
 */
static void
gen_attrs(struct odes *o, struct card_attr *ca, struct card_note *cn, struct scanner_note *incr)
{
  cn->weight_scanner = ca->weight = initial_weight(o);
  cn->card_bonus = CLAMP(scan_filter_data.card_bonus, -32767, 32767);

  if (frameset_to_redir && obj_find_attr(o, 'F'))
    ca->flags |= CARD_FLAG_FRAMESET | CARD_FLAG_EMPTY;

#ifdef CONFIG_LASTMOD
  int age = -1;
  byte *lm = obj_find_aval(o, 'L');
  if (!lm)
    lm = obj_find_aval(o, 'D');
  if (lm)
    age = convert_age(atol(lm), parameters.ref_time);
  ca->age = MAX(0, age);		/* Unknown age is stored as 0 */
#endif

  custom_create_attrs(o, ca);
  /* Needs type_flags set by custom_create_attrs */
  gen_lang(ca, incr);
}

/*
 * Write the matcher signatures of a card to `signatures'.
 *
 * The signatures are computed by matcher_compute_minima() from the 'X'
 * attributes. Nothing is written for objects without any 'X' attribute or
 * with fewer than matcher_min_words words. A record is the card ID followed
 * by matcher_signatures 32-bit values.
 */
static void
gen_signatures(struct fastbuf *signatures, struct odes *o, struct hash_permuter *permuter, uns id)
{
  struct oattr *text = obj_find_attr(o, 'X');
  if (!text)
    return;

  u32 minima[matcher_signatures];
  uns words = matcher_compute_minima(minima, permuter, text);
  if (words < matcher_min_words)
    return;

  bwrite(signatures, &id, sizeof(id));
  bwrite(signatures, minima, sizeof(minima));
}

#ifndef MT_EXT

/* No reference texts are generated when MT_EXT is not defined */
static void
gen_ref_texts(struct fastbuf *reftexts UNUSED, struct odes *o UNUSED, uns id UNUSED)
{
}

#else

/*
 * Split a reference attribute of the form "<url> <id>".
 *
 * The URL (everything before the first space) is copied to `url', which
 * must have room for MAX_URL_SIZE bytes including the terminating zero.
 * Returns the numeric ID following the space, or -1 when there is no ID
 * or when the URL does not fit in the buffer (in which case `url' holds
 * the truncated URL).
 */
static int
parse_reference(byte *aval, byte *url)
{
  byte *x = aval;
  byte *y = url;
  byte *limit = url + MAX_URL_SIZE - 1;	/* Keep one byte for the terminator */

  while (*x && *x != ' ')
    {
      if (y >= limit)
	{
	  *y = 0;
	  return -1;
	}
      *y++ = *x++;
    }
  *y = 0;
  return *x++ ? atol(x) : -1;
}

/*
 * Extract the texts of references from a card and write them to `reftexts'.
 *
 * First, all values of attributes matching ref_link_attr_set are parsed by
 * parse_reference() and the fingerprint of each URL is remembered under its
 * reference ID. Then the 'X' attributes are scanned: special codes returned
 * by GET_TAGGED_CHAR() open and close references (the low 16 bits of an
 * opening code are the reference ID) and the words in between are collected
 * in a per-depth buffer of at most ref_max_length bytes.
 *
 * When a known reference is closed and its text has at least ref_min_length
 * alphanumeric characters, a record is written: card ID, fingerprint of the
 * target, length (text length + 1), a type byte (0x90 + MT_EXT) and the text
 * itself. The text of a nested reference is also appended to the enclosing
 * one.
 *
 * References can nest up to MAX_REF_DEPTH levels (asserted). The text buffers
 * are static: they are allocated on first use and kept for later calls.
 */
static void
gen_ref_texts(struct fastbuf *reftexts, struct odes *o, uns id)
{
#define MAX_REFS 1024
#define MAX_REF_DEPTH 3

  /* Map of references */
  byte ref_word_types[MAX_REFS];	/* Non-zero for IDs seen in link attributes; valid up to last_ref */
  struct fingerprint fp[MAX_REFS];
  uns last_ref = 0;

  /* Currently open references */
  uns refstack[MAX_REF_DEPTH];		/* IDs of the open references */
  int refsp = -1;			/* Index of the innermost open reference, -1 if none */
  static byte *rtext[MAX_REF_DEPTH];	/* Start of the text buffer of each level */
  uns over[MAX_REF_DEPTH], nalpha[MAX_REF_DEPTH];	/* Overflow flag and count of alphanumeric characters */
  byte *rthis[MAX_REF_DEPTH], *rlimit[MAX_REF_DEPTH];	/* Current write position and limit of each buffer */

  byte *b, *c, *l;
  struct oattr *aa, *a;
  byte *wstart, *wend;
  uns nalphas;

  /* Find all references we're interested in */
  for (aa=o->attrs; aa; aa=aa->next)
    if (attr_set_match(&ref_link_attr_set, aa))
      for (a=aa; a; a=a->same)
	{
	  byte url[MAX_URL_SIZE];
	  int rid = parse_reference(a->val, url);
	  if (rid < 0 || rid >= MAX_REFS)
	    continue;
	  DBG("Ref <%s> id %d", url, rid);
	  /* Clear the part of the map not touched yet */
	  while (last_ref <= (uns) rid)
	    ref_word_types[last_ref++] = 0;
	  ref_word_types[rid] = MT_EXT;
	  url_fingerprint(url, &fp[rid]);
	}

  /* And scan the text for bracketed parts */
  for (a=obj_find_attr(o, 'X'); a; a=a->same)
    {
      byte *z = a->val;
      uns x;
      wstart = wend = z;
      nalphas = 0;
      do
	{
	  GET_TAGGED_CHAR(z, x);
	  /* Ordinary characters just extend the current word */
	  if (x < 0x80000000 && x != ' ')
	    {
	      if (Ualnum(x))
		nalphas++;
	      wend = z;
	      continue;
	    }
	  /* End of a word: copy it to the innermost open reference if it fits */
	  if (refsp >= 0 && wstart < wend && !over[refsp])
	    {
	      uns len = wend - wstart;
	      if (rthis[refsp] + len > rlimit[refsp])
		over[refsp] = 1;
	      else
		{
		  memcpy(rthis[refsp], wstart, len);
		  rthis[refsp] += len;
		  nalpha[refsp] += nalphas;
		}
	    }
	  wstart = wend = z;
	  nalphas = 0;
	  if (x < 0x80010000)		/* Word break */
	    {
	      /* At the start of the buffer, [-1] reads the zero byte stored in front of it */
	      if (refsp >= 0 && rthis[refsp][-1] != ' ')
		*rthis[refsp]++ = ' ';
	      /* We know it will fit in the buffer */
	    }
	  else if (x < 0x80020000)	/* Open */
	    {
	      refsp++;
	      ASSERT(refsp < MAX_REF_DEPTH);
	      refstack[refsp] = x & 0xffff;
	      if (!rtext[refsp])
		{
		  /* One extra byte in front of the text and some slack behind the limit */
		  rtext[refsp] = xmalloc(ref_max_length + 4) + 1;
		  rtext[refsp][-1] = 0;
		}
	      rthis[refsp] = rtext[refsp];
	      rlimit[refsp] = rtext[refsp] + ref_max_length;
	      over[refsp] = 0;
	      nalpha[refsp] = 0;
	    }
	  else				/* Close */
	    {
	      uns rid;
	      ASSERT(refsp >= 0);
	      rid = refstack[refsp];
	      *rthis[refsp] = 0;
	      if (rid < last_ref && ref_word_types[rid])
		{
		  /* Trim one space from each end of the text */
		  b = rtext[refsp];
		  if (*b == ' ')
		    b++;
		  c = rthis[refsp];
		  if (c > b && c[-1] == ' ')
		    c--;
		  if (c > b && nalpha[refsp] >= ref_min_length)
		    {
		      bputl(reftexts, id);
		      bwrite(reftexts, &fp[rid], sizeof(struct fingerprint));
		      bputw(reftexts, c-b+1);	/* Length including the type byte */
		      bputc(reftexts, 0x90 + ref_word_types[rid]);
		      bwrite(reftexts, b, c-b);
		    }
		}
	      /* Append the text of the closed reference to the enclosing one */
	      if (refsp > 0 && !over[refsp-1])
		{
		  byte *cut;
		  b = rtext[refsp];
		  c = rthis[refsp-1];
		  l = rlimit[refsp-1];
		  if (b[0] == ' ')
		    {
		      if (c[-1] != ' ')
			*c++ = ' ';
		      b++;
		    }
		  /* `cut' remembers the last position where the text can be cut between words */
		  cut = c;
		  while (*b && c <= l)
		    {
		      if (*b == ' ')
			cut = c;
		      *c++ = *b++;
		    }
		  if (c > l)
		    {
		      c = cut;
		      over[refsp-1] = 1;
		    }
		  over[refsp-1] |= over[refsp];
		  nalpha[refsp-1] += nalpha[refsp];
		  rthis[refsp-1] = c;
		}
	      refsp--;
	    }
	}
      while (x);
    }
  if (refsp >= 0)
    log(L_ERROR, "Unbalanced reference brackets for card %08x", id);
}

#endif

/*
 * Convert a single hexadecimal digit (either case) to its value 0..15.
 * Any other character fails the assertion; 0 is returned if the
 * assertion does not stop the program.
 */
static inline uns
xtoi(uns c)
{
  if (c >= '0' && c <= '9')
    return c - '0';
  if (c >= 'a' && c <= 'f')
    return c - 'a' + 10;
  if (c >= 'A' && c <= 'F')
    return c - 'A' + 10;

  ASSERT(0);
  return 0;
}

/* Short options accepted by cf_getopt() in main(): the CF_SHORT_OPTS set plus -i with an argument */
static char *short_opts = CF_SHORT_OPTS "i:";
/* Help text printed by usage() */
static char *help = "\
Usage: scanner [<options>]\n\
\n\
Options:\n"
CF_USAGE
"-i directory\t\tPath to the previous index for incremental scanner\n\
";

/* Print the help text to stderr and exit with status 1 */
static void NONRET
usage(void)
{
  fprintf(stderr, "%s", help);
  exit(1);
}

/*
 * The scanner: reads all objects from the bucket source, runs the analysers
 * and the filter on each of them and produces the initial index files.
 *
 * Objects accepted by the filter fall into two groups:
 *   - successfully gathered ones become cards with IDs counted from 0; they
 *     get a record in every output file which is open,
 *   - not yet gathered ones and ones gathered with an error get only a
 *     fingerprint and a note with IDs counted from FIRST_ID_NEW, and only if
 *     the file of notes on new URL's is open.
 * robots.txt buckets are skipped and objects rejected by the filter produce
 * no output at all.
 *
 * NOTE(review): the argument of -i is stored in incr_dir, but it is not used
 * any further in this function and `incr' always stays NULL -- confirm that
 * the incremental mode is not expected to work here.
 */
int
main(int argc, char **argv)
{
  uns id, id_new, count_in, count_bots, count_ok, count_new, count_err;
  struct fastbuf *fingerprints;
  struct fastbuf *labels_by_id;
  struct fastbuf *attributes;
  struct fastbuf *checksums;
  struct fastbuf *links;
  struct fastbuf *urls;
  struct fastbuf *merges;
  struct fastbuf *signatures;
  struct fastbuf *reftexts;
  struct fastbuf *notes, *notes_new;
  struct hash_permuter *permuter;

  log_init(argv[0]);
  setproctitle_init(argc, argv);
  int opt;
  byte *incr_dir = NULL;
  while ((opt = cf_getopt(argc, argv, short_opts, CF_NO_LONG_OPTS, NULL)) >= 0)
    switch (opt)
      {
	case 'i':
	  incr_dir = optarg;
	  break;
	default:
	  usage();
      }
  /* No non-option arguments are accepted */
  if (optind < argc)
    usage();

  matcher_init();
  scan_analyse_init();
  prepare_parameters();
  /* Files opened by index_maybe_bopen() are optional: the pointer is tested before every use below */
  fingerprints = index_maybe_bopen(fn_fingerprints, O_CREAT | O_TRUNC | O_WRONLY);
  labels_by_id = index_maybe_bopen(fn_labels_by_id, O_CREAT | O_TRUNC | O_WRONLY);
  put_attr_set_type(BUCKET_TYPE_V33);
  attributes = index_bopen(fn_attributes, O_CREAT | O_TRUNC | O_WRONLY);
  checksums = index_maybe_bopen(fn_checksums, O_CREAT | O_TRUNC | O_WRONLY);
  links = index_maybe_bopen(fn_links, O_CREAT | O_TRUNC | O_WRONLY);
  urls = index_maybe_bopen(fn_urls, O_CREAT | O_TRUNC | O_WRONLY);
  merges = index_maybe_bopen(fn_merges, O_CREAT | O_TRUNC | O_WRONLY);
  /* Signatures are generated only if the matcher wants them and the file name is defined */
  if (matcher_signatures && index_name_defined(fn_signatures))
  {
    signatures = index_bopen(fn_signatures, O_CREAT | O_TRUNC | O_WRONLY);
    permuter = permuter_new();
    parameters.srand = time(NULL);
    parameters.last_download = 0;
  }
  else
    signatures = NULL, permuter = NULL;
  reftexts = index_maybe_bopen(fn_ref_texts, O_CREAT | O_TRUNC | O_WRONLY);
  notes = index_bopen(fn_notes, O_CREAT | O_TRUNC | O_WRONLY);
  notes_new = index_maybe_bopen(fn_notes_new, O_CREAT | O_TRUNC | O_WRONLY);

  struct scanner_note *incr = NULL;

  scanner_pool = mp_new(16384);
  struct bucket_source *src = get_buck_init();
  scan_filter_init();
  url_key_init();
  /* parameters.srand is assigned only in the signature branch above */
  srand(parameters.srand);
  log(L_INFO, "Scanning objects");

  id = count_in = count_bots = count_ok = count_new = count_err = 0;
  id_new = FIRST_ID_NEW;
  /* The pool is flushed after every object, including those skipped by `continue' */
  for (; src->get_next(src, scanner_pool, ~0U) && count_in < max_num_objects; mp_flush(scanner_pool))
    {
      struct card_attr attr;
      struct card_note note;
      byte *url, *c;

      struct odes *o = src->o;
      url = obj_find_aval(o, 'U');
      if (!url)
	die("Object %x has no URL, probably broken bucket file", src->oid);
      PROGRESS(count_in, "scanner: %d objects -> %d cards (%d%%)", count_in, id, (int)((float)src->progress_current/src->progress_max*100));
      count_in++;
      if (obj_find_aval(o, 'r'))	/* Skip robots.txt buckets */
	{
	  count_bots++;
	  continue;
	}
      bzero(&attr, sizeof(attr));
      attr.card = src->oid;
      bzero(&note, sizeof(note));
      /* The 'O' attribute is decoded as 32 hexadecimal digits giving a 16-byte footprint */
      byte *footprint = obj_find_aval(o, 'O');
      if (footprint)
      {
	for (uns i=0; i<16; i++)
	  note.footprint[i] = xtoi(footprint[2*i]) << 4 | xtoi(footprint[2*i+1]);
      }
      scan_analyse(o);
      if (scan_filter(url, o))
	{
	  int ok = 1;
	  /* The gathering status in '!' is examined only for bucket types above BUCKET_TYPE_PLAIN */
	  if (src->type > BUCKET_TYPE_PLAIN)
	    {
	      c = obj_find_aval(o, '!');
	      if (c && !strncmp(c, "2304", 4))		/* We accept documents forbidden by robots.txt */
		;
	      else if (!c || !obj_find_attr(o, 'D'))	/* Not yet gathered */
		{
		  count_new++;
		  ok = 0;
		}
	      else if (c[0] != '0')			/* Gathered with error */
		{
		  count_err++;
		  ok = 0;
		}
	    }
	  if (!ok)
	    {
	      /* Only a fingerprint and a note are recorded, under an ID from the "new" range */
	      if (notes_new)
		{
		  if (fingerprints)
		    gen_fingerprint(fingerprints, url, id_new, NULL);
		  note.weight_scanner = initial_weight(o);
		  bwrite(notes_new, &note, sizeof(note));
		  id_new++;
		}
	    }
	  else
	    {
	      /* A regular card: one record in each of the open output files */
	      count_ok++;
	      if (fingerprints)
		gen_fingerprint(fingerprints, url, id, &attr);
	      if (labels_by_id)
		gen_labels_by_id(labels_by_id, o, url, id, &note);
	      if (checksums && !incr)
		gen_checksums(checksums, o, id, &attr, &note);
	      if (links)
		gen_links(links, o, id, &note);
	      gen_attrs(o, &attr, &note, incr);
	      if (signatures && !incr)
	        gen_signatures(signatures, o, permuter, id);
	      if (reftexts)
		gen_ref_texts(reftexts, o, id);
	      if (urls)
		bputsn(urls, url);
	      if (merges)
		bputl(merges, ~0U);
	      bwrite(attributes, &attr, sizeof(attr));
	      bwrite(notes, &note, sizeof(note));
	      id++;
	    }
	}
    }

  src->cleanup(src);
  /* NOTE(review): count_new and objects rejected by the filter are not included here -- confirm this is intended */
  parameters.objects_in = count_ok + count_err + count_bots;
  params_save(&parameters);
  /* Optional files which were not opened are NULL here; bclose() is called on them regardless */
  bclose(fingerprints);
  bclose(labels_by_id);
  bclose(attributes);
  bclose(links);
  bclose(urls);
  bclose(merges);
  bclose(reftexts);
  bclose(notes);
  bclose(notes_new);
  mp_delete(scanner_pool);

  bclose(checksums);
  bclose(signatures);

  log(L_INFO, "Scanned %d objects (%d ok, %d err, %d robots, %d new)", count_in, count_ok, count_err, count_bots, count_new);
  log(L_INFO, "Created %d cards and %d notes on new URL's", id, id_new - FIRST_ID_NEW);
  scan_analyse_end();
  return 0;
}
