/*
 *	Sherlock Indexer -- Generate Index Reports
 *
 *	(c) 2003--2005 Martin Mares <mj@ucw.cz>
 */

#include "sherlock/sherlock.h"
#include "lib/conf.h"
#include "lib/url.h"
#include "lib/fastbuf.h"
#include "lib/chartype.h"
#include "indexer/indexer.h"
#include "indexer/attrs.h"
#include "indexer/merges.h"
#include "lang/lang.h"

#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

/*
 * Settings of the reporter.  Each variable is bound to one option of the
 * "Reporter" configuration section by the reporter_config[] table.
 */
static byte *fn_class_log = NULL;	/* ClassLog: file receiving the equivalence class log; no log if unset */
static uns class_threshold = 0;		/* ClassThreshold: smallest class size which gets logged */
static uns filetype_stats = 0;		/* FiletypeStats: non-zero enables per-filetype counts */
static uns language_stats = 0;		/* LanguageStats: non-zero enables per-language counts */
static uns domain_stats = 0;		/* DomainStats: number of domain levels to group by; zero disables */

/*
 * Description of the "Reporter" configuration section.  Every row binds an
 * option name to the static variable that receives its value; the table is
 * registered by cf_register() in main() and terminated by the CT_STOP row.
 */
static struct cfitem reporter_config[] = {
  { "Reporter",		CT_SECTION,	NULL },
  { "ClassLog",		CT_STRING,	&fn_class_log },
  { "ClassThreshold",	CT_INT,		&class_threshold },
  { "FiletypeStats",	CT_INT,		&filetype_stats },
  { "LanguageStats",	CT_INT,		&language_stats },
  { "DomainStats",	CT_INT,		&domain_stats },
  { NULL,		CT_STOP,	NULL }
};

/*
 * Write the equivalence class log.  Does nothing unless the ClassLog
 * option names a file.
 *
 * For every non-empty card which belongs to a merge class of at least
 * ClassThreshold members, one record is written: the class size, the
 * class identifier taken from merges[] (printed as 8 hex digits) and the
 * URL of the card (passed to bputsn()), separated by tabs.
 *
 * URLs are matched to cards by position: the i-th line of the URL file is
 * taken as the URL of card i.  The file is therefore read sequentially and
 * lines belonging to cards which are not logged are read and dropped.
 */
static void
report_merges(void)
{
  if (!fn_class_log)
    return;
  log(L_INFO, "Generating equivalence class log");

  /* Load merging array */
  merges_map(0);

  /* Calculate class sizes; the array is indexed by the class identifier */
  u32 *sizes = xmalloc_zero(card_count * sizeof(*sizes));
  for (uns i=0; i<card_count; i++)
    if (!(bring_attr(i)->flags & CARD_FLAG_EMPTY) && merges[i] != ~0U)
      sizes[merges[i]]++;

  /* Log large classes */
  struct fastbuf *b = index_bopen(fn_class_log, O_WRONLY | O_CREAT | O_TRUNC);
  struct fastbuf *urls = index_bopen(fn_urls, O_RDONLY);
  uns this_url = 0;
  for (uns i=0; i<card_count; i++)
    if (!(bring_attr(i)->flags & CARD_FLAG_EMPTY) && merges[i] != ~0U && sizes[merges[i]] >= class_threshold)
      {
	byte buf[MAX_URL_SIZE+1];
	/* Advance in the URL list until buf holds the URL of card i */
	while (this_url <= i)
	  {
	    /*
	     * bgets() yields a false value at end of file.  A URL list
	     * shorter than the card array means a corrupted index and
	     * buf would hold garbage, so refuse to continue (the same
	     * way report_domains() treats unparsable URLs).
	     */
	    if (!bgets(urls, buf, sizeof(buf)))
	      ASSERT(0);
	    this_url++;
	  }
	bprintf(b, "%d\t%08x\t", sizes[merges[i]], merges[i]);
	bputsn(b, buf);
      }
  bclose(urls);
  bclose(b);

  /* Clean up */
  xfree(sizes);
  merges_unmap();
}

#ifdef CONFIG_FILETYPE
/*
 * Log per-filetype card counts.  Does nothing unless FiletypeStats is set.
 *
 * Two lines are produced: "Filetypes in" counts every card, "Filetypes out"
 * only the cards carrying neither CARD_FLAG_DUP nor CARD_FLAG_EMPTY.
 */
static void
report_filetypes(void)
{
  if (!filetype_stats)
    return;

  /* Row 0 collects all cards, row 1 the cards which are neither duplicate nor empty */
  uns ft_cnt[2][MAX_FILE_TYPES];
  bzero(ft_cnt, sizeof(ft_cnt));
  uns *all = ft_cnt[0];
  uns *kept = ft_cnt[1];
  for (uns card=0; card<card_count; card++)
    {
      struct card_attr *attr = bring_attr(card);
      uns ft = CA_GET_FILE_TYPE(attr);
      all[ft]++;
      if (attr->flags & (CARD_FLAG_DUP | CARD_FLAG_EMPTY))
	continue;
      kept[ft]++;
    }

  /* Reserve room for the line prefix plus one " name=count" item per filetype */
  uns room = 32;
  for (uns ft=0; ft<MAX_FILE_TYPES; ft++)
    room += strlen(custom_file_type_names[ft]) + 16;
  byte line[room];

  for (uns row=0; row<2; row++)
    {
      byte *end = line;
      end += sprintf(end, "Filetypes %s:", row ? "out" : "in");
      for (uns ft=0; ft<MAX_FILE_TYPES; ft++)
	end += sprintf(end, " %s=%d", custom_file_type_names[ft], ft_cnt[row][ft]);
      log(L_INFO, "%s", line);
    }
}

#else
/*
 * Replacement used when filetype support is not compiled in: there is
 * nothing to count, so only complain if the statistics were asked for.
 */
static inline void
report_filetypes(void)
{
  if (!filetype_stats)
    return;
  log(L_ERROR, "Filetype statistics requested, but Holmes was compiled without support for filetypes");
}
#endif

#ifdef CONFIG_LANG
/*
 * Log the number of cards per language on a single "Languages:" line.
 * Does nothing unless LanguageStats is set.  When filetypes are compiled
 * in, only cards whose filetype passes FILETYPE_IS_TEXT() are counted.
 */
static void
report_langs(void)
{
  if (!language_stats)
    return;

  uns lang_cnt[MAX_LANGUAGES+1];
  bzero(lang_cnt, sizeof(lang_cnt));
  for (uns card=0; card<card_count; card++)
    {
      struct card_attr *attr = bring_attr(card);
#ifdef CONFIG_FILETYPE
      /* Cards of non-textual types do not contribute to the statistics */
      uns ft = CA_GET_FILE_TYPE(attr);
      if (FILETYPE_IS_TEXT(ft))
#endif
	lang_cnt[CA_GET_FILE_LANG(attr)]++;
    }

  /* Reserve room for the line prefix plus one " name=count" item per language */
  uns room = 32;
  for (uns lang=0; lang<lang_count; lang++)
    room += strlen(lang_code_to_name(lang)) + 16;
  byte line[room];

  byte *end = line;
  end += sprintf(end, "Languages:");
  for (uns lang=0; lang<lang_count; lang++)
    end += sprintf(end, " %s=%d", lang_code_to_name(lang), lang_cnt[lang]);
  log(L_INFO, "%s", line);
}
#else
/*
 * Replacement used when language support is not compiled in: there is
 * nothing to count, so only complain if the statistics were asked for.
 */
static inline void
report_langs(void)
{
  if (!language_stats)
    return;
  log(L_ERROR, "Language statistics requested, but Holmes was compiled without support for multiple languages");
}
#endif

/*
 * Node of the domain hash table: one domain name together with the number
 * of URLs report_domains() has counted under it.
 */
struct domain {
  /* Number of URLs counted for this domain */
  uns cnt;
  /*
   * The domain name; it is named as HASH_KEY_ENDSTRING of the hash table.
   * NOTE(review): declared with a single element, so the table presumably
   * allocates each node with extra room for the whole string -- check how
   * lib/hashtable.h sizes nodes before changing this declaration.
   */
  char dom[1];
};

/*
 * Reset the URL counter of a domain node.
 * NOTE(review): presumably invoked by the hash table template (requested
 * by HASH_GIVE_INIT_DATA) for every newly created node -- confirm against
 * lib/hashtable.h.
 */
static inline void
dom_init_data(struct domain *d)
{
  d->cnt = 0U;
}

/*
 * Instantiate the generic hash table from lib/hashtable.h for struct domain.
 * The generated identifiers carry the dom_ prefix; this file uses dom_init(),
 * dom_lookup() and iteration by HASH_FOR_ALL(dom, ...).
 *
 * NOTE(review): meaning of the parameters as understood from their names and
 * from the way the table is used in this file -- confirm against
 * lib/hashtable.h:
 *   HASH_KEY_ENDSTRING	 the key is the string kept in the `dom' field
 *   HASH_WANT_LOOKUP	 generate dom_lookup(); report_domains() uses its
 *			 result without a NULL check, so it is expected to
 *			 create nodes which do not exist yet
 *   HASH_GIVE_INIT_DATA we supply dom_init_data() to set up new nodes
 *   HASH_AUTO_POOL	 presumably nodes are allocated from an automatically
 *			 managed memory pool using the given size
 */
#define HASH_NODE struct domain
#define HASH_PREFIX(x) dom_##x
#define HASH_KEY_ENDSTRING dom
#define HASH_WANT_LOOKUP
#define HASH_GIVE_INIT_DATA
#define HASH_AUTO_POOL 4096
#include "lib/hashtable.h"

/*
 * Log the number of indexed URLs per domain.  Does nothing unless
 * DomainStats is non-zero.
 *
 * The value of DomainStats is the number of trailing host name components
 * forming the grouping key: with DomainStats=2 the host "www.example.com"
 * is counted under "example.com".  Hosts ending with a digit are taken for
 * numeric addresses; they are not split and only their total is logged.
 *
 * Every line of the URL file must be a parsable URL with a host part,
 * anything else is treated as a corrupted index.
 */
static void
report_domains(void)
{
  if (!domain_stats)
    return;

  struct fastbuf *b = bopen(index_name(fn_urls), O_RDONLY, indexer_fb_size);
  byte url[MAX_URL_SIZE], buf[MAX_URL_SIZE], buf2[MAX_URL_SIZE];
  uns numeric = 0;
  dom_init();
  while (bgets(b, url, sizeof(url)))
    {
      struct url ur;
      /*
       * A successfully split URL still need not have a host.
       * NOTE(review): url_split() is believed to leave ur.host NULL for
       * URLs without a "//host" part -- confirm in lib/url.h.  Such an
       * entry cannot be classified and strlen() below would crash on it.
       */
      if (url_deescape(url, buf) || url_split(buf, &ur, buf2) || !ur.host)
	ASSERT(0);
      byte *d = ur.host + strlen(ur.host);
      /* Look at the last character only if there is one (empty host) */
      if (d > ur.host && Cdigit(d[-1]))
	numeric++;
      else
	{
	  /* Walk backwards over domain_stats dots or to the start of the host */
	  uns lev = domain_stats;
	  while (d > ur.host && lev)
	    if (*--d == '.')
	      lev--;
	  /* If we stopped on a dot, the key starts right after it */
	  if (*d == '.')
	    d++;
	  struct domain *dom = dom_lookup(d);
	  dom->cnt++;
	}
    }
  bclose(b);

  HASH_FOR_ALL(dom, dom)
    {
      log(L_INFO, "Domain %s: %d", dom->dom, dom->cnt);
    }
  HASH_END_FOR;
  log(L_INFO, "Numeric addresses: %d", numeric);
}

/*
 * Entry point: read the configuration, then produce all reports that have
 * been switched on.  Each report_*() function checks its own option and
 * returns immediately if it is not wanted.
 */
int
main(int argc, char **argv)
{
  log_init(argv[0]);
  cf_register(reporter_config);

  /*
   * A non-negative result of cf_getopt() or any argument left over after
   * option parsing is rejected with the usage message.
   */
  int opt = cf_getopt(argc, argv, CF_SHORT_OPTS, CF_NO_LONG_OPTS, NULL);
  if (opt >= 0 || optind < argc)
    {
      fputs("This program supports only the following command-line arguments:\n" CF_USAGE, stderr);
      exit(1);
    }

  /* The card attributes stay mapped for the whole run of the reports */
  attrs_part_map(0);

  report_merges();
  report_filetypes();
  report_langs();
  report_domains();

  attrs_part_unmap();
  return 0;
}
