/*
 * An experiment with URLs compression
 *
 * (c) 2006 Pavel Charvat <pchar@ucw.cz>
 */

#undef LOCAL_DEBUG

#include "lib/lib.h"
#include "lib/fastbuf.h"
#include "lib/ff-binary.h"
#include "lib/url.h"
#include "lib/getopt.h"
#include "lib/lizard.h"
#include "lib/unicode.h"

/*
 * Parameters
 *
 * The four macros below are compile-time switches that select how URLs are
 * encoded inside a block; the two variables limit how much goes into one block.
 */

/* Non-zero: block_compress_put() stores only the part of a URL that differs from the previous one, plus the length of the shared prefix */
#define PREFIX_COMPRESSION 1
#define SEPARATORS 2		/* 0=no separators, store lengths instead; 1=zero-terminated; 2=set last char's highest bit */
/* Non-zero: block_compress_flush() passes every block through lizard_compress() before writing it */
#define LIZARD_COMPRESSION 1
/* Non-zero: the index passed to block_compress_put() is stored in the block as well */
#define STORE_INDICES 0

static uns block_max_size = 1 * 1024 * 1024;	/* Maximum uncompressed size in bytes */
static uns block_max_count = 16 * 1024;		/* Maximum number of URLs in a single block */

/* Ideas:
 * - predefined suffixes ("/robots.txt", "/index.html", ...)
 * - separate lizard compression for indices/lengths/suffixes
 */

/*
 * Sorter key: one URL held as a length-prefixed byte string.
 * url_sort_fetch_key() fills it from the stream and stores no terminating zero.
 */
struct url_key {
  u16 len;			/* Number of valid bytes in data[] */
  byte data[MAX_URL_SIZE];	/* URL bytes; only the first len of them are meaningful */
};

/*
 * Number of bytes of per-URL data that follow every key in the sorter's input and output.
 * load_urls() emits that data with one bputl() call per URL.
 */
static uns url_sort_data_size = 4;

/*
 * Sorter callback: order two keys by their bytes.
 * The common leading part is compared with memcmp(); if it is identical,
 * the lengths decide, so a key that is a prefix of the other sorts first.
 * NOTE(review): relies on COMPARE() returning from this function when its
 * arguments differ -- confirm against lib/lib.h.
 */
static inline int
url_sort_compare(struct url_key *a, struct url_key *b)
{
  uns common = MIN(a->len, b->len);
  int diff = memcmp(a->data, b->data, common);
  if (diff)
    return diff;
  COMPARE(a->len, b->len);
  return 0;
}

/*
 * Sorter callback: read the next key (length word followed by that many
 * bytes) from f into k.
 * Returns 1 when a key was read and 0 when bgetw() reports a negative value,
 * which is treated as end of input.
 */
static inline int
url_sort_fetch_key(struct fastbuf *f, struct url_key *k)
{
  int key_len = bgetw(f);
  if (key_len >= 0)
    {
      k->len = key_len;
      breadb(f, k->data, key_len);
      return 1;
    }
  return 0;
}

static inline void
url_sort_copy_data(struct fastbuf *src, struct fastbuf *dest, struct url_key *k)
{
  bputw(dest, k->len);
  bwrite(dest, k->data, k->len);
  bbcopy(src, dest, url_sort_data_size);
}

/*
 * Sorter callback: read the data belonging to key k from f into the memory
 * that starts right behind the key structure.
 * Returns a pointer just past the stored data, or NULL (without reading
 * anything) when fewer than url_sort_data_size bytes are left before limit.
 */
static inline byte *
url_sort_fetch_item(struct fastbuf *f, struct url_key *k, byte *limit)
{
  byte *payload = (void *)(k + 1);
  if (limit - payload >= (int)url_sort_data_size)
    {
      breadb(f, payload, url_sort_data_size);
      return payload + url_sort_data_size;
    }
  return NULL;
}

static inline void
url_sort_store_item(struct fastbuf *f, struct url_key *k)
{
  bputw(f, k->len);
  bwrite(f, k->data, k->len + url_sort_data_size);
}

/*
 * Instantiate the generic sorter from lib/sorter.h for struct url_key.
 * SORT_PREFIX puts url_sort_ in front of every name, which is how the
 * callbacks defined above are found and why main() calls url_sort_sort().
 * NOTE(review): the three valueless switches presumably request an in-memory
 * presorting pass, a fastbuf as input and a named file as output (main()
 * passes a fastbuf and a file name) -- confirm against lib/sorter.h.
 */
#define SORT_PREFIX(x) url_sort_##x
#define SORT_KEY struct url_key
#define SORT_PRESORT
#define SORT_INPUT_FB
#define SORT_OUTPUT_FILE
#include "lib/sorter.h"

/* Reverse the bytes of the half-open range [lo, hi) in place. */
static void
reverse_host_span(byte *lo, byte *hi)
{
  while (hi - lo > 1)
    {
      byte t = *lo;
      *lo++ = *--hi;
      *hi = t;
    }
}

/*
 * Reverse the order of the dot-separated labels of a zero-terminated host
 * name in place, e.g. "www.example.com" becomes "com.example.www".
 *
 * The whole string is reversed first and then every label is reversed back,
 * which leaves the labels intact but in opposite order.  The result always
 * has exactly the same length as the input, and empty labels (leading,
 * trailing or doubled dots) keep their dots: "a.b." becomes ".b.a".
 * Working in place needs no temporary buffer and never forms a pointer
 * below the start of the string.
 *
 * A NULL host is accepted and left alone.
 */
static void
reverse_host(byte *host)
{
  if (!host)
    return;

  uns len = strlen(host);
  byte *end = host + len;
  byte *label = host, *p;

  reverse_host_span(host, end);
  for (p = host; ; p++)
    if (p == end || *p == '.')
      {
        /* [label, p) is one label, currently spelled backwards */
        reverse_host_span(label, p);
        if (p == end)
          break;
        label = p + 1;
      }
}

/* URL counter used by both phases: load_urls() increments it for every line read, save_urls() resets it and counts again */
static uns count;

/*
 * Read URLs from f, one per line, and convert them to sorter input.
 *
 * Every URL is split, its host name gets the label order reversed, and the
 * URL is packed again.  The result is appended to a temporary fastbuf as a
 * length word, the URL bytes and the running value of count (the position
 * of the URL in the input).
 *
 * Dies on a URL that cannot be split or packed.  Closes f and returns the
 * temporary fastbuf rewound to its beginning.
 */
static struct fastbuf *
load_urls(struct fastbuf *f)
{
  struct fastbuf *out = bopen_tmp(0x40000);
  byte buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
  struct url url;
  while (bgets(f, buf1, sizeof(buf1))) {
    if (url_split(buf1, &url, buf2))
      die("Unable to parse URL '%s'", buf1);
    /* NOTE(review): url_split() presumably leaves host NULL for URLs without a host part -- do not hand that to strlen() */
    if (url.host)
      reverse_host(url.host);
    /* A failed pack would leave buf3 without a guaranteed terminator for the strlen() below */
    if (url_pack(&url, buf3))
      die("Unable to pack URL '%s'", buf1);
    uns len = strlen(buf3);
    bputw(out, len);
    bwrite(out, buf3, len);
    bputl(out, count);
    DBG("Loaded <%s>", buf3);
    count++;
  }
  log(L_INFO, "Loaded %u URLs", count);
  bclose(f);
  brewind(out);
  return out;
}

/*
 * Per-block header.  block_compress_flush() writes the structure as raw
 * bytes to the headers stream, immediately followed by first_url_len bytes
 * of the block's first URL, so the field order and types define that
 * stream's layout.
 */
struct block_header {
  u32 count;		/* Number of URLs in the block */
#if !SEPARATORS || PREFIX_COMPRESSION
  u32 len_size;		/* Uncompressed prefixes/suffixes lengths size in bytes */
#endif
  u32 str_size;		/* Uncompressed size of the stored strings in bytes, including len_size when present */
#if LIZARD_COMPRESSION
  u32 compressed_size;	/* Compressed size in bytes */
#endif
  u16 first_url_len;	/* Length in bytes of the first URL, which is stored behind the header instead of in the block data */
};

/* Header of the block currently being assembled; written out by block_compress_flush() */
static struct block_header block_header;
/*
 * block_buf is the single allocation made by block_compress_init();
 * block_index_buf, block_len_buf and block_str_buf point at consecutive
 * regions inside it.  block_compressed is the buffer whose contents
 * block_compress_flush() writes to the data stream.
 */
static byte *block_buf, *block_str_buf, *block_len_buf, *block_index_buf, *block_compressed;
/* Write cursors into the three regions; reset by block_compress_start_block() */
static byte *block_str_pos, *block_len_pos, *block_index_pos;
/* Number of URLs in the current block and the number of bytes block_compress_put() has accounted for it so far */
static uns block_count, block_size;
/* Length of the URL kept in block_last_url */
static uns block_last_len;
/* First URL of the current block (goes behind the header) and the most recently added URL (base for prefix matching) */
static byte block_first_url[MAX_URL_SIZE], block_last_url[MAX_URL_SIZE];
/* Output streams opened by save_urls(): block headers and block data */
static struct fastbuf *block_headers_fb, *block_str_fb;
/* Number of non-empty blocks flushed so far */
static uns blocks_count;

/* Make the block buffers empty again: zero the counters and rewind all three write cursors. */
static inline void
block_compress_start_block(void)
{
  block_size = 0;
  block_count = 0;

  block_index_pos = block_index_buf;
  block_len_pos = block_len_buf;
  block_str_pos = block_str_buf;
}

static void
block_compress_init(void)
{
  block_buf = xmalloc(block_max_size + (block_max_count + 1) * (2 * !SEPARATORS + 2 * PREFIX_COMPRESSION + 4 * STORE_INDICES) + LIZARD_NEEDS_CHARS);
#if LIZARD_COMPRESSION
  block_compressed = xmalloc(block_max_size * LIZARD_MAX_MULTIPLY + 1 + LIZARD_MAX_ADD);
#endif  
  block_index_buf = block_buf;
  block_len_buf = block_index_buf + (block_max_count + 1) * 4 * STORE_INDICES;
  block_str_buf = block_len_buf + (block_max_count + 1) * (2 * !SEPARATORS + 2 * PREFIX_COMPRESSION);
  block_compress_start_block();
}

/* Release the buffers obtained by block_compress_init(). */
static void
block_compress_cleanup(void)
{
#if LIZARD_COMPRESSION
  /* A separate output buffer exists only when the compressor is enabled */
  xfree(block_compressed);
#endif
  xfree(block_buf);
}

static void
block_compress_flush(void)
{
  if (!block_count)
    return;
  blocks_count++;
  DBG("Flush");
  block_header.count = block_count;
  byte *block_start = block_str_buf;
#if !SEPARATORS || PREFIX_COMPRESSION
  block_start -= block_header.len_size = block_len_pos - block_len_buf;
  memmove(block_start, block_len_buf, block_header.len_size);
#endif  
  block_header.str_size = block_str_pos - block_start;

#if STORE_INDICES
  block_start -= block_count * 4;
  memmove(block_start, block_index_buf, block_count * 4);
#endif

  uns size = block_str_pos - block_start;
#if LIZARD_COMPRESSION
  size = block_header.compressed_size = lizard_compress(block_start, size, block_compressed);
#else
  block_compressed = block_start;
#endif

  bwrite(block_headers_fb, &block_header, sizeof(block_header));
  bwrite(block_headers_fb, block_first_url, block_header.first_url_len);
  bwrite(block_str_fb, block_compressed, size);
  block_compress_start_block();
}

/*
 * Append one URL to the current block, flushing the block first when it is
 * full (block_max_count URLs) or when the URL might not fit into
 * block_max_size bytes.
 *
 *   buf   - URL bytes
 *   len   - number of bytes in buf
 *   index - value stored with the URL when STORE_INDICES is enabled
 *
 * The first URL of a block is only remembered in block_first_url (it is
 * written behind the block header).  Every further URL is stored as the
 * part that differs from the previous URL; with PREFIX_COMPRESSION the
 * length of the shared prefix goes to the length region, and depending on
 * SEPARATORS the end of the stored part is marked by its length, a zero
 * byte, or the highest bit of its last byte.
 * NOTE(review): the highest-bit marker presumably requires URL bytes to be
 * 7-bit -- confirm that the input guarantees this.
 *
 * With STORE_INDICES the index is stored for every URL including the first
 * one of a block: block_compress_flush() copies block_count indices, so
 * leaving the first one out would make the array one entry short and
 * shifted.
 */
static void
block_compress_put(byte *buf, uns len, uns index UNUSED)
{
  if (block_count == block_max_count || len + 2 * !SEPARATORS + (SEPARATORS == 1) + 2 * PREFIX_COMPRESSION + 4 * STORE_INDICES + block_size > block_max_size)
    block_compress_flush();

  /* Remember the cursors (after a possible flush) to account for the bytes added below */
  byte *p = block_len_pos, *q = block_str_pos;
#if STORE_INDICES
  PUT_U32(block_index_pos, index);
  block_index_pos += 4;
#endif

  if (!block_count) {
    DBG("Starting new block <%.*s>", len, buf);
    memcpy(block_first_url, buf, len);
    memcpy(block_last_url, buf, len);
    block_header.first_url_len = block_last_len = len;
  }
  else {
    uns prefix = 0;
#if SEPARATORS == 2
    /* An empty URL has no last byte that could carry the end marker, and len - 1 below would wrap around */
    ASSERT(len);
#endif
#if PREFIX_COMPRESSION
    /* With SEPARATORS == 2 at least one byte must stay outside the prefix to carry the end marker */
    uns cmp_len = MIN(len - (SEPARATORS == 2), block_last_len);
    for (; prefix < cmp_len; prefix++)
      if (buf[prefix] != block_last_url[prefix])
        break;
#endif
    memcpy(block_last_url + prefix, buf + prefix, len - prefix);
    block_last_len = len;
#if PREFIX_COMPRESSION
    block_len_pos = utf8_32_put(block_len_pos, prefix);
#endif
#if !SEPARATORS
    block_len_pos = utf8_32_put(block_len_pos, len - prefix);
#endif
    DBG("Adding url <%.*s>+<%.*s>", prefix, buf, len - prefix, buf + prefix);
    memcpy(block_str_pos, buf + prefix, len - prefix);
    block_str_pos += len - prefix;
#if SEPARATORS == 1
    *block_str_pos++ = 0;
#elif SEPARATORS == 2
    ASSERT(len - prefix);
    block_str_pos[-1] |= 128;
#endif
  }

  /* For the first URL of a block both cursor differences are zero, so only its index (if any) is counted */
  block_size += (block_str_pos - q) + (block_len_pos - p) + 4 * STORE_INDICES;
  block_count++;
}

static void
save_urls(struct fastbuf *f)
{
  block_compress_init();
  uns len;
  byte url[MAX_URL_SIZE];
  block_headers_fb = bopen("compressed-urls-headers", O_WRONLY | O_CREAT | O_TRUNC, 1 << 16);
  block_str_fb = bopen("compressed-urls-data", O_WRONLY | O_CREAT | O_TRUNC, 1 << 20);
  count = 0;
  while ((int)(len = bgetw(f)) >= 0)
    {
      count++;
      breadb(f, url, len);
      url[len] = 0;
      block_compress_put(url, len, bgetl(f));
    }
  block_compress_flush();
  block_compress_cleanup();
  log(L_INFO, "Processed %u URLs and %u blocks", count, blocks_count);
  unsigned long long int headers_total = bfilesize(block_headers_fb), str_total = bfilesize(block_str_fb), total = headers_total + str_total;
  log(L_INFO, "Resulting size is %lluKB + %lluKB = %lluK (%.02fB per URL, %lluB per block)", headers_total / 1024, str_total / 1024, total / 1024,
    count ? (double)(total) / count : 0., blocks_count ? total / blocks_count : 0);
  log(L_INFO, "Compression factor is %.02f%%", bfilesize(f) ? 100. * total / bfilesize(f) : 0);
  bclose(block_headers_fb);
  bclose(block_str_fb);
  bclose(f);
}

/*
 * Run the whole experiment: load the file "urls", sort it into
 * "urls-sorted" and compress that file.  Any option that cf_getopt()
 * hands back to the caller terminates the program.
 */
int main(int argc, char **argv)
{
  int opt;
  struct fastbuf *sorted;

  log_init(argv[0]);

  while ((opt = cf_getopt(argc, argv, CF_SHORT_OPTS, CF_NO_LONG_OPTS, NULL)) >= 0)
    die("usage?");

#if 1
  /* Phase 1: preprocess the raw URL list into sorter records */
  log(L_INFO, "Loading URLs");
  struct fastbuf *raw = bopen("urls", O_RDONLY, 4000000);
  struct fastbuf *packed = load_urls(raw);

  /* Phase 2: sort the records into a file */
  log(L_INFO, "Sorting");
  url_sort_sort(packed, "urls-sorted");
#endif

  /* Phase 3: compress the sorted records */
  log(L_INFO, "Compressing URLs");
  sorted = bopen("urls-sorted", O_RDONLY, 4000000);
  save_urls(sorted);

  return 0;
}
