/*
 *	Simple Random Sampling (select <k> lines from the input stream at random)
 *
 *	(c) 2004 Martin Mares <mj@ucw.cz>
 */

#include "sherlock/sherlock.h"
#include "lib/fastbuf.h"
#include "lib/heap.h"

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

/* Requested sample size; main() sets it from the command-line argument. */
static uns K;
/* Number of lines main() has written to the output so far. */
static uns out;

/* One retained input line together with the random key it was assigned. */
struct line {
  long key;	/* Value drawn from random() for this line */
  byte *text;	/* xmalloc'ed private copy of the line's text */
};

/*
 * Ordering handed to the HEAP_* macros: the item with the LARGER key counts
 * as "less". add_line() relies on this by treating heap[1] as the item with
 * the largest key currently kept.
 */
#define HEAP_LESS(x,y) (x.key > y.key)
static struct line *heap;	/* Heap array; items are stored in heap[1..cnt] */
static uns heap_size;		/* Capacity in items; the array is allocated with heap_size+1 slots */
static uns cnt;			/* Number of items currently stored in the heap */

/*
 * Offer one input line to the sample.
 *
 * line points at the text and len is its length; len+1 bytes are copied, so
 * the byte following the text (the terminator supplied by the caller) is
 * stored as well.
 *
 * Every line gets a fresh random() key. While fewer than K lines are held the
 * line is always stored. Afterwards it is compared with heap[1], the held
 * item with the largest key:
 *   - a larger new key means the line is dropped,
 *   - an equal key means both are kept (the heap grows beyond K),
 *   - a smaller key means the heap[1] item is evicted and its text freed.
 */
static inline void
add_line(byte *line, uns len)
{
  long new_key = random();

  if (cnt >= K)
    {
      long top_key = heap[1].key;

      if (top_key < new_key)
	return;
      if (top_key != new_key)
	{
	  /* HEAP_DELMIN leaves the removed item just past the shrunken heap. */
	  HEAP_DELMIN(struct line, heap, cnt, HEAP_LESS, HEAP_SWAP);
	  xfree(heap[cnt+1].text);
	}
      else if (cnt >= heap_size)
	{
	  /* Key collision: both items must stay, so make room for one more. */
	  heap_size += 32;
	  heap = xrealloc(heap, (heap_size+1) * sizeof(struct line));
	}
    }

  struct line *slot = &heap[++cnt];
  slot->key = new_key;
  slot->text = xmalloc(len+1);
  memcpy(slot->text, line, len+1);
  HEAP_INSERT(struct line, heap, cnt, HEAP_LESS, HEAP_SWAP);
}

/*
 * Usage: sample <sample-size>
 *
 * Reads lines from file descriptor 0, keeps the K lines with the smallest
 * random keys (see add_line()) and writes them to file descriptor 1. If the
 * input has fewer than K lines, all of them are written.
 *
 * Returns 0 on success and 1 on a usage error or an invalid sample size.
 */
int
main(int argc, char **argv)
{
  if (argc != 2)
    {
      fprintf(stderr, "Usage: sample <sample-size>\n");
      return 1;
    }

  /*
   * Parse into a signed long and validate BEFORE storing into the unsigned K.
   * Assigning first would let a negative argument wrap to a huge value that
   * passes a "K < 1" test and then overflows the heap size computed below.
   * The upper bound keeps both K + 10 (+1 slot) and the byte count of the
   * heap allocation from wrapping. NOTE(review): ~0U assumes `uns` is
   * unsigned int -- confirm against the project headers.
   */
  char *end;
  long want = strtol(argv[1], &end, 10);
  size_t max_items = (size_t) -1 / sizeof(struct line);
  if (max_items > ~0U)
    max_items = ~0U;
  if (end == argv[1] || *end || want < 1 || (unsigned long) want > max_items - 11)
    {
      fprintf(stderr, "Invalid sample size %s\n", argv[1]);
      return 1;
    }
  K = want;

  srandom(time(NULL) ^ getpid());

  /* A little slack beyond K so that the first few key collisions need no realloc. */
  heap_size = K + 10;
  heap = xmalloc((heap_size+1) * sizeof(struct line));
  struct fastbuf *f = bfdopen_shared(0, 16384);
  byte buf[1024], *eol;
  /* bgets() result is used as the end-of-line pointer; a null result ends the loop. */
  while ((eol = bgets(f, buf, sizeof(buf))))
    add_line(buf, eol-buf);
  bclose(f);

  f = bfdopen_shared(1, 16384);
  /*
   * Each pass sorts the held items by key and prints those whose key is
   * strictly below the largest one. Items tied for the largest key cannot be
   * ordered fairly, so they get fresh random keys and go through another pass.
   *
   * With empty input cnt is 0 and the loop is skipped entirely; otherwise a
   * pass either jumps to done or leaves at least one item for the next pass,
   * so cnt stays non-zero until we are finished.
   */
  while (cnt)
    {
      /* Repeated HEAP_DELMIN turns heap[1..cnt] into an array sorted by ascending key. */
      uns cc = cnt;
      while (cc)
	HEAP_DELMIN(struct line, heap, cc, HEAP_LESS, HEAP_SWAP);
      cc = 1;
      /* cc == cnt: only one candidate is left, so there is no tie to resolve. */
      while (heap[cc].key < heap[cnt].key || cc == cnt)
	{
	  bputsn(f, heap[cc++].text);
	  if (++out == K || cc > cnt)
	    goto done;
	}
      /* Move the tied items to the front of the array and re-key them. */
      uns cnt0 = cnt;
      cnt = 0;
      while (cc <= cnt0)
	{
	  heap[++cnt].key = random();
	  heap[cnt].text = heap[cc++].text;
	}
      HEAP_INIT(struct line, heap, cnt, HEAP_LESS, HEAP_SWAP);
    }
 done:
  bclose(f);

  return 0;
}
